static_generator 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +6 -0
- data/README.textile +32 -0
- data/Rakefile +22 -0
- data/lib/static_generator.rb +76 -0
- data/lib/static_generator/version.rb +3 -0
- data/spec/crawler_spec.rb +133 -0
- data/spec/destination_directory/.gitkeep +0 -0
- data/spec/fakeweb_helper.rb +65 -0
- data/spec/spec_helper.rb +8 -0
- data/static_generator.gemspec +27 -0
- metadata +144 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.textile
ADDED
@@ -0,0 +1,32 @@
h1. Why did I create Static Generator?

Because I wanted a simple way to grab the HTML from a site generated with my Ruby on Rails CMS.
The cool thing about StaticGenerator, compared to other solutions (wget), is that when I have a URL like /home/subpage, instead of having this file structure:

pre. /home
  /subpage.html

I have this file structure:

pre. /home
  /subpage
    /index.html

so all the links in the generated site stay the same.

h2. Usage

pre. require 'static_generator'
crawler = StaticGenerator::Crawler.new({
  :url => 'http://mysite.com/',
  :destination_path => File.expand_path('some/destination/directory'),
  :url_prefix => 'http://mysite.com/'
})
crawler.crawl!

h2. Author

Julien Desrosiers
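To make the README's mapping concrete, here is a small illustrative Ruby sketch (not part of the package; the site URL and destination directory are the README's hypothetical values) of how a crawled URL is reduced to an on-disk path:

# Illustrative sketch of the URL-to-path mapping described above (not from the gem).
url_prefix = 'http://mysite.com/'                # hypothetical prefix from the README
url        = 'http://mysite.com/home/subpage'    # a crawled page URL

short_path = url[url_prefix.length, url.length]  # => "home/subpage"
target     = File.join('some/destination/directory', short_path, 'index.html')
# => "some/destination/directory/home/subpage/index.html"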
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
require 'bundler'
Bundler::GemHelper.install_tasks

require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec)
task :default => :spec

namespace(:spec) do
  desc "Run all specs on multiple ruby versions (requires rvm)"
  task(:portability) do
    %w[1.8.6 1.8.7 1.9.2].each do |version|
      system <<-BASH
        bash -c 'source ~/.rvm/scripts/rvm;
                 rvm #{version};
                 echo "--------- version #{version} ----------\n";
                 bundle install;
                 rake spec'
      BASH
    end
  end
end
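As a usage note: given the default task above, a bare rake run executes the spec suite on the current Ruby, while the rvm matrix would be invoked as rake spec:portability.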
data/lib/static_generator.rb
ADDED
@@ -0,0 +1,76 @@
require 'anemone'
require 'fileutils'

module StaticGenerator

  class WrongURLPrefixError < ArgumentError
  end

  class DestinationPathDoesNotExist < ArgumentError
  end

  class DestinationPathNotWritableError < ArgumentError
  end

  class Page

    attr_reader :crawled_page

    def initialize(crawled_page, url_prefix)
      @crawled_page = crawled_page # page object from anemone
      @url_prefix = url_prefix
    end

    # the crawled URL with the url_prefix stripped off, e.g. "home/subpage"
    def short_path
      @crawled_page.url.to_s[@url_prefix.length, @crawled_page.url.to_s.length]
    end

    def body
      @crawled_page.body
    end
  end

  class Crawler

    attr_reader :pages, :destination_path, :url_prefix

    def initialize(opts)
      @destination_path = opts[:destination_path]
      @url_prefix = opts[:url_prefix]
      @url = opts[:url]
      if @url_prefix.nil?
        raise WrongURLPrefixError, "Expected an `url_prefix` option for the given URL."
      elsif @url !~ /^#{@url_prefix}/
        raise WrongURLPrefixError, "Expected the `url_prefix` option to exist in the given URL."
      elsif @url_prefix[-1, 1] != '/'
        raise WrongURLPrefixError, "Expected the `url_prefix` to end with a '/'."
      end

      if ! File.directory? @destination_path
        raise DestinationPathDoesNotExist
      elsif ! File.writable? @destination_path
        raise DestinationPathNotWritableError
      end
    end

    def crawl!
      @pages = []
      Anemone.crawl(@url) do |anemone|
        anemone.on_every_page do |page|
          @pages << Page.new(page, @url_prefix)
        end
      end

      generate_folders
    end

    # write each page's body to <destination_path>/<short_path>/index.html
    def generate_folders
      @pages.each{|page|
        directory = File.expand_path(@destination_path) + File::SEPARATOR + page.short_path.split('/').join(File::SEPARATOR)
        FileUtils.mkdir_p(directory)
        File.open("#{directory}/index.html", 'w') {|f| f.write(page.body) }
      }
    end
  end

end
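The validations in Crawler#initialize above can be exercised in isolation; here is a minimal hypothetical caller (URL and paths invented for illustration):

# Hypothetical example: url_prefix lacks its trailing '/', so the last
# validation branch in Crawler#initialize should raise before the
# destination-path checks are reached.
require 'static_generator'

begin
  StaticGenerator::Crawler.new(
    :url              => 'http://mysite.com/home',
    :url_prefix       => 'http://mysite.com',
    :destination_path => '/tmp/site'   # would also need to exist and be writable
  )
rescue StaticGenerator::WrongURLPrefixError => e
  warn e.message  # => Expected the `url_prefix` to end with a '/'.
end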
data/lib/static_generator/version.rb
ADDED

data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,133 @@
require File.dirname(__FILE__) + '/spec_helper'
require 'ap'

module StaticGenerator
  describe Crawler do
    before(:each) do
      @options = {
        :destination_path => File.expand_path('spec/destination_directory'),
        :url_prefix => 'http://www.example.com/'
      }
    end

    after(:each) do
      FileUtils.rm_rf(Dir.glob(File.expand_path('spec/destination_directory') + File::SEPARATOR + '*'))
    end

    context 'crawling' do

      it 'should find a page that links to another page' do
        pages = []
        pages << FakePage.new('0', :links => '1')
        pages << FakePage.new('1')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!

        @crawler.pages.size.should == 2
      end

      it 'should find a page that links to two other pages' do
        pages = []
        pages << FakePage.new('0', :links => ['1', '2'])
        pages << FakePage.new('1')
        pages << FakePage.new('2')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!

        @crawler.pages.size.should == 3
      end

      it 'should have the right prefix for the given url' do
        @crawler = Crawler.new(@options.merge({:url => FakePage.new('home').url}))
        @crawler.crawl!
        @crawler.pages[0].short_path.should == 'home'

        pages = []
        pages << FakePage.new('0', :links => ['0/1'])
        pages << FakePage.new('0/1')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!
        @crawler.pages[1].short_path.should == '0/1'

        pages = []
        pages << FakePage.new('root', :links => ['root/subpage/subsubpage'])
        pages << FakePage.new('root/subpage/subsubpage')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!
        @crawler.pages[1].short_path.should == 'root/subpage/subsubpage'

        # ensure we have a / at the end
        lambda {
          @crawler = Crawler.new(@options.merge({:url => FakePage.new('root').url, :url_prefix => 'http://www.example.com'}))
        }.should raise_error WrongURLPrefixError
      end

      it 'should follow a relative link' do
        pages = []
        pages << FakePage.new('home', :hrefs => ['/subpage', 'otherpage'])
        pages << FakePage.new('subpage')
        pages << FakePage.new('otherpage')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!
        @crawler.pages[0].short_path.should == 'home'
        @crawler.pages[1].short_path.should == 'subpage'
        @crawler.pages[2].short_path.should == 'otherpage'
        @crawler.pages[2].crawled_page.url.to_s.should == 'http://www.example.com/otherpage'
      end
    end

    context 'folder' do
      it 'should not be created if destination_path does not exist' do
        lambda {
          @crawler = Crawler.new(@options.merge({
            :url => FakePage.new('home').url,
            :destination_path => File.expand_path('spec/destination_directory/') + File::SEPARATOR + 'folder_that_doesnt_exist'
          }))
        }.should raise_error DestinationPathDoesNotExist
      end

      it 'should be created' do
        @crawler = Crawler.new(@options.merge({:url => FakePage.new('home').url}))
        @crawler.crawl!
        File.exists?(File.expand_path('spec/destination_directory/') + File::SEPARATOR + 'home').should == true
      end

      it 'should have an index.html file in it' do
        @crawler = Crawler.new(@options.merge({:url => FakePage.new('home').url}))
        @crawler.crawl!
        File.exists?(File.expand_path('spec/destination_directory/') + File::SEPARATOR + 'home' + File::SEPARATOR + 'index.html').should == true
      end

      it 'should throw an error if destination_path is not writable' do
        lambda {
          @crawler = Crawler.new(@options.merge({
            :url => FakePage.new('home').url,
            :destination_path => File.expand_path('spec/non_writable/')
          }))
        }.should raise_error DestinationPathNotWritableError
      end
    end

    context 'folder with a sub folder in it' do

      it 'should be created' do
        pages = []
        pages << FakePage.new('folder', :links => ['folder/subfolder'])
        pages << FakePage.new('folder/subfolder')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!
        File.exists?(File.expand_path('spec/destination_directory/') + File::SEPARATOR + 'folder' + File::SEPARATOR + 'subfolder').should == true
      end

      it 'should have an index.html file in it' do
        pages = []
        pages << FakePage.new('folder', :links => ['folder/subfolder'])
        pages << FakePage.new('folder/subfolder')
        @crawler = Crawler.new(@options.merge({:url => pages[0].url}))
        @crawler.crawl!
        File.exists?(File.expand_path('spec/destination_directory/') + File::SEPARATOR + 'folder' + File::SEPARATOR + 'subfolder' + File::SEPARATOR + 'index.html').should == true
      end
    end

  end
end
data/spec/destination_directory/.gitkeep
ADDED
File without changes

data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,65 @@
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test StaticGenerator"
  exit
end

FakeWeb.allow_net_connect = false

module StaticGenerator
  SPEC_DOMAIN = "http://www.example.com/"

  class FakePage
    attr_accessor :links
    attr_accessor :hrefs
    attr_accessor :body

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    def url
      SPEC_DOMAIN + @name
    end

    private

    def create_body
      @body = "<html><body>"
      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
      @body += "</body></html>"
    end

    def add_to_fakeweb
      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]

        # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
        redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
        options[:location] = redirect_url

        # register the page this one redirects to
        FakeWeb.register_uri(:get, redirect_url, {:body => '',
                                                  :content_type => @content_type,
                                                  :status => [200, "OK"]})
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
    end
  end
end

# default root
StaticGenerator::FakePage.new
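For reference, a hypothetical standalone use of FakePage (names invented); each instance registers only its own URL with FakeWeb, so a linked page is fetchable only once it gets its own FakePage:

page = StaticGenerator::FakePage.new('docs', :links => ['docs/intro'])
page.url   # => "http://www.example.com/docs"
page.body  # => "<html><body><a href=\"http://www.example.com/docs/intro\"></a></body></html>"
StaticGenerator::FakePage.new('docs/intro')  # register the linked page too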
data/spec/spec_helper.rb
ADDED

data/static_generator.gemspec
ADDED
@@ -0,0 +1,27 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "static_generator/version"

Gem::Specification.new do |s|
  s.name        = "static_generator"
  s.version     = StaticGenerator::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Julien Desrosiers"]
  s.email       = ["hello@juliendesrosiers.com"]
  s.homepage    = ""
  s.summary     = %q{Crawler that generates a file structure that can be served by Apache without any change}
  s.description = %q{Crawl a site with 'clean URLs' and generate files and folders from it. Example: the URL /page will become /page/index.html instead of /page.html, so you can serve it straight from Apache and all the links still work.}

  s.rubyforge_project = "static_generator"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec", "~> 2.4.0"
  s.add_development_dependency "fakeweb", "~> 1.3.0"
  s.add_development_dependency "awesome_print", "~> 0.3.2"

  s.add_dependency "anemone", "~> 0.5.0"
end
metadata
ADDED
@@ -0,0 +1,144 @@
--- !ruby/object:Gem::Specification
name: static_generator
version: !ruby/object:Gem::Version
  hash: 25
  prerelease:
  segments:
  - 0
  - 0
  - 3
  version: 0.0.3
platform: ruby
authors:
- Julien Desrosiers
autorequire:
bindir: bin
cert_chain: []

date: 2011-02-10 00:00:00 -05:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: rspec
  prerelease: false
  requirement: &id001 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        hash: 31
        segments:
        - 2
        - 4
        - 0
        version: 2.4.0
  type: :development
  version_requirements: *id001
- !ruby/object:Gem::Dependency
  name: fakeweb
  prerelease: false
  requirement: &id002 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        hash: 27
        segments:
        - 1
        - 3
        - 0
        version: 1.3.0
  type: :development
  version_requirements: *id002
- !ruby/object:Gem::Dependency
  name: awesome_print
  prerelease: false
  requirement: &id003 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        hash: 23
        segments:
        - 0
        - 3
        - 2
        version: 0.3.2
  type: :development
  version_requirements: *id003
- !ruby/object:Gem::Dependency
  name: anemone
  prerelease: false
  requirement: &id004 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        hash: 11
        segments:
        - 0
        - 5
        - 0
        version: 0.5.0
  type: :runtime
  version_requirements: *id004
description: "Crawl a site with 'clean URLs' and generate files and folders from it. Example: the URL /page will become /page/index.html instead of /page.html, so you can serve it straight from Apache and all the links still work."
email:
- hello@juliendesrosiers.com
executables: []

extensions: []

extra_rdoc_files: []

files:
- .gitignore
- Gemfile
- README.textile
- Rakefile
- lib/static_generator.rb
- lib/static_generator/version.rb
- spec/crawler_spec.rb
- spec/destination_directory/.gitkeep
- spec/fakeweb_helper.rb
- spec/spec_helper.rb
- static_generator.gemspec
has_rdoc: true
homepage: ""
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      hash: 3
      segments:
      - 0
      version: "0"
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      hash: 3
      segments:
      - 0
      version: "0"
requirements: []

rubyforge_project: static_generator
rubygems_version: 1.4.1
signing_key:
specification_version: 3
summary: Crawler that generates a file structure that can be served by Apache without any change
test_files:
- spec/crawler_spec.rb
- spec/destination_directory/.gitkeep
- spec/fakeweb_helper.rb
- spec/spec_helper.rb