static_generator 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +6 -0
- data/README.textile +32 -0
- data/Rakefile +22 -0
- data/lib/static_generator.rb +76 -0
- data/lib/static_generator/version.rb +3 -0
- data/spec/crawler_spec.rb +133 -0
- data/spec/destination_directory/.gitkeep +0 -0
- data/spec/fakeweb_helper.rb +65 -0
- data/spec/spec_helper.rb +8 -0
- data/static_generator.gemspec +27 -0
- metadata +144 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.textile
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
h1. Why did I create Static Generator?
|
2
|
+
|
3
|
+
Because I wanted a simple way to grab the HTML from a site generated with my Ruby on Rails CMS.
|
4
|
+
The cool thing with StaticGenerator compared to other solutions (wget) is that when I have a URL like /home/subpage, instead
|
5
|
+
of having this file structure:
|
6
|
+
|
7
|
+
pre. /home
|
8
|
+
/subpage.html
|
9
|
+
|
10
|
+
I have this file structure:
|
11
|
+
|
12
|
+
pre. /home
|
13
|
+
/subpage
|
14
|
+
/index.html
|
15
|
+
|
16
|
+
so all the links in the generated site will stay the same.
|
17
|
+
|
18
|
+
h2. Usage
|
19
|
+
|
20
|
+
pre. require 'static_generator'
|
21
|
+
crawler = StaticGenerator::Crawler.new({
|
22
|
+
:url=>'http://mysite.com/',
|
23
|
+
:destination_path => File.expand_path('some/destination/directory'),
|
24
|
+
:url_prefix => 'http://mysite.com/'
|
25
|
+
})
|
26
|
+
crawler.crawl!
|
27
|
+
|
28
|
+
h2. Author
|
29
|
+
|
30
|
+
Julien Desrosiers
|
31
|
+
|
32
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
6
|
+
task :default => :spec
|
7
|
+
|
8
|
+
namespace(:spec) do
|
9
|
+
desc "Run all specs on multiple ruby versions (requires rvm)"
|
10
|
+
task(:portability) do
|
11
|
+
%w[1.8.6 1.8.7 1.9.2].each do |version|
|
12
|
+
system <<-BASH
|
13
|
+
bash -c 'source ~/.rvm/scripts/rvm;
|
14
|
+
rvm #{version};
|
15
|
+
echo "--------- version #{version} ----------\n";
|
16
|
+
bundle install;
|
17
|
+
rake spec'
|
18
|
+
BASH
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
require 'anemone'
require 'fileutils'

module StaticGenerator

  # Raised when the :url_prefix option is missing, does not end with '/',
  # or is not a prefix of the crawl URL.
  class WrongURLPrefixError < ArgumentError
  end

  # Raised when the :destination_path option is missing or is not an
  # existing directory.
  class DestinationPathDoesNotExist < ArgumentError
  end

  # Raised when the destination directory exists but is not writable.
  class DestinationPathNotWritableError < ArgumentError
  end

  # Wraps a page object produced by Anemone and knows how to express its
  # URL as a path relative to the site's URL prefix.
  class Page

    attr_reader :crawled_page

    # crawled_page - page object yielded by Anemone during the crawl
    # url_prefix   - site root (e.g. 'http://mysite.com/') to strip from URLs
    def initialize(crawled_page, url_prefix)
      @crawled_page = crawled_page # from anemone
      @url_prefix = url_prefix
    end

    # URL of the page with the site prefix removed, e.g. 'home/subpage'.
    def short_path
      @crawled_page.url.to_s[@url_prefix.length..-1]
    end

    # Raw HTML body of the crawled page.
    def body
      @crawled_page.body
    end
  end

  # Crawls a site and writes every page to
  # <destination_path>/<short_path>/index.html, so "clean" URLs such as
  # /home/subpage keep working when the tree is served statically.
  class Crawler

    attr_reader :pages, :destination_path, :url_prefix

    # opts - hash with :url, :url_prefix and :destination_path keys.
    #
    # Raises WrongURLPrefixError, DestinationPathDoesNotExist or
    # DestinationPathNotWritableError when the options are invalid.
    def initialize(opts)
      @destination_path = opts[:destination_path]
      @url_prefix = opts[:url_prefix]
      @url = opts[:url]
      if @url_prefix.nil?
        raise WrongURLPrefixError, "Expected an `url_prefix` option for the given URL."
      elsif !@url.to_s.start_with?(@url_prefix)
        # Plain string comparison instead of an interpolated regexp: regexp
        # metacharacters in the prefix (the '.' in a domain, for instance)
        # must not produce false matches.
        raise WrongURLPrefixError, "Expected the `url_prefix` option to exist in the given URL."
      elsif @url_prefix[-1, 1] != '/'
        raise WrongURLPrefixError, "Expected the `url_prefix` to end with a '/'."
      end

      # Guard against nil before the File checks so a missing option raises
      # the domain error rather than a TypeError from File.directory?.
      if @destination_path.nil? || !File.directory?(@destination_path)
        raise DestinationPathDoesNotExist, "Expected `destination_path` to be an existing directory."
      elsif !File.writable?(@destination_path)
        raise DestinationPathNotWritableError, "Expected `destination_path` to be writable."
      end
    end

    # Crawls @url with Anemone, collecting every page, then writes the
    # static file tree via #generate_folders.
    def crawl!
      @pages = []
      Anemone.crawl(@url) do |anemone|
        anemone.on_every_page do |page|
          @pages << Page.new(page, @url_prefix)
        end
      end

      generate_folders
    end

    # Creates one directory per crawled page and writes the page body to an
    # index.html file inside it.
    def generate_folders
      @pages.each{|page|
        directory = File.expand_path(@destination_path)+File::SEPARATOR+(page.short_path.split('/').join(File::SEPARATOR))
        FileUtils.mkdir_p( directory )
        File.open("#{directory}/index.html", 'w') {|f| f.write(page.body) }
      }
    end
  end

end
# Specs for StaticGenerator::Crawler. Pages are faked with FakePage (see
# fakeweb_helper.rb) so no network access is needed.
# NOTE: File.exists? (removed in Ruby 3.2) replaced with File.exist?.
require File.dirname(__FILE__) + '/spec_helper'
require 'ap'

module StaticGenerator
  describe Crawler do
    before(:each) do
      @options = {
        :destination_path => File.expand_path('spec/destination_directory'),
        :url_prefix => 'http://www.example.com/'
      }
    end

    after(:each) do
      # Remove everything the crawler wrote under the destination directory.
      FileUtils.rm_rf(Dir.glob(File.expand_path('spec/destination_directory')+File::SEPARATOR+'*'))
    end

    context 'crawling' do

      it 'should find a page that links to another pages' do
        pages = []
        pages << FakePage.new('0', :links => '1')
        pages << FakePage.new('1')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!

        @crawler.pages.size.should == 2
      end

      it 'should find a page that links to two other pages' do
        pages = []
        pages << FakePage.new('0', :links => ['1','2'])
        pages << FakePage.new('1')
        pages << FakePage.new('2')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!

        @crawler.pages.size.should == 3
      end

      it 'should have the right prefix for the given url' do
        @crawler = Crawler.new(@options.merge({:url=>FakePage.new('home').url}))
        @crawler.crawl!
        @crawler.pages[0].short_path.should == 'home'

        pages = []
        pages << FakePage.new('0', :links => ['0/1'])
        pages << FakePage.new('0/1')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!
        @crawler.pages[1].short_path.should == '0/1'

        pages = []
        pages << FakePage.new('root', :links => ['root/subpage/subsubpage'])
        pages << FakePage.new('root/subpage/subsubpage')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!
        @crawler.pages[1].short_path.should == 'root/subpage/subsubpage'

        # ensure we have a / at the end
        lambda {
          @crawler = Crawler.new(@options.merge({:url=>FakePage.new('root').url,:url_prefix => 'http://www.example.com'}))
        }.should raise_error WrongURLPrefixError
      end

      it 'should follow a relative link' do
        pages = []
        pages << FakePage.new('home', :hrefs => ['/subpage', 'otherpage'])
        pages << FakePage.new('subpage')
        pages << FakePage.new('otherpage')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!
        @crawler.pages[0].short_path.should == 'home'
        @crawler.pages[1].short_path.should == 'subpage'
        @crawler.pages[2].short_path.should == 'otherpage'
        @crawler.pages[2].crawled_page.url.to_s.should == 'http://www.example.com/otherpage'
      end
    end

    context 'folder' do
      it 'should not be created if destination_path does not exist' do
        lambda {
          @crawler = Crawler.new(@options.merge({
            :url=>FakePage.new('home').url,
            :destination_path=>File.expand_path('spec/destination_directory/')+File::SEPARATOR+'folder_that_doesnt_exist'
          }))
        }.should raise_error DestinationPathDoesNotExist
      end

      it 'should be created' do
        @crawler = Crawler.new(@options.merge({:url=>FakePage.new('home').url}))
        @crawler.crawl!
        File.exist?(File.expand_path('spec/destination_directory/')+File::SEPARATOR+'home').should == true
      end

      it 'should have an index.html file in it' do
        @crawler = Crawler.new(@options.merge({:url=>FakePage.new('home').url}))
        @crawler.crawl!
        File.exist?(File.expand_path('spec/destination_directory/')+File::SEPARATOR+'home'+File::SEPARATOR+'index.html').should == true
      end

      it 'should throw an error if is not writable' do
        lambda {
          @crawler = Crawler.new(@options.merge({
            :url=>FakePage.new('home').url,
            :destination_path=>File.expand_path('spec/non_writable/')
          }))
        }.should raise_error DestinationPathNotWritableError
      end
    end

    context 'folder with a sub folder in it' do

      it 'should be created' do
        pages = []
        pages << FakePage.new('folder', :links => ['folder/subfolder'])
        pages << FakePage.new('folder/subfolder')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!
        File.exist?(File.expand_path('spec/destination_directory/')+File::SEPARATOR+'folder'+File::SEPARATOR+'subfolder').should == true
      end

      it 'should have an index.html file in it' do
        pages = []
        pages << FakePage.new('folder', :links => ['folder/subfolder'])
        pages << FakePage.new('folder/subfolder')
        @crawler = Crawler.new(@options.merge({:url=>pages[0].url}))
        @crawler.crawl!
        File.exist?(File.expand_path('spec/destination_directory/')+File::SEPARATOR+'folder'+File::SEPARATOR+'subfolder'+File::SEPARATOR+'index.html').should == true
      end
    end

  end
end
File without changes
|
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test StaticGenerator"
  exit
end

# No real HTTP during specs: every URL must be registered with FakeWeb.
FakeWeb.allow_net_connect = false

module StaticGenerator
  SPEC_DOMAIN = "http://www.example.com/"

  # Registers a fake HTML page (and, optionally, a redirect) with FakeWeb
  # so the crawler specs can run without network access.
  class FakePage
    attr_accessor :links
    attr_accessor :hrefs
    attr_accessor :body

    # name    - path of the page under SPEC_DOMAIN
    # options - :links (page names linked to, absolute under SPEC_DOMAIN),
    #           :hrefs (raw href attribute values), :redirect (target page),
    #           :content_type (default "text/html"), :body (override HTML)
    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    # Absolute URL of this fake page.
    def url
      SPEC_DOMAIN + @name
    end

    private

    # Builds a minimal HTML body containing the requested links/hrefs.
    def create_body
      @body = "<html><body>"
      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
      @body += "</body></html>"
    end

    # Registers this page (and its redirect target, if any) with FakeWeb.
    def add_to_fakeweb
      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]

        # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
        # (anchored match: a relative path merely *containing* "http" must not
        # be mistaken for an absolute URL)
        redirect_url = (@redirect =~ %r{\Ahttps?://}) ? @redirect : SPEC_DOMAIN + @redirect
        options[:location] = redirect_url

        # register the page this one redirects to
        FakeWeb.register_uri(:get, redirect_url, {:body => '',
                                                  :content_type => @content_type,
                                                  :status => [200, "OK"]})
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
    end
  end
end

#default root
StaticGenerator::FakePage.new
data/static_generator.gemspec
ADDED
# -*- encoding: utf-8 -*-
# Gem specification for static_generator.
$:.push File.expand_path("../lib", __FILE__)
require "static_generator/version"

Gem::Specification.new do |s|
  s.name        = "static_generator"
  s.version     = StaticGenerator::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Julien Desrosiers"]
  s.email       = ["hello@juliendesrosiers.com"]
  s.homepage    = ""
  s.summary     = %q{Crawler that generates a file structure that can be served by Apache without any change}
  s.description = %q{Crawl a site with 'clean-URLs' and generate a files and folders from it. Example: the URL /page will become /page/index.html instead of /page.html so you can serve it straight from Apache and all the links are still working.}

  s.rubyforge_project = "static_generator"

  # File lists are derived from the git index.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Development-only dependencies.
  s.add_development_dependency "rspec", "~> 2.4.0"
  s.add_development_dependency "fakeweb", "~> 1.3.0"
  s.add_development_dependency "awesome_print", "~> 0.3.2"

  # Runtime dependency: Anemone performs the actual crawling.
  s.add_dependency "anemone", "~> 0.5.0"
end
metadata
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: static_generator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Julien Desrosiers
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-02-10 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rspec
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 31
|
30
|
+
segments:
|
31
|
+
- 2
|
32
|
+
- 4
|
33
|
+
- 0
|
34
|
+
version: 2.4.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: fakeweb
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 27
|
46
|
+
segments:
|
47
|
+
- 1
|
48
|
+
- 3
|
49
|
+
- 0
|
50
|
+
version: 1.3.0
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: awesome_print
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 23
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
- 3
|
65
|
+
- 2
|
66
|
+
version: 0.3.2
|
67
|
+
type: :development
|
68
|
+
version_requirements: *id003
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: anemone
|
71
|
+
prerelease: false
|
72
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
hash: 11
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
- 5
|
81
|
+
- 0
|
82
|
+
version: 0.5.0
|
83
|
+
type: :runtime
|
84
|
+
version_requirements: *id004
|
85
|
+
description: "Crawl a site with 'clean-URLs' and generate a files and folders from it. Example: the URL /page will become /page/index.html instead of /page.html so you can serve it straight from Apache and all the links are still working."
|
86
|
+
email:
|
87
|
+
- hello@juliendesrosiers.com
|
88
|
+
executables: []
|
89
|
+
|
90
|
+
extensions: []
|
91
|
+
|
92
|
+
extra_rdoc_files: []
|
93
|
+
|
94
|
+
files:
|
95
|
+
- .gitignore
|
96
|
+
- Gemfile
|
97
|
+
- README.textile
|
98
|
+
- Rakefile
|
99
|
+
- lib/static_generator.rb
|
100
|
+
- lib/static_generator/version.rb
|
101
|
+
- spec/crawler_spec.rb
|
102
|
+
- spec/destination_directory/.gitkeep
|
103
|
+
- spec/fakeweb_helper.rb
|
104
|
+
- spec/spec_helper.rb
|
105
|
+
- static_generator.gemspec
|
106
|
+
has_rdoc: true
|
107
|
+
homepage: ""
|
108
|
+
licenses: []
|
109
|
+
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
|
113
|
+
require_paths:
|
114
|
+
- lib
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
hash: 3
|
121
|
+
segments:
|
122
|
+
- 0
|
123
|
+
version: "0"
|
124
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
125
|
+
none: false
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
hash: 3
|
130
|
+
segments:
|
131
|
+
- 0
|
132
|
+
version: "0"
|
133
|
+
requirements: []
|
134
|
+
|
135
|
+
rubyforge_project: static_generator
|
136
|
+
rubygems_version: 1.4.1
|
137
|
+
signing_key:
|
138
|
+
specification_version: 3
|
139
|
+
summary: Crawler that generates a file structure that can be served by Apache without any change
|
140
|
+
test_files:
|
141
|
+
- spec/crawler_spec.rb
|
142
|
+
- spec/destination_directory/.gitkeep
|
143
|
+
- spec/fakeweb_helper.rb
|
144
|
+
- spec/spec_helper.rb
|