staticizer 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +13 -48
- data/Rakefile +0 -6
- data/lib/staticizer/command.rb +1 -5
- data/lib/staticizer/crawler.rb +79 -82
- data/lib/staticizer/version.rb +1 -1
- data/staticizer.gemspec +0 -1
- data/tests/crawler_test.rb +10 -75
- metadata +3 -18
- data/tests/fake_page.html +0 -288
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ae3789f55851bb180c8e5e58b23629b3492ff21d3918f9851e16a061d5218d88
|
4
|
+
data.tar.gz: fbe8944e876c152b066ea76d7a1d9d7769bfe82d45a4fece95aa5a8edfa4f253
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd2bde3224a14384d04a5e3883b4c296679eb9981a6cbebf2df12663cbd5108aa1e02e123f4d0b3c0da08bb49981c7652f8f7d036c7edaf9d80f63d4f46f5364
|
7
|
+
data.tar.gz: c124ef043c19155ad3b78b660f2022e2789d09b4207905e8b67b250148333f8568f108a2788c0e2d85e90c64eaadc916ff2b2dfb3a46b64f544d91e994c55595
|
data/README.md
CHANGED
@@ -9,29 +9,14 @@ website. If the website goes down this backup would be available
|
|
9
9
|
with reduced functionality.
|
10
10
|
|
11
11
|
S3 and Route 53 provide a great way to host a static emergency backup for a website.
|
12
|
-
See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
|
13
|
-
. In our experience it works well and is incredibly cheap
|
14
|
-
with a few hundred pages and assets is less than US$1 a month.
|
15
|
-
|
16
|
-
We tried using existing tools httrack/wget to crawl and create a static version
|
17
|
-
of the site to upload to S3, but we found that they did not work well with S3 hosting.
|
18
|
-
We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
|
19
|
-
the existing site. This way when the site goes down incoming links from Google search
|
20
|
-
results etc. will still work.
|
21
|
-
|
22
|
-
## TODO
|
12
|
+
See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
|
13
|
+
. In our experience it works very well and is incredibly cheap at less than US$1 a month (depending on the size of the website).
|
23
14
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
* Multithread the crawler
|
30
|
-
* Check for too many redirects
|
31
|
-
* Provide regex options for what urls are scraped
|
32
|
-
* Better handling of incorrect server mime types (ex. server returns text/plain for css instead of text/css)
|
33
|
-
* Provide more options for uploading (upload via scp, ftp, custom etc.). Split out save/uploading into an interface.
|
34
|
-
* Handle large files in a more memory efficient way by streaming uploads/downloads
|
15
|
+
We tried using existing tools httrack/wget to crawl and create a static version
|
16
|
+
of the site to upload to S3, but we found that they did not work well with S3 hosting.
|
17
|
+
We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
|
18
|
+
the existing site. This way when the site goes down incoming links from Google search
|
19
|
+
results etc. will still work.
|
35
20
|
|
36
21
|
## Installation
|
37
22
|
|
@@ -45,11 +30,12 @@ And then execute:
|
|
45
30
|
|
46
31
|
Or install it yourself as:
|
47
32
|
|
48
|
-
$ gem install
|
33
|
+
$ gem install staticizer
|
49
34
|
|
50
35
|
## Command line usage
|
51
36
|
|
52
|
-
|
37
|
+
The tool can be used either via the 'staticizer' command-line tool or by requiring the library.
|
38
|
+
|
53
39
|
|
54
40
|
### Crawl a website and write to disk
|
55
41
|
|
@@ -75,8 +61,6 @@ This will only crawl urls in the domain squaremill.com
|
|
75
61
|
|
76
62
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
77
63
|
:aws => {
|
78
|
-
:region => "us-west-1",
|
79
|
-
:endpoint => "http://s3.amazonaws.com",
|
80
64
|
:bucket_name => "www.squaremill.com",
|
81
65
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
82
66
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -89,27 +73,10 @@ This will only crawl urls in the domain squaremill.com
|
|
89
73
|
s = Staticizer::Crawler.new("http://squaremill.com", :output_dir => "/tmp/crawl")
|
90
74
|
s.crawl
|
91
75
|
|
92
|
-
|
93
|
-
### Crawl a website and make all pages contain 'noindex' meta tag
|
94
|
-
|
95
|
-
s = Staticizer::Crawler.new("http://squaremill.com",
|
96
|
-
:output_dir => "/tmp/crawl",
|
97
|
-
:process_body => lambda {|body, uri, opts|
|
98
|
-
# not the best regex, but it will do for our use
|
99
|
-
body = body.gsub(/<meta\s+name=['"]robots[^>]+>/i,'')
|
100
|
-
body = body.gsub(/<head>/i,"<head>\n<meta name='robots' content='noindex'>")
|
101
|
-
body
|
102
|
-
}
|
103
|
-
)
|
104
|
-
s.crawl
|
105
|
-
|
106
|
-
|
107
76
|
### Crawl a website and rewrite all non www urls to www
|
108
77
|
|
109
78
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
110
79
|
:aws => {
|
111
|
-
:region => "us-west-1",
|
112
|
-
:endpoint => "http://s3.amazonaws.com",
|
113
80
|
:bucket_name => "www.squaremill.com",
|
114
81
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
115
82
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -125,16 +92,14 @@ This will only crawl urls in the domain squaremill.com
|
|
125
92
|
)
|
126
93
|
s.crawl
|
127
94
|
|
128
|
-
##
|
95
|
+
## Crawler Options
|
129
96
|
|
130
97
|
* :aws - Hash of connection options passed to aws/sdk gem
|
131
|
-
* :filter_url -
|
98
|
+
* :filter_url - proc called to decide whether a discovered URL should be crawled; return nil to skip the URL, or return the URL (optionally modified) to crawl it
|
132
99
|
* :output_dir - if writing a site to disk the directory to write to, will be created if it does not exist
|
133
100
|
* :logger - A logger object responding to the usual Ruby Logger methods.
|
134
101
|
* :log_level - Log level - defaults to INFO.
|
135
|
-
|
136
|
-
* :process_body - lambda called to pre-process body of content before writing it out.
|
137
|
-
* :skip_write - don't write retrieved files to disk or s3, just crawl the site (can be used to find 404s etc.)
|
102
|
+
* :valid_domains - Array of domains that should be crawled. Domains not in this list will be ignored.
|
138
103
|
|
139
104
|
## Contributing
|
140
105
|
|
data/Rakefile
CHANGED
data/lib/staticizer/command.rb
CHANGED
@@ -44,10 +44,6 @@ module Staticizer
|
|
44
44
|
options[:logger] = Logger.new(v)
|
45
45
|
end
|
46
46
|
|
47
|
-
opts.on("--skip-write [PATH]", "Don't write out files to disk or s3") do |v|
|
48
|
-
options[:skip_write] = true
|
49
|
-
end
|
50
|
-
|
51
47
|
opts.on("--valid-domains x,y,z", Array, "Comma separated list of domains that should be crawled, other domains will be ignored") do |v|
|
52
48
|
options[:valid_domains] = v
|
53
49
|
end
|
@@ -59,7 +55,7 @@ module Staticizer
|
|
59
55
|
end
|
60
56
|
end
|
61
57
|
|
62
|
-
begin
|
58
|
+
begin
|
63
59
|
parser.parse!(args)
|
64
60
|
initial_page = ARGV.pop
|
65
61
|
raise ArgumentError, "Need to specify an initial URL to start the crawl" unless initial_page
|
data/lib/staticizer/crawler.rb
CHANGED
@@ -6,9 +6,6 @@ require 'logger'
|
|
6
6
|
|
7
7
|
module Staticizer
|
8
8
|
class Crawler
|
9
|
-
attr_reader :url_queue
|
10
|
-
attr_accessor :output_dir
|
11
|
-
|
12
9
|
def initialize(initial_page, opts = {})
|
13
10
|
if initial_page.nil?
|
14
11
|
raise ArgumentError, "Initial page required"
|
@@ -17,36 +14,24 @@ module Staticizer
|
|
17
14
|
@opts = opts.dup
|
18
15
|
@url_queue = []
|
19
16
|
@processed_urls = []
|
20
|
-
@
|
17
|
+
@opts[:output_dir] ||= File.expand_path("crawl/")
|
21
18
|
@log = @opts[:logger] || Logger.new(STDOUT)
|
22
19
|
@log.level = @opts[:log_level] || Logger::INFO
|
23
20
|
|
24
21
|
if @opts[:aws]
|
25
22
|
bucket_name = @opts[:aws].delete(:bucket_name)
|
26
|
-
|
27
|
-
@s3_bucket =
|
23
|
+
AWS.config(opts[:aws])
|
24
|
+
@s3_bucket = AWS::S3.new.buckets[bucket_name]
|
25
|
+
@s3_bucket.acl = :public_read
|
28
26
|
end
|
29
27
|
|
30
28
|
if @opts[:valid_domains].nil?
|
31
29
|
uri = URI.parse(initial_page)
|
32
30
|
@opts[:valid_domains] ||= [uri.host]
|
33
31
|
end
|
34
|
-
|
35
|
-
if @opts[:process_body]
|
36
|
-
@process_body = @opts[:process_body]
|
37
|
-
end
|
38
|
-
|
39
32
|
add_url(initial_page)
|
40
33
|
end
|
41
34
|
|
42
|
-
def log_level
|
43
|
-
@log.level
|
44
|
-
end
|
45
|
-
|
46
|
-
def log_level=(level)
|
47
|
-
@log.level = level
|
48
|
-
end
|
49
|
-
|
50
35
|
def crawl
|
51
36
|
@log.info("Starting crawl")
|
52
37
|
while(@url_queue.length > 0)
|
@@ -57,6 +42,15 @@ module Staticizer
|
|
57
42
|
@log.info("Finished crawl")
|
58
43
|
end
|
59
44
|
|
45
|
+
def extract_videos(doc, base_uri)
|
46
|
+
doc.xpath("//video").map do |video|
|
47
|
+
sources = video.xpath("//source/@src").map {|src| make_absolute(base_uri, src)}
|
48
|
+
poster = video.attributes["poster"].to_s
|
49
|
+
make_absolute(base_uri, poster)
|
50
|
+
[poster, sources]
|
51
|
+
end.flatten.uniq.compact
|
52
|
+
end
|
53
|
+
|
60
54
|
def extract_hrefs(doc, base_uri)
|
61
55
|
doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
|
62
56
|
end
|
@@ -74,7 +68,12 @@ module Staticizer
|
|
74
68
|
end
|
75
69
|
|
76
70
|
def extract_css_urls(css, base_uri)
|
77
|
-
css.scan(/url\(
|
71
|
+
css.scan(/url\(([^)]+)\)/).map do |src|
|
72
|
+
path = src[0]
|
73
|
+
# URLs in CSS can be wrapped with " or ' (ex: url("http://something/")); strip these
|
74
|
+
path = path.strip.gsub(/^['"]/, "").gsub(/['"]$/,"")
|
75
|
+
make_absolute(base_uri, path)
|
76
|
+
end
|
78
77
|
end
|
79
78
|
|
80
79
|
def add_urls(urls, info = {})
|
@@ -82,10 +81,19 @@ module Staticizer
|
|
82
81
|
end
|
83
82
|
|
84
83
|
def make_absolute(base_uri, href)
|
85
|
-
|
84
|
+
if href.to_s =~ /^https?/i
|
85
|
+
# If the uri is already absolute then don't do anything to it except make spaces to + (otherwise
|
86
|
+
# will not retrieve)
|
87
|
+
href.to_s.gsub(" ", "+")
|
88
|
+
else
|
89
|
+
dup_uri = base_uri.dup
|
90
|
+
# Remove the query params as otherwise will try use those when making absolute uri
|
91
|
+
dup_uri.query = nil
|
92
|
+
URI::join(dup_uri.to_s, href).to_s
|
93
|
+
end
|
86
94
|
rescue StandardError => e
|
87
|
-
@log.error "Could not make absolute
|
88
|
-
|
95
|
+
@log.error "Could not make absolute #{dup_uri} - #{href}"
|
96
|
+
nil
|
89
97
|
end
|
90
98
|
|
91
99
|
def add_url(url, info = {})
|
@@ -102,23 +110,22 @@ module Staticizer
|
|
102
110
|
@url_queue << [url, info]
|
103
111
|
end
|
104
112
|
|
105
|
-
def save_page(response, uri)
|
106
|
-
return if @opts[:skip_write]
|
113
|
+
def save_page(response, uri, opts = {})
|
107
114
|
if @opts[:aws]
|
108
|
-
save_page_to_aws(response, uri)
|
115
|
+
save_page_to_aws(response, uri, opts)
|
109
116
|
else
|
110
|
-
save_page_to_disk(response, uri)
|
117
|
+
save_page_to_disk(response, uri, opts)
|
111
118
|
end
|
112
119
|
end
|
113
120
|
|
114
|
-
def save_page_to_disk(response, uri)
|
121
|
+
def save_page_to_disk(response, uri, opts = {})
|
115
122
|
path = uri.path
|
116
|
-
path += "?#{uri.query}" if uri.query
|
123
|
+
path += "?#{uri.query}" if uri.query && !opts[:no_query] && !@opts[:no_query]
|
117
124
|
|
118
125
|
path_segments = path.scan(%r{[^/]*/})
|
119
126
|
filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path
|
120
127
|
|
121
|
-
current = @output_dir
|
128
|
+
current = @opts[:output_dir]
|
122
129
|
FileUtils.mkdir_p(current) unless File.exist?(current)
|
123
130
|
|
124
131
|
# Create all the directories necessary for this file
|
@@ -138,75 +145,71 @@ module Staticizer
|
|
138
145
|
end
|
139
146
|
|
140
147
|
body = response.respond_to?(:read_body) ? response.read_body : response
|
141
|
-
body = process_body(body, uri,
|
148
|
+
body = @opts[:process_body].call(body, uri, opts) if @opts[:process_body]
|
142
149
|
outfile = File.join(current, "/#{filename}")
|
150
|
+
|
143
151
|
if filename == ""
|
144
152
|
indexfile = File.join(outfile, "/index.html")
|
153
|
+
return if opts[:no_overwrite] && File.exists?(indexfile)
|
145
154
|
@log.info "Saving #{indexfile}"
|
146
155
|
File.open(indexfile, "wb") {|f| f << body }
|
147
156
|
elsif File.directory?(outfile)
|
148
157
|
dirfile = outfile + ".d"
|
158
|
+
outfile = File.join(outfile, "/index.html")
|
159
|
+
return if opts[:no_overwrite] && File.exists?(outfile)
|
149
160
|
@log.info "Saving #{dirfile}"
|
150
161
|
File.open(dirfile, "wb") {|f| f << body }
|
151
|
-
FileUtils.cp(dirfile,
|
162
|
+
FileUtils.cp(dirfile, outfile)
|
152
163
|
else
|
164
|
+
return if opts[:no_overwrite] && File.exists?(outfile)
|
153
165
|
@log.info "Saving #{outfile}"
|
154
166
|
File.open(outfile, "wb") {|f| f << body }
|
155
167
|
end
|
156
168
|
end
|
157
169
|
|
158
|
-
def save_page_to_aws(response, uri)
|
170
|
+
def save_page_to_aws(response, uri, opts = {})
|
159
171
|
key = uri.path
|
160
172
|
key += "?#{uri.query}" if uri.query
|
161
173
|
key = key.gsub(%r{^/},"")
|
162
174
|
key = "index.html" if key == ""
|
163
175
|
# Upload this file directly to AWS::S3
|
164
|
-
opts = {:acl =>
|
176
|
+
opts = {:acl => :public_read}
|
165
177
|
opts[:content_type] = response['content-type'] rescue "text/html"
|
166
178
|
@log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
|
167
179
|
if response.respond_to?(:read_body)
|
168
|
-
|
169
|
-
@s3_bucket.object(key).put(opts.merge(body: body))
|
180
|
+
@s3_bucket.objects[key].write(response.read_body, opts)
|
170
181
|
else
|
171
|
-
|
172
|
-
|
173
|
-
end
|
182
|
+
@s3_bucket.objects[key].write(response, opts)
|
183
|
+
end
|
174
184
|
end
|
175
|
-
|
185
|
+
|
176
186
|
def process_success(response, parsed_uri)
|
177
187
|
url = parsed_uri.to_s
|
178
|
-
if @opts[:filter_process]
|
179
|
-
return if @opts[:filter_process].call(response, parsed_uri)
|
180
|
-
end
|
181
188
|
case response['content-type']
|
182
189
|
when /css/
|
183
|
-
save_page(response, parsed_uri)
|
184
|
-
add_urls(extract_css_urls(response.body,
|
190
|
+
save_page(response, parsed_uri, no_query: true)
|
191
|
+
add_urls(extract_css_urls(response.body, parsed_uri), {:type_hint => "css_url"})
|
185
192
|
when /html/
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
add_urls(
|
190
|
-
add_urls(
|
191
|
-
add_urls(
|
192
|
-
add_urls(
|
193
|
+
body = response.body
|
194
|
+
save_page(body.gsub("https://www.canaan.com", ""), parsed_uri)
|
195
|
+
doc = Nokogiri::HTML(body)
|
196
|
+
add_urls(extract_videos(doc, parsed_uri), {:type_hint => "video"})
|
197
|
+
add_urls(extract_links(doc, parsed_uri), {:type_hint => "link"})
|
198
|
+
add_urls(extract_scripts(doc, parsed_uri), {:type_hint => "script"})
|
199
|
+
add_urls(extract_images(doc, parsed_uri), {:type_hint => "image"})
|
200
|
+
add_urls(extract_hrefs(doc, parsed_uri), {:type_hint => "href"})
|
201
|
+
# extract inline style="background-image:url('https://')" type of urls
|
202
|
+
add_urls(extract_css_urls(body, parsed_uri), {:type_hint => "css_url"})
|
193
203
|
else
|
194
|
-
save_page(response, parsed_uri)
|
204
|
+
save_page(response, parsed_uri, no_query: true)
|
195
205
|
end
|
196
206
|
end
|
197
207
|
|
198
208
|
# If we hit a redirect we save the redirect as a meta refresh page
|
199
209
|
# TODO: for AWS S3 hosting we could instead create a redirect?
|
200
|
-
def process_redirect(url, destination_url)
|
210
|
+
def process_redirect(url, destination_url, opts = {})
|
201
211
|
body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
|
202
|
-
save_page(body, url)
|
203
|
-
end
|
204
|
-
|
205
|
-
def process_body(body, uri, opts)
|
206
|
-
if @process_body
|
207
|
-
body = @process_body.call(body, uri, opts)
|
208
|
-
end
|
209
|
-
body
|
212
|
+
save_page(body, url, opts)
|
210
213
|
end
|
211
214
|
|
212
215
|
# Fetch a URI and save it to disk
|
@@ -215,37 +218,31 @@ module Staticizer
|
|
215
218
|
parsed_uri = URI(url)
|
216
219
|
|
217
220
|
@log.debug "Fetching #{parsed_uri}"
|
218
|
-
|
221
|
+
|
219
222
|
# Attempt to use an already open Net::HTTP connection
|
220
223
|
key = parsed_uri.host + parsed_uri.port.to_s
|
221
224
|
connection = @http_connections[key]
|
222
225
|
if connection.nil?
|
223
226
|
connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
224
|
-
connection.use_ssl = true if parsed_uri.scheme
|
227
|
+
connection.use_ssl = true if parsed_uri.scheme == "https"
|
225
228
|
@http_connections[key] = connection
|
226
229
|
end
|
227
230
|
|
228
231
|
request = Net::HTTP::Get.new(parsed_uri.request_uri)
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
@log.error "Error #{response.code}:#{response.message} fetching url #{url}"
|
241
|
-
end
|
232
|
+
connection.request(request) do |response|
|
233
|
+
case response
|
234
|
+
when Net::HTTPSuccess
|
235
|
+
process_success(response, parsed_uri)
|
236
|
+
when Net::HTTPRedirection
|
237
|
+
redirect_url = response['location']
|
238
|
+
@log.debug "Processing redirect to #{redirect_url}"
|
239
|
+
process_redirect(parsed_uri, redirect_url)
|
240
|
+
add_url(redirect_url)
|
241
|
+
else
|
242
|
+
@log.error "Error #{response.code}:#{response.message} fetching url #{url}"
|
242
243
|
end
|
243
|
-
rescue OpenSSL::SSL::SSLError => e
|
244
|
-
@log.error "SSL Error #{e.message} fetching url #{url}"
|
245
|
-
rescue Errno::ECONNRESET => e
|
246
|
-
@log.error "Error #{e.class}:#{e.message} fetching url #{url}"
|
247
244
|
end
|
248
245
|
end
|
249
246
|
|
250
247
|
end
|
251
|
-
end
|
248
|
+
end
|
data/lib/staticizer/version.rb
CHANGED
data/staticizer.gemspec
CHANGED
@@ -20,7 +20,6 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
|
-
spec.add_development_dependency "webmock"
|
24
23
|
|
25
24
|
spec.add_runtime_dependency 'nokogiri'
|
26
25
|
spec.add_runtime_dependency 'aws-sdk'
|
data/tests/crawler_test.rb
CHANGED
@@ -1,80 +1,15 @@
|
|
1
1
|
require 'minitest/autorun'
|
2
|
-
require 'ostruct'
|
3
2
|
|
4
|
-
|
5
|
-
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
|
6
|
-
|
7
|
-
require 'staticizer'
|
3
|
+
# TODO!
|
8
4
|
|
9
5
|
class TestFilePaths < MiniTest::Unit::TestCase
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
"http://test.com" => "index.html",
|
20
|
-
"http://test.com/" => "index.html",
|
21
|
-
"http://test.com/asdfdf/dfdf" => "/asdfdf/dfdf",
|
22
|
-
"http://test.com/asdfdf/dfdf/" => ["/asdfdf/dfdf","/asdfdf/dfdf/index.html"],
|
23
|
-
"http://test.com/asdfad/asdffd.test" => "/asdfad/asdffd.test",
|
24
|
-
"http://test.com/?asdfsd=12312" => "/?asdfsd=12312",
|
25
|
-
"http://test.com/asdfad/asdffd.test?123=sdff" => "/asdfad/asdffd.test?123=sdff",
|
26
|
-
}
|
27
|
-
|
28
|
-
# TODO: Stub out file system using https://github.com/defunkt/fakefs?
|
29
|
-
outputdir = "/tmp/staticizer_crawl_test"
|
30
|
-
FileUtils.rm_rf(outputdir)
|
31
|
-
@crawler.output_dir = outputdir
|
32
|
-
|
33
|
-
file_paths.each do |k,v|
|
34
|
-
@crawler.save_page_to_disk(fake_response, URI.parse(k))
|
35
|
-
[v].flatten.each do |file|
|
36
|
-
expected = File.expand_path(outputdir + "/#{file}")
|
37
|
-
assert File.exists?(expected), "File #{expected} not created for url #{k}"
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def test_save_page_to_aws
|
43
|
-
end
|
44
|
-
|
45
|
-
def test_add_url_with_valid_domains
|
46
|
-
test_url = "http://test.com/test"
|
47
|
-
@crawler.add_url(test_url)
|
48
|
-
assert(@crawler.url_queue[-1] == [test_url, {}], "URL #{test_url} not added to queue")
|
49
|
-
end
|
50
|
-
|
51
|
-
def test_add_url_with_filter
|
52
|
-
end
|
53
|
-
|
54
|
-
def test_initialize_options
|
55
|
-
end
|
56
|
-
|
57
|
-
def test_process_url
|
58
|
-
end
|
59
|
-
|
60
|
-
def test_make_absolute
|
61
|
-
end
|
62
|
-
|
63
|
-
def test_link_extraction
|
64
|
-
end
|
65
|
-
|
66
|
-
def test_href_extraction
|
67
|
-
end
|
68
|
-
|
69
|
-
def test_css_extraction
|
70
|
-
end
|
71
|
-
|
72
|
-
def test_css_url_extraction
|
73
|
-
end
|
74
|
-
|
75
|
-
def test_image_extraction
|
76
|
-
end
|
77
|
-
|
78
|
-
def test_script_extraction
|
79
|
-
end
|
6
|
+
tests = {
|
7
|
+
"" => "index.html"
|
8
|
+
"/" => "index.html"
|
9
|
+
"/asdfdf/dfdf" => "/asdfdf/dfdf"
|
10
|
+
"/asdfdf/dfdf/" => "/asdfdf/dfdf" and "/asdfdf/dfdf/index.html"
|
11
|
+
"/asdfad/asdffd.test" => "/asdfad/asdffd.test"
|
12
|
+
"/?asdfsd=12312" => "/?asdfsd=12312"
|
13
|
+
"/asdfad/asdffd.test?123=sdff" => "/asdfad/asdffd.test?123=sdff"
|
14
|
+
}
|
80
15
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: staticizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Conor Hunt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-12-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: webmock
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: nokogiri
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -101,7 +87,6 @@ files:
|
|
101
87
|
- lib/staticizer/version.rb
|
102
88
|
- staticizer.gemspec
|
103
89
|
- tests/crawler_test.rb
|
104
|
-
- tests/fake_page.html
|
105
90
|
homepage: https://github.com/SquareMill/staticizer
|
106
91
|
licenses:
|
107
92
|
- MIT
|
@@ -122,7 +107,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
107
|
version: '0'
|
123
108
|
requirements: []
|
124
109
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.7.6
|
126
111
|
signing_key:
|
127
112
|
specification_version: 4
|
128
113
|
summary: A tool to create a static version of a website for hosting on S3.
|
data/tests/fake_page.html
DELETED
@@ -1,288 +0,0 @@
|
|
1
|
-
<!DOCTYPE html>
|
2
|
-
<html lang="en">
|
3
|
-
<head>
|
4
|
-
<title>Web Application Design and Development — Square Mill Labs</title>
|
5
|
-
<meta content="authenticity_token" name="csrf-param" />
|
6
|
-
<meta content="LshjtNLXmjVY9NINXYQds+2Ur+jxUtqKVjjbDbVl+9w=" name="csrf-token" />
|
7
|
-
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
8
|
-
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
|
9
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
|
10
|
-
<meta property="og:type" content="website">
|
11
|
-
<meta property="og:url" content="http://squaremill.com/">
|
12
|
-
<meta property="og:image" content="">
|
13
|
-
<meta name="viewport" content="width=device-width, maximum-scale=1.0, initial-scale=1.0">
|
14
|
-
<meta name="description" content="Web Application Design and Development — Square Mill Labs">
|
15
|
-
<link rel="shortcut icon" type="image/png" href="http://squaremill.com/assets/icons/favicon-0fecbe6b20ff5bdf623357a3fac76b4b.png">
|
16
|
-
<link data-turbolinks-track="true" href="/assets/mn_application-5ddad96f16e03ad2137bf02270506e61.css" media="all" rel="stylesheet" />
|
17
|
-
<!--[if lt IE 9]>
|
18
|
-
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
|
19
|
-
<![endif]-->
|
20
|
-
|
21
|
-
<script type="text/javascript" src="//use.typekit.net/cjr4fwy.js"></script>
|
22
|
-
<script type="text/javascript">try{Typekit.load();}catch(e){}</script>
|
23
|
-
</head>
|
24
|
-
|
25
|
-
<body id="public">
|
26
|
-
<script type="text/javascript">
|
27
|
-
|
28
|
-
var _gaq = _gaq || [];
|
29
|
-
_gaq.push(['_setAccount', 'UA-30460332-1']);
|
30
|
-
_gaq.push(['_setDomainName', 'squaremill.com']);
|
31
|
-
_gaq.push(['_trackPageview']);
|
32
|
-
|
33
|
-
(function() {
|
34
|
-
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
|
35
|
-
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
|
36
|
-
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
|
37
|
-
})();
|
38
|
-
|
39
|
-
</script>
|
40
|
-
|
41
|
-
|
42
|
-
<header id="header">
|
43
|
-
<nav class="nav container">
|
44
|
-
<a class="branding" href="http://squaremill.com/" rel="home" title="Square Mill - Digital Products for Web and Mobile">
|
45
|
-
<img alt="Square Mill Logo" class="logo" height="16" src="/assets/m2-wordmark-black-97525464acd136ce26b77e39c7ed2ba3.png" width="128" />
|
46
|
-
<p class="description">
|
47
|
-
Digital Products for Web and Mobile
|
48
|
-
</p>
|
49
|
-
</a> <a class="menu-trigger" href="#">Menu</a>
|
50
|
-
<div class="main-nav">
|
51
|
-
<ul class="container">
|
52
|
-
<li><a href="/projects">Projects</a></li>
|
53
|
-
<li><a href="/about">About Us</a></li>
|
54
|
-
<li><a href="/blog">Blog</a></li>
|
55
|
-
|
56
|
-
<!-- <li class="link-biography"><a href="http://squaremill.com/#biography">People</a></li> -->
|
57
|
-
</ul>
|
58
|
-
</div>
|
59
|
-
|
60
|
-
</nav>
|
61
|
-
|
62
|
-
</header>
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
<div id="site-content">
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
<div class="container" id="home-projects">
|
72
|
-
<section class="big-promo">
|
73
|
-
<div class="project" id="project-7">
|
74
|
-
<a href="/projects/bon-voyaging">
|
75
|
-
<div class="devices">
|
76
|
-
<div class="laptop device">
|
77
|
-
<img alt="" class="chrome" src="/assets/projects/macbook-pro-1be04a78f99b40e2c676d78fc24fcc3d.png" />
|
78
|
-
<div class="screenshot">
|
79
|
-
<img alt="" src="/uploads/project/desktop_image/7/bonvoyaging-desktop.jpg" />
|
80
|
-
</div>
|
81
|
-
</div>
|
82
|
-
|
83
|
-
<div class="handheld device">
|
84
|
-
<img alt="" class="chrome" src="/assets/projects/iphone5-c1ea3a16be931f1e80bacab5cfec932d.png" />
|
85
|
-
<div class="screenshot">
|
86
|
-
<img alt="" src="/uploads/project/iphone_image/7/bonvoyagin-handheld.jpg" />
|
87
|
-
</div>
|
88
|
-
</div>
|
89
|
-
</div>
|
90
|
-
|
91
|
-
<div class="project-description">
|
92
|
-
<div class="summary">
|
93
|
-
<h2>Bon Voyaging <i class="icon-play-sign"></i></h2>
|
94
|
-
<p>Bon Voyaging enables discerning travelers to expertly envision their next voyage from inspiration to exploration. Powerful search tools and a interactive javascript interface make planning trips fun.</p>
|
95
|
-
</div>
|
96
|
-
</div>
|
97
|
-
</a>
|
98
|
-
</div>
|
99
|
-
</section>
|
100
|
-
<section class="big-promo">
|
101
|
-
<div class="project" id="project-1">
|
102
|
-
<a href="/projects/kpcb-fellows">
|
103
|
-
<div class="devices">
|
104
|
-
<div class="laptop device">
|
105
|
-
<img alt="" class="chrome" src="/assets/projects/macbook-pro-1be04a78f99b40e2c676d78fc24fcc3d.png" />
|
106
|
-
<div class="screenshot">
|
107
|
-
<img alt="" src="/uploads/project/desktop_image/1/kpcb-fellows-screenshot.jpg" />
|
108
|
-
</div>
|
109
|
-
</div>
|
110
|
-
|
111
|
-
<div class="handheld device">
|
112
|
-
<img alt="" class="chrome" src="/assets/projects/iphone5-c1ea3a16be931f1e80bacab5cfec932d.png" />
|
113
|
-
<div class="screenshot">
|
114
|
-
<img alt="" src="/uploads/project/iphone_image/1/kpcb-fellows-iphone-screenshot.jpg" />
|
115
|
-
</div>
|
116
|
-
</div>
|
117
|
-
</div>
|
118
|
-
|
119
|
-
<div class="project-description">
|
120
|
-
<div class="summary">
|
121
|
-
<h2>KPCB Fellows Website and Brand <i class="icon-play-sign"></i></h2>
|
122
|
-
<p>The Fellows Program is a three-month work-based program that pairs top U.S. Engineering, Design and Product Design students with leading technology companies</p>
|
123
|
-
</div>
|
124
|
-
</div>
|
125
|
-
</a>
|
126
|
-
</div>
|
127
|
-
</section>
|
128
|
-
<section class="big-promo">
|
129
|
-
<div class="project" id="project-2">
|
130
|
-
<a href="/projects/thomson-reuters-messenger">
|
131
|
-
<div class="devices">
|
132
|
-
<div class="laptop device no-handheld">
|
133
|
-
<img alt="" class="chrome" src="/assets/projects/macbook-pro-1be04a78f99b40e2c676d78fc24fcc3d.png" />
|
134
|
-
<div class="screenshot">
|
135
|
-
<img alt="" src="/uploads/project/desktop_image/2/thomson-reuters-messenger-desktop.png" />
|
136
|
-
</div>
|
137
|
-
</div>
|
138
|
-
|
139
|
-
</div>
|
140
|
-
|
141
|
-
<div class="project-description">
|
142
|
-
<div class="summary">
|
143
|
-
<h2>Thomson Reuters Messenger <i class="icon-play-sign"></i></h2>
|
144
|
-
<p>Messenger is an html5 / javascript instant messenger application for financial professionals</p>
|
145
|
-
</div>
|
146
|
-
</div>
|
147
|
-
</a>
|
148
|
-
</div>
|
149
|
-
</section>
|
150
|
-
<section class="big-promo">
|
151
|
-
<div class="project" id="project-3">
|
152
|
-
<a href="/projects/kleiner-perkins-caufield-byers-digital-presence">
|
153
|
-
<div class="devices">
|
154
|
-
<div class="laptop device">
|
155
|
-
<img alt="" class="chrome" src="/assets/projects/macbook-pro-1be04a78f99b40e2c676d78fc24fcc3d.png" />
|
156
|
-
<div class="screenshot">
|
157
|
-
<img alt="" src="/uploads/project/desktop_image/3/kpcb-screenshot.jpg" />
|
158
|
-
</div>
|
159
|
-
</div>
|
160
|
-
|
161
|
-
<div class="handheld device">
|
162
|
-
<img alt="" class="chrome" src="/assets/projects/iphone5-c1ea3a16be931f1e80bacab5cfec932d.png" />
|
163
|
-
<div class="screenshot">
|
164
|
-
<img alt="" src="/uploads/project/iphone_image/3/kpcb-iphone-screenshot.jpg" />
|
165
|
-
</div>
|
166
|
-
</div>
|
167
|
-
</div>
|
168
|
-
|
169
|
-
<div class="project-description">
|
170
|
-
<div class="summary">
|
171
|
-
<h2>Kleiner Perkins Caufield & Byers Digital Presence <i class="icon-play-sign"></i></h2>
|
172
|
-
<p>KPCB is a venture capital stalwart located in Silicon Valley with over 40 years of tech and science investment.</p>
|
173
|
-
</div>
|
174
|
-
</div>
|
175
|
-
</a>
|
176
|
-
</div>
|
177
|
-
</section>
|
178
|
-
</div>
|
179
|
-
|
180
|
-
<section class="clients full-width">
|
181
|
-
<div class="container">
|
182
|
-
<h2>Clients</h2>
|
183
|
-
<ul class="hlist">
|
184
|
-
<li><a href="http://kpcb.com" rel="friend" target="_blank" title="KPCB's Website"><img alt="KPCB" src="/uploads/client/image/1/home_logo_kpcb-logo.png" /></a></li>
|
185
|
-
<li><a href="http://thomsonreuters.com" rel="friend" target="_blank" title="Thomson Reuters's Website"><img alt="Thomson Reuters" src="/uploads/client/image/2/home_logo_thomsonreuters.png" /></a></li>
|
186
|
-
<li><a href="http://sumzero.com" rel="friend" target="_blank" title="SumZero's Website"><img alt="SumZero" src="/uploads/client/image/3/home_logo_sumzero.png" /></a></li>
|
187
|
-
<li><a href="http://marlboroughgallery.com" rel="friend" target="_blank" title="Marlborough Gallery's Website"><img alt="Marlborough Gallery" src="/uploads/client/image/4/home_logo_marlborough.png" /></a></li>
|
188
|
-
<li><a href="http://flurry.com" rel="friend" target="_blank" title="Flurry Analytics's Website"><img alt="Flurry Analytics" src="/uploads/client/image/8/home_logo_flurry.png" /></a></li>
|
189
|
-
</ul>
|
190
|
-
</div>
|
191
|
-
</section>
|
192
|
-
|
193
|
-
<section class="quote">
|
194
|
-
<blockquote>
|
195
|
-
<p>"Square Mill really took the time to understand our business and think strategically about how we want to engage and communicate with our entrepreneurs online. Together, their small team is responsive, nimble and efficient and has the deep design and technical chops to back it up."</p>
|
196
|
-
<small><a href="http://kpcb.com/partner/christina-lee" rel="friend" title="Christina Lee, Operating Partner at KPCB">Christina Lee</a>, <em>Operating Partner at KPCB</em></small>
|
197
|
-
</blockquote>
|
198
|
-
</section>
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
</div>
|
204
|
-
|
205
|
-
<footer id="footer">
|
206
|
-
<section class="container">
|
207
|
-
<a class="logo" href="http://squaremill.com/" rel="home">
|
208
|
-
<img alt="Square Mill Logo" height="64" src="/assets/md-logo-black-942423ecfd86c43ec6f13f163ea03f97.png" width="64" />
|
209
|
-
</a> <div class="main-nav">
|
210
|
-
<ul class="container">
|
211
|
-
<li><a href="/projects">Projects</a></li>
|
212
|
-
<li><a href="/about">About Us</a></li>
|
213
|
-
<li><a href="/blog">Blog</a></li>
|
214
|
-
|
215
|
-
<!-- <li class="link-biography"><a href="http://squaremill.com/#biography">People</a></li> -->
|
216
|
-
</ul>
|
217
|
-
</div>
|
218
|
-
|
219
|
-
</section>
|
220
|
-
|
221
|
-
<p class="copyright">
|
222
|
-
© 2014 Square Mill Labs, LLC. All rights reserved.
|
223
|
-
</p>
|
224
|
-
</footer>
|
225
|
-
|
226
|
-
<script type="text/javascript" src="http://code.jquery.com/jquery-2.0.0.js"></script>
|
227
|
-
<script type="text/javascript" src="http://code.jquery.com/jquery-migrate-1.1.1.js"></script>
|
228
|
-
<script src="/assets/mn_application-82f6787dca307be34ec0c9fa6b7ba7d4.js"></script>
|
229
|
-
<script>
|
230
|
-
$(document).ready(function() {
|
231
|
-
|
232
|
-
var controller = $.superscrollorama({
|
233
|
-
triggerAtCenter: true,
|
234
|
-
playoutAnimations: true
|
235
|
-
});
|
236
|
-
|
237
|
-
if ( $(window).width() >= 767 ) {
|
238
|
-
controller.addTween('#project-7',
|
239
|
-
TweenMax.from($('#project-7'), .7, {
|
240
|
-
css:{"opacity":"0"},
|
241
|
-
onComplete: function(){
|
242
|
-
$('#project-7').toggleClass('active-in')
|
243
|
-
}
|
244
|
-
}),
|
245
|
-
300, // duration of scroll in pixel units
|
246
|
-
-100, // scroll offset (from center of viewport)
|
247
|
-
true
|
248
|
-
);
|
249
|
-
controller.addTween('#project-1',
|
250
|
-
TweenMax.from($('#project-1'), .7, {
|
251
|
-
css:{"opacity":"0"},
|
252
|
-
onComplete: function(){
|
253
|
-
$('#project-1').toggleClass('active-in')
|
254
|
-
}
|
255
|
-
}),
|
256
|
-
300, // duration of scroll in pixel units
|
257
|
-
-100, // scroll offset (from center of viewport)
|
258
|
-
true
|
259
|
-
);
|
260
|
-
controller.addTween('#project-2',
|
261
|
-
TweenMax.from($('#project-2'), .7, {
|
262
|
-
css:{"opacity":"0"},
|
263
|
-
onComplete: function(){
|
264
|
-
$('#project-2').toggleClass('active-in')
|
265
|
-
}
|
266
|
-
}),
|
267
|
-
300, // duration of scroll in pixel units
|
268
|
-
-100, // scroll offset (from center of viewport)
|
269
|
-
true
|
270
|
-
);
|
271
|
-
controller.addTween('#project-3',
|
272
|
-
TweenMax.from($('#project-3'), .7, {
|
273
|
-
css:{"opacity":"0"},
|
274
|
-
onComplete: function(){
|
275
|
-
$('#project-3').toggleClass('active-in')
|
276
|
-
}
|
277
|
-
}),
|
278
|
-
300, // duration of scroll in pixel units
|
279
|
-
-100, // scroll offset (from center of viewport)
|
280
|
-
true
|
281
|
-
);
|
282
|
-
}
|
283
|
-
|
284
|
-
});
|
285
|
-
</script>
|
286
|
-
|
287
|
-
</body>
|
288
|
-
</html>
|