staticizer 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -0
- data/lib/staticizer/command.rb +4 -0
- data/lib/staticizer/crawler.rb +29 -18
- data/lib/staticizer/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2ac92b2be780663c4828b685c270eccdee9106d
|
4
|
+
data.tar.gz: 2e31e7b3f701f431ea11e4cbb9a8296ec51475de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d019bef5884ce685a5def86bc6840edcc95879062b01a114ceeacf8b126fb4a3a24c7ba58cef3f674c7198a422a62b397fb57687a86b55843d0b1fa3f02745a8
|
7
|
+
data.tar.gz: fa25013152b03b1b83c3a1347604699575951efee4dfe669c7b7be04236c75a1473258beb2ca1bd935163a6a47524b3217efd5d157f7b17779a3200cacb6d696
|
data/README.md
CHANGED
@@ -75,6 +75,8 @@ This will only crawl urls in the domain squaremill.com
|
|
75
75
|
|
76
76
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
77
77
|
:aws => {
|
78
|
+
:region => "us-west-1",
|
79
|
+
:endpoint => "http://s3.amazonaws.com",
|
78
80
|
:bucket_name => "www.squaremill.com",
|
79
81
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
80
82
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -106,6 +108,8 @@ This will only crawl urls in the domain squaremill.com
|
|
106
108
|
|
107
109
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
108
110
|
:aws => {
|
111
|
+
:region => "us-west-1",
|
112
|
+
:endpoint => "http://s3.amazonaws.com",
|
109
113
|
:bucket_name => "www.squaremill.com",
|
110
114
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
111
115
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -130,6 +134,7 @@ This will only crawl urls in the domain squaremill.com
|
|
130
134
|
* :log_level - Log level - defaults to INFO.
|
131
135
|
* :valid_domains - Array of domains that should be crawled. Domains not in this list will be ignored.
|
132
136
|
* :process_body - lambda called to pre-process body of content before writing it out.
|
137
|
+
* :skip_write - don't write retrieved files to disk or s3, just crawl the site (can be used to find 404s etc.)
|
133
138
|
|
134
139
|
## Contributing
|
135
140
|
|
data/lib/staticizer/command.rb
CHANGED
@@ -44,6 +44,10 @@ module Staticizer
|
|
44
44
|
options[:logger] = Logger.new(v)
|
45
45
|
end
|
46
46
|
|
47
|
+
opts.on("--skip-write [PATH]", "Don't write out files to disk or s3") do |v|
|
48
|
+
options[:skip_write] = true
|
49
|
+
end
|
50
|
+
|
47
51
|
opts.on("--valid-domains x,y,z", Array, "Comma separated list of domains that should be crawled, other domains will be ignored") do |v|
|
48
52
|
options[:valid_domains] = v
|
49
53
|
end
|
data/lib/staticizer/crawler.rb
CHANGED
@@ -23,9 +23,8 @@ module Staticizer
|
|
23
23
|
|
24
24
|
if @opts[:aws]
|
25
25
|
bucket_name = @opts[:aws].delete(:bucket_name)
|
26
|
-
|
27
|
-
@s3_bucket =
|
28
|
-
@s3_bucket.acl = :public_read
|
26
|
+
Aws.config.update(opts[:aws])
|
27
|
+
@s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
|
29
28
|
end
|
30
29
|
|
31
30
|
if @opts[:valid_domains].nil?
|
@@ -86,6 +85,7 @@ module Staticizer
|
|
86
85
|
URI::join(base_uri, href).to_s
|
87
86
|
rescue StandardError => e
|
88
87
|
@log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
|
88
|
+
return nil
|
89
89
|
end
|
90
90
|
|
91
91
|
def add_url(url, info = {})
|
@@ -103,6 +103,7 @@ module Staticizer
|
|
103
103
|
end
|
104
104
|
|
105
105
|
def save_page(response, uri)
|
106
|
+
return if @opts[:skip_write]
|
106
107
|
if @opts[:aws]
|
107
108
|
save_page_to_aws(response, uri)
|
108
109
|
else
|
@@ -160,20 +161,23 @@ module Staticizer
|
|
160
161
|
key = key.gsub(%r{^/},"")
|
161
162
|
key = "index.html" if key == ""
|
162
163
|
# Upload this file directly to AWS::S3
|
163
|
-
opts = {:acl =>
|
164
|
+
opts = {:acl => "public-read"}
|
164
165
|
opts[:content_type] = response['content-type'] rescue "text/html"
|
165
166
|
@log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
|
166
167
|
if response.respond_to?(:read_body)
|
167
168
|
body = process_body(response.read_body, uri, opts)
|
168
|
-
@s3_bucket.
|
169
|
+
@s3_bucket.object(key).put(opts.merge(body: body))
|
169
170
|
else
|
170
171
|
body = process_body(response, uri, opts)
|
171
|
-
@s3_bucket.
|
172
|
+
@s3_bucket.object(key).put(opts.merge(body: body))
|
172
173
|
end
|
173
174
|
end
|
174
175
|
|
175
176
|
def process_success(response, parsed_uri)
|
176
177
|
url = parsed_uri.to_s
|
178
|
+
if @opts[:filter_process]
|
179
|
+
return if @opts[:filter_process].call(response, parsed_uri)
|
180
|
+
end
|
177
181
|
case response['content-type']
|
178
182
|
when /css/
|
179
183
|
save_page(response, parsed_uri)
|
@@ -211,28 +215,35 @@ module Staticizer
|
|
211
215
|
parsed_uri = URI(url)
|
212
216
|
|
213
217
|
@log.debug "Fetching #{parsed_uri}"
|
214
|
-
|
218
|
+
|
215
219
|
# Attempt to use an already open Net::HTTP connection
|
216
220
|
key = parsed_uri.host + parsed_uri.port.to_s
|
217
221
|
connection = @http_connections[key]
|
218
222
|
if connection.nil?
|
219
223
|
connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
224
|
+
connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
|
220
225
|
@http_connections[key] = connection
|
221
226
|
end
|
222
227
|
|
223
228
|
request = Net::HTTP::Get.new(parsed_uri.request_uri)
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
229
|
+
begin
|
230
|
+
connection.request(request) do |response|
|
231
|
+
case response
|
232
|
+
when Net::HTTPSuccess
|
233
|
+
process_success(response, parsed_uri)
|
234
|
+
when Net::HTTPRedirection
|
235
|
+
redirect_url = response['location']
|
236
|
+
@log.debug "Processing redirect to #{redirect_url}"
|
237
|
+
process_redirect(parsed_uri, redirect_url)
|
238
|
+
add_url(redirect_url)
|
239
|
+
else
|
240
|
+
@log.error "Error #{response.code}:#{response.message} fetching url #{url}"
|
241
|
+
end
|
235
242
|
end
|
243
|
+
rescue OpenSSL::SSL::SSLError => e
|
244
|
+
@log.error "SSL Error #{e.message} fetching url #{url}"
|
245
|
+
rescue Errno::ECONNRESET => e
|
246
|
+
@log.error "Error #{e.class}:#{e.message} fetching url #{url}"
|
236
247
|
end
|
237
248
|
end
|
238
249
|
|
data/lib/staticizer/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: staticizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Conor Hunt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,7 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
122
|
version: '0'
|
123
123
|
requirements: []
|
124
124
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
125
|
+
rubygems_version: 2.4.8
|
126
126
|
signing_key:
|
127
127
|
specification_version: 4
|
128
128
|
summary: A tool to create a static version of a website for hosting on S3.
|