staticizer 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -0
- data/lib/staticizer/command.rb +4 -0
- data/lib/staticizer/crawler.rb +29 -18
- data/lib/staticizer/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2ac92b2be780663c4828b685c270eccdee9106d
|
4
|
+
data.tar.gz: 2e31e7b3f701f431ea11e4cbb9a8296ec51475de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d019bef5884ce685a5def86bc6840edcc95879062b01a114ceeacf8b126fb4a3a24c7ba58cef3f674c7198a422a62b397fb57687a86b55843d0b1fa3f02745a8
|
7
|
+
data.tar.gz: fa25013152b03b1b83c3a1347604699575951efee4dfe669c7b7be04236c75a1473258beb2ca1bd935163a6a47524b3217efd5d157f7b17779a3200cacb6d696
|
data/README.md
CHANGED
@@ -75,6 +75,8 @@ This will only crawl urls in the domain squaremill.com
|
|
75
75
|
|
76
76
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
77
77
|
:aws => {
|
78
|
+
:region => "us-west-1",
|
79
|
+
:endpoint => "http://s3.amazonaws.com",
|
78
80
|
:bucket_name => "www.squaremill.com",
|
79
81
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
80
82
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -106,6 +108,8 @@ This will only crawl urls in the domain squaremill.com
|
|
106
108
|
|
107
109
|
s = Staticizer::Crawler.new("http://squaremill.com",
|
108
110
|
:aws => {
|
111
|
+
:region => "us-west-1",
|
112
|
+
:endpoint => "http://s3.amazonaws.com",
|
109
113
|
:bucket_name => "www.squaremill.com",
|
110
114
|
:secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
|
111
115
|
:access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
|
@@ -130,6 +134,7 @@ This will only crawl urls in the domain squaremill.com
|
|
130
134
|
* :log_level - Log level - defaults to INFO.
|
131
135
|
* :valid_domains - Array of domains that should be crawled. Domains not in this list will be ignored.
|
132
136
|
* :process_body - lambda called to pre-process body of content before writing it out.
|
137
|
+
* :skip_write - don't write retrieved files to disk or s3, just crawl the site (can be used to find 404s etc.)
|
133
138
|
|
134
139
|
## Contributing
|
135
140
|
|
data/lib/staticizer/command.rb
CHANGED
@@ -44,6 +44,10 @@ module Staticizer
|
|
44
44
|
options[:logger] = Logger.new(v)
|
45
45
|
end
|
46
46
|
|
47
|
+
opts.on("--skip-write [PATH]", "Don't write out files to disk or s3") do |v|
|
48
|
+
options[:skip_write] = true
|
49
|
+
end
|
50
|
+
|
47
51
|
opts.on("--valid-domains x,y,z", Array, "Comma separated list of domains that should be crawled, other domains will be ignored") do |v|
|
48
52
|
options[:valid_domains] = v
|
49
53
|
end
|
data/lib/staticizer/crawler.rb
CHANGED
@@ -23,9 +23,8 @@ module Staticizer
|
|
23
23
|
|
24
24
|
if @opts[:aws]
|
25
25
|
bucket_name = @opts[:aws].delete(:bucket_name)
|
26
|
-
|
27
|
-
@s3_bucket =
|
28
|
-
@s3_bucket.acl = :public_read
|
26
|
+
Aws.config.update(opts[:aws])
|
27
|
+
@s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
|
29
28
|
end
|
30
29
|
|
31
30
|
if @opts[:valid_domains].nil?
|
@@ -86,6 +85,7 @@ module Staticizer
|
|
86
85
|
URI::join(base_uri, href).to_s
|
87
86
|
rescue StandardError => e
|
88
87
|
@log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
|
88
|
+
return nil
|
89
89
|
end
|
90
90
|
|
91
91
|
def add_url(url, info = {})
|
@@ -103,6 +103,7 @@ module Staticizer
|
|
103
103
|
end
|
104
104
|
|
105
105
|
def save_page(response, uri)
|
106
|
+
return if @opts[:skip_write]
|
106
107
|
if @opts[:aws]
|
107
108
|
save_page_to_aws(response, uri)
|
108
109
|
else
|
@@ -160,20 +161,23 @@ module Staticizer
|
|
160
161
|
key = key.gsub(%r{^/},"")
|
161
162
|
key = "index.html" if key == ""
|
162
163
|
# Upload this file directly to AWS::S3
|
163
|
-
opts = {:acl =>
|
164
|
+
opts = {:acl => "public-read"}
|
164
165
|
opts[:content_type] = response['content-type'] rescue "text/html"
|
165
166
|
@log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
|
166
167
|
if response.respond_to?(:read_body)
|
167
168
|
body = process_body(response.read_body, uri, opts)
|
168
|
-
@s3_bucket.
|
169
|
+
@s3_bucket.object(key).put(opts.merge(body: body))
|
169
170
|
else
|
170
171
|
body = process_body(response, uri, opts)
|
171
|
-
@s3_bucket.
|
172
|
+
@s3_bucket.object(key).put(opts.merge(body: body))
|
172
173
|
end
|
173
174
|
end
|
174
175
|
|
175
176
|
def process_success(response, parsed_uri)
|
176
177
|
url = parsed_uri.to_s
|
178
|
+
if @opts[:filter_process]
|
179
|
+
return if @opts[:filter_process].call(response, parsed_uri)
|
180
|
+
end
|
177
181
|
case response['content-type']
|
178
182
|
when /css/
|
179
183
|
save_page(response, parsed_uri)
|
@@ -211,28 +215,35 @@ module Staticizer
|
|
211
215
|
parsed_uri = URI(url)
|
212
216
|
|
213
217
|
@log.debug "Fetching #{parsed_uri}"
|
214
|
-
|
218
|
+
|
215
219
|
# Attempt to use an already open Net::HTTP connection
|
216
220
|
key = parsed_uri.host + parsed_uri.port.to_s
|
217
221
|
connection = @http_connections[key]
|
218
222
|
if connection.nil?
|
219
223
|
connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
|
224
|
+
connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
|
220
225
|
@http_connections[key] = connection
|
221
226
|
end
|
222
227
|
|
223
228
|
request = Net::HTTP::Get.new(parsed_uri.request_uri)
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
229
|
+
begin
|
230
|
+
connection.request(request) do |response|
|
231
|
+
case response
|
232
|
+
when Net::HTTPSuccess
|
233
|
+
process_success(response, parsed_uri)
|
234
|
+
when Net::HTTPRedirection
|
235
|
+
redirect_url = response['location']
|
236
|
+
@log.debug "Processing redirect to #{redirect_url}"
|
237
|
+
process_redirect(parsed_uri, redirect_url)
|
238
|
+
add_url(redirect_url)
|
239
|
+
else
|
240
|
+
@log.error "Error #{response.code}:#{response.message} fetching url #{url}"
|
241
|
+
end
|
235
242
|
end
|
243
|
+
rescue OpenSSL::SSL::SSLError => e
|
244
|
+
@log.error "SSL Error #{e.message} fetching url #{url}"
|
245
|
+
rescue Errno::ECONNRESET => e
|
246
|
+
@log.error "Error #{e.class}:#{e.message} fetching url #{url}"
|
236
247
|
end
|
237
248
|
end
|
238
249
|
|
data/lib/staticizer/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: staticizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Conor Hunt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-06-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,7 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
122
|
version: '0'
|
123
123
|
requirements: []
|
124
124
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
125
|
+
rubygems_version: 2.4.8
|
126
126
|
signing_key:
|
127
127
|
specification_version: 4
|
128
128
|
summary: A tool to create a static version of a website for hosting on S3.
|