staticizer 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 281ee17c4f67f579faa5d1762f9677203696fd0b
4
- data.tar.gz: f82db0909cd61819e3f19a43e17eb3ed552111f8
3
+ metadata.gz: f2ac92b2be780663c4828b685c270eccdee9106d
4
+ data.tar.gz: 2e31e7b3f701f431ea11e4cbb9a8296ec51475de
5
5
  SHA512:
6
- metadata.gz: bc5a3463838030e46d0353bd173154ff37b92ae36294d561b4e37270b2ee4dd252b4f6e7aff36cb428b0cc1c917b02fb2ec1abbc1ce7bba85679aed428af68cb
7
- data.tar.gz: 3267e1e063ff7c17c6797c7072b35456fa105f0c9af0dfc905c42e9394f7faacb2615b2829019923a9c2de91bec90a499c64b24e28e43ff3ba542f860511ea27
6
+ metadata.gz: d019bef5884ce685a5def86bc6840edcc95879062b01a114ceeacf8b126fb4a3a24c7ba58cef3f674c7198a422a62b397fb57687a86b55843d0b1fa3f02745a8
7
+ data.tar.gz: fa25013152b03b1b83c3a1347604699575951efee4dfe669c7b7be04236c75a1473258beb2ca1bd935163a6a47524b3217efd5d157f7b17779a3200cacb6d696
data/README.md CHANGED
@@ -75,6 +75,8 @@ This will only crawl urls in the domain squaremill.com
75
75
 
76
76
  s = Staticizer::Crawler.new("http://squaremill.com",
77
77
  :aws => {
78
+ :region => "us-west-1",
79
+ :endpoint => "http://s3.amazonaws.com",
78
80
  :bucket_name => "www.squaremill.com",
79
81
  :secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
80
82
  :access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
@@ -106,6 +108,8 @@ This will only crawl urls in the domain squaremill.com
106
108
 
107
109
  s = Staticizer::Crawler.new("http://squaremill.com",
108
110
  :aws => {
111
+ :region => "us-west-1",
112
+ :endpoint => "http://s3.amazonaws.com",
109
113
  :bucket_name => "www.squaremill.com",
110
114
  :secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
111
115
  :access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
@@ -130,6 +134,7 @@ This will only crawl urls in the domain squaremill.com
130
134
  * :log_level - Log level - defaults to INFO.
131
135
  * :valid_domains - Array of domains that should be crawled. Domains not in this list will be ignored.
132
136
  * :process_body - lambda called to pre-process body of content before writing it out.
137
+ * :skip_write - don't write retrieved files to disk or S3; just crawl the site (useful for finding 404s, etc.).
133
138
 
134
139
  ## Contributing
135
140
 
@@ -44,6 +44,10 @@ module Staticizer
44
44
  options[:logger] = Logger.new(v)
45
45
  end
46
46
 
47
+ opts.on("--skip-write [PATH]", "Don't write out files to disk or s3") do |v|
48
+ options[:skip_write] = true
49
+ end
50
+
47
51
  opts.on("--valid-domains x,y,z", Array, "Comma separated list of domains that should be crawled, other domains will be ignored") do |v|
48
52
  options[:valid_domains] = v
49
53
  end
@@ -23,9 +23,8 @@ module Staticizer
23
23
 
24
24
  if @opts[:aws]
25
25
  bucket_name = @opts[:aws].delete(:bucket_name)
26
- AWS.config(opts[:aws])
27
- @s3_bucket = AWS::S3.new.buckets[bucket_name]
28
- @s3_bucket.acl = :public_read
26
+ Aws.config.update(opts[:aws])
27
+ @s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
29
28
  end
30
29
 
31
30
  if @opts[:valid_domains].nil?
@@ -86,6 +85,7 @@ module Staticizer
86
85
  URI::join(base_uri, href).to_s
87
86
  rescue StandardError => e
88
87
  @log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
88
+ return nil
89
89
  end
90
90
 
91
91
  def add_url(url, info = {})
@@ -103,6 +103,7 @@ module Staticizer
103
103
  end
104
104
 
105
105
  def save_page(response, uri)
106
+ return if @opts[:skip_write]
106
107
  if @opts[:aws]
107
108
  save_page_to_aws(response, uri)
108
109
  else
@@ -160,20 +161,23 @@ module Staticizer
160
161
  key = key.gsub(%r{^/},"")
161
162
  key = "index.html" if key == ""
162
163
  # Upload this file directly to AWS::S3
163
- opts = {:acl => :public_read}
164
+ opts = {:acl => "public-read"}
164
165
  opts[:content_type] = response['content-type'] rescue "text/html"
165
166
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
166
167
  if response.respond_to?(:read_body)
167
168
  body = process_body(response.read_body, uri, opts)
168
- @s3_bucket.objects[key].write(body, opts)
169
+ @s3_bucket.object(key).put(opts.merge(body: body))
169
170
  else
170
171
  body = process_body(response, uri, opts)
171
- @s3_bucket.objects[key].write(body, opts)
172
+ @s3_bucket.object(key).put(opts.merge(body: body))
172
173
  end
173
174
  end
174
175
 
175
176
  def process_success(response, parsed_uri)
176
177
  url = parsed_uri.to_s
178
+ if @opts[:filter_process]
179
+ return if @opts[:filter_process].call(response, parsed_uri)
180
+ end
177
181
  case response['content-type']
178
182
  when /css/
179
183
  save_page(response, parsed_uri)
@@ -211,28 +215,35 @@ module Staticizer
211
215
  parsed_uri = URI(url)
212
216
 
213
217
  @log.debug "Fetching #{parsed_uri}"
214
-
218
+
215
219
  # Attempt to use an already open Net::HTTP connection
216
220
  key = parsed_uri.host + parsed_uri.port.to_s
217
221
  connection = @http_connections[key]
218
222
  if connection.nil?
219
223
  connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
224
+ connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
220
225
  @http_connections[key] = connection
221
226
  end
222
227
 
223
228
  request = Net::HTTP::Get.new(parsed_uri.request_uri)
224
- connection.request(request) do |response|
225
- case response
226
- when Net::HTTPSuccess
227
- process_success(response, parsed_uri)
228
- when Net::HTTPRedirection
229
- redirect_url = response['location']
230
- @log.debug "Processing redirect to #{redirect_url}"
231
- process_redirect(parsed_uri, redirect_url)
232
- add_url(redirect_url)
233
- else
234
- @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
229
+ begin
230
+ connection.request(request) do |response|
231
+ case response
232
+ when Net::HTTPSuccess
233
+ process_success(response, parsed_uri)
234
+ when Net::HTTPRedirection
235
+ redirect_url = response['location']
236
+ @log.debug "Processing redirect to #{redirect_url}"
237
+ process_redirect(parsed_uri, redirect_url)
238
+ add_url(redirect_url)
239
+ else
240
+ @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
241
+ end
235
242
  end
243
+ rescue OpenSSL::SSL::SSLError => e
244
+ @log.error "SSL Error #{e.message} fetching url #{url}"
245
+ rescue Errno::ECONNRESET => e
246
+ @log.error "Error #{e.class}:#{e.message} fetching url #{url}"
236
247
  end
237
248
  end
238
249
 
@@ -1,3 +1,3 @@
1
1
  module Staticizer
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: staticizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Conor Hunt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-21 00:00:00.000000000 Z
11
+ date: 2016-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,7 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  version: '0'
123
123
  requirements: []
124
124
  rubyforge_project:
125
- rubygems_version: 2.2.2
125
+ rubygems_version: 2.4.8
126
126
  signing_key:
127
127
  specification_version: 4
128
128
  summary: A tool to create a static version of a website for hosting on S3.