staticizer 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 281ee17c4f67f579faa5d1762f9677203696fd0b
4
- data.tar.gz: f82db0909cd61819e3f19a43e17eb3ed552111f8
3
+ metadata.gz: f2ac92b2be780663c4828b685c270eccdee9106d
4
+ data.tar.gz: 2e31e7b3f701f431ea11e4cbb9a8296ec51475de
5
5
  SHA512:
6
- metadata.gz: bc5a3463838030e46d0353bd173154ff37b92ae36294d561b4e37270b2ee4dd252b4f6e7aff36cb428b0cc1c917b02fb2ec1abbc1ce7bba85679aed428af68cb
7
- data.tar.gz: 3267e1e063ff7c17c6797c7072b35456fa105f0c9af0dfc905c42e9394f7faacb2615b2829019923a9c2de91bec90a499c64b24e28e43ff3ba542f860511ea27
6
+ metadata.gz: d019bef5884ce685a5def86bc6840edcc95879062b01a114ceeacf8b126fb4a3a24c7ba58cef3f674c7198a422a62b397fb57687a86b55843d0b1fa3f02745a8
7
+ data.tar.gz: fa25013152b03b1b83c3a1347604699575951efee4dfe669c7b7be04236c75a1473258beb2ca1bd935163a6a47524b3217efd5d157f7b17779a3200cacb6d696
data/README.md CHANGED
@@ -75,6 +75,8 @@ This will only crawl urls in the domain squaremill.com
75
75
 
76
76
  s = Staticizer::Crawler.new("http://squaremill.com",
77
77
  :aws => {
78
+ :region => "us-west-1",
79
+ :endpoint => "http://s3.amazonaws.com",
78
80
  :bucket_name => "www.squaremill.com",
79
81
  :secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
80
82
  :access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
@@ -106,6 +108,8 @@ This will only crawl urls in the domain squaremill.com
106
108
 
107
109
  s = Staticizer::Crawler.new("http://squaremill.com",
108
110
  :aws => {
111
+ :region => "us-west-1",
112
+ :endpoint => "http://s3.amazonaws.com",
109
113
  :bucket_name => "www.squaremill.com",
110
114
  :secret_access_key => "HIA7T189234aADfFAdf322Vs12duRhOHy+23mc1+s",
111
115
  :access_key_id => "HJFJS5gSJHMDZDFFSSDQQ"
@@ -130,6 +134,7 @@ This will only crawl urls in the domain squaremill.com
130
134
  * :log_level - Log level - defaults to INFO.
131
135
  * :valid_domains - Array of domains that should be crawled. Domains not in this list will be ignored.
132
136
  * :process_body - lambda called to pre-process body of content before writing it out.
137
+ * :skip_write - Don't write retrieved files to disk or S3; just crawl the site (can be used to find 404s, etc.)
133
138
 
134
139
  ## Contributing
135
140
 
@@ -44,6 +44,10 @@ module Staticizer
44
44
  options[:logger] = Logger.new(v)
45
45
  end
46
46
 
47
+ opts.on("--skip-write [PATH]", "Don't write out files to disk or s3") do |v|
48
+ options[:skip_write] = true
49
+ end
50
+
47
51
  opts.on("--valid-domains x,y,z", Array, "Comma separated list of domains that should be crawled, other domains will be ignored") do |v|
48
52
  options[:valid_domains] = v
49
53
  end
@@ -23,9 +23,8 @@ module Staticizer
23
23
 
24
24
  if @opts[:aws]
25
25
  bucket_name = @opts[:aws].delete(:bucket_name)
26
- AWS.config(opts[:aws])
27
- @s3_bucket = AWS::S3.new.buckets[bucket_name]
28
- @s3_bucket.acl = :public_read
26
+ Aws.config.update(opts[:aws])
27
+ @s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
29
28
  end
30
29
 
31
30
  if @opts[:valid_domains].nil?
@@ -86,6 +85,7 @@ module Staticizer
86
85
  URI::join(base_uri, href).to_s
87
86
  rescue StandardError => e
88
87
  @log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
88
+ return nil
89
89
  end
90
90
 
91
91
  def add_url(url, info = {})
@@ -103,6 +103,7 @@ module Staticizer
103
103
  end
104
104
 
105
105
  def save_page(response, uri)
106
+ return if @opts[:skip_write]
106
107
  if @opts[:aws]
107
108
  save_page_to_aws(response, uri)
108
109
  else
@@ -160,20 +161,23 @@ module Staticizer
160
161
  key = key.gsub(%r{^/},"")
161
162
  key = "index.html" if key == ""
162
163
  # Upload this file directly to AWS::S3
163
- opts = {:acl => :public_read}
164
+ opts = {:acl => "public-read"}
164
165
  opts[:content_type] = response['content-type'] rescue "text/html"
165
166
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
166
167
  if response.respond_to?(:read_body)
167
168
  body = process_body(response.read_body, uri, opts)
168
- @s3_bucket.objects[key].write(body, opts)
169
+ @s3_bucket.object(key).put(opts.merge(body: body))
169
170
  else
170
171
  body = process_body(response, uri, opts)
171
- @s3_bucket.objects[key].write(body, opts)
172
+ @s3_bucket.object(key).put(opts.merge(body: body))
172
173
  end
173
174
  end
174
175
 
175
176
  def process_success(response, parsed_uri)
176
177
  url = parsed_uri.to_s
178
+ if @opts[:filter_process]
179
+ return if @opts[:filter_process].call(response, parsed_uri)
180
+ end
177
181
  case response['content-type']
178
182
  when /css/
179
183
  save_page(response, parsed_uri)
@@ -211,28 +215,35 @@ module Staticizer
211
215
  parsed_uri = URI(url)
212
216
 
213
217
  @log.debug "Fetching #{parsed_uri}"
214
-
218
+
215
219
  # Attempt to use an already open Net::HTTP connection
216
220
  key = parsed_uri.host + parsed_uri.port.to_s
217
221
  connection = @http_connections[key]
218
222
  if connection.nil?
219
223
  connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
224
+ connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
220
225
  @http_connections[key] = connection
221
226
  end
222
227
 
223
228
  request = Net::HTTP::Get.new(parsed_uri.request_uri)
224
- connection.request(request) do |response|
225
- case response
226
- when Net::HTTPSuccess
227
- process_success(response, parsed_uri)
228
- when Net::HTTPRedirection
229
- redirect_url = response['location']
230
- @log.debug "Processing redirect to #{redirect_url}"
231
- process_redirect(parsed_uri, redirect_url)
232
- add_url(redirect_url)
233
- else
234
- @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
229
+ begin
230
+ connection.request(request) do |response|
231
+ case response
232
+ when Net::HTTPSuccess
233
+ process_success(response, parsed_uri)
234
+ when Net::HTTPRedirection
235
+ redirect_url = response['location']
236
+ @log.debug "Processing redirect to #{redirect_url}"
237
+ process_redirect(parsed_uri, redirect_url)
238
+ add_url(redirect_url)
239
+ else
240
+ @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
241
+ end
235
242
  end
243
+ rescue OpenSSL::SSL::SSLError => e
244
+ @log.error "SSL Error #{e.message} fetching url #{url}"
245
+ rescue Errno::ECONNRESET => e
246
+ @log.error "Error #{e.class}:#{e.message} fetching url #{url}"
236
247
  end
237
248
  end
238
249
 
@@ -1,3 +1,3 @@
1
1
  module Staticizer
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: staticizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Conor Hunt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-21 00:00:00.000000000 Z
11
+ date: 2016-06-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,7 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  version: '0'
123
123
  requirements: []
124
124
  rubyforge_project:
125
- rubygems_version: 2.2.2
125
+ rubygems_version: 2.4.8
126
126
  signing_key:
127
127
  specification_version: 4
128
128
  summary: A tool to create a static version of a website for hosting on S3.