staticizer 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: feebd7a95ad16f567cc466fdea5fd1a79b08c141
4
- data.tar.gz: 93ddf8a175711ccc60ea989b77f2e62d679f2ef0
3
+ metadata.gz: 8809f12f85e54642f56afc7fc537c5d20ab64db1
4
+ data.tar.gz: 12746fd0c57dc04ee204b67d635e0033b21cc96c
5
5
  SHA512:
6
- metadata.gz: f474de0b20080124bb67ea2aa34a186451cd8c53bd3a959a0a47fca49b214b21e8f61e5ce84cb95cac0f32a99c05ec777a0fcf1473223d2c487aed7609ef8bd7
7
- data.tar.gz: d34e17b145c7e5b80710a29fa6d39c8abcbdddf27e40dd4ee81b7f4ddb33125eb26f2a5d467d16e85c10ab4607b0500d21b8c33021fe8e7dbeddbc6ca568f6a9
6
+ metadata.gz: 81f7e9035328e62306c3e31abe6a8969e8b876c944af2589cecf1a3e43879469e26374a1fd143fa15b9e370fe6b8f2fc9916459a2c64f6fe4a34fa186f8aceb5
7
+ data.tar.gz: 44e25794a99861646943913ae3b530657febafa835652b86975dcfc39f2071a643c137df85338dd2a53907f0494daa44f10d028429a4a3b9516cf739db3eb158
data/README.md CHANGED
@@ -9,13 +9,14 @@ website. If the website goes down this backup would be available
9
9
  with reduced functionality.
10
10
 
11
11
  S3 and Route 53 provide a great way to host a static emergency backup for a website.
12
- See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
13
- . In our experience it works very well and is incredibly cheap at less than US$1 a month (depending on the size of the website).
14
-
15
- We tried using exsisting tools httrack/wget to crawl and create a static version
16
- of the site to upload to S3, but we found that they did not work well with S3 hosting.
17
- We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
18
- the existing site. This way when the site goes down incoming links from Google search
12
+ See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
13
+ . In our experience it works well and is incredibly cheap. Our average sized website
14
+ with a few hundred pages and assets is less than US$1 a month.
15
+
16
+ We tried using existing tools httrack/wget to crawl and create a static version
17
+ of the site to upload to S3, but we found that they did not work well with S3 hosting.
18
+ We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
19
+ the existing site. This way when the site goes down incoming links from Google search
19
20
  results etc. will still work.
20
21
 
21
22
  ## TODO
@@ -87,6 +88,21 @@ This will only crawl urls in the domain squaremill.com
87
88
  s = Staticizer::Crawler.new("http://squaremill.com", :output_dir => "/tmp/crawl")
88
89
  s.crawl
89
90
 
91
+
92
+ ### Crawl a website and make all pages contain 'noindex' meta tag
93
+
94
+ s = Staticizer::Crawler.new("http://squaremill.com",
95
+ :output_dir => "/tmp/crawl",
96
+ :process_body => lambda {|body, uri, opts|
97
+ # not the best regex, but it will do for our use
98
+ body = body.gsub(/<meta\s+name=['"]robots[^>]+>/i,'')
99
+ body = body.gsub(/<head>/i,"<head>\n<meta name='robots' content='noindex'>")
100
+ body
101
+ }
102
+ )
103
+ s.crawl
104
+
105
+
90
106
  ### Crawl a website and rewrite all non www urls to www
91
107
 
92
108
  s = Staticizer::Crawler.new("http://squaremill.com",
@@ -55,7 +55,7 @@ module Staticizer
55
55
  end
56
56
  end
57
57
 
58
- begin
58
+ begin
59
59
  parser.parse!(args)
60
60
  initial_page = ARGV.pop
61
61
  raise ArgumentError, "Need to specify an initial URL to start the crawl" unless initial_page
@@ -32,6 +32,11 @@ module Staticizer
32
32
  uri = URI.parse(initial_page)
33
33
  @opts[:valid_domains] ||= [uri.host]
34
34
  end
35
+
36
+ if @opts[:process_body]
37
+ @process_body = @opts[:process_body]
38
+ end
39
+
35
40
  add_url(initial_page)
36
41
  end
37
42
 
@@ -132,6 +137,7 @@ module Staticizer
132
137
  end
133
138
 
134
139
  body = response.respond_to?(:read_body) ? response.read_body : response
140
+ body = process_body(body, uri, {})
135
141
  outfile = File.join(current, "/#{filename}")
136
142
  if filename == ""
137
143
  indexfile = File.join(outfile, "/index.html")
@@ -158,9 +164,11 @@ module Staticizer
158
164
  opts[:content_type] = response['content-type'] rescue "text/html"
159
165
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
160
166
  if response.respond_to?(:read_body)
161
- @s3_bucket.objects[key].write(response.read_body, opts)
167
+ body = process_body(response.read_body, uri, opts)
168
+ @s3_bucket.objects[key].write(body, opts)
162
169
  else
163
- @s3_bucket.objects[key].write(response, opts)
170
+ body = process_body(response, uri, opts)
171
+ @s3_bucket.objects[key].write(body, opts)
164
172
  end
165
173
  end
166
174
 
@@ -189,6 +197,13 @@ module Staticizer
189
197
  save_page(body, url)
190
198
  end
191
199
 
200
+ def process_body(body, uri, opts)
201
+ if @process_body
202
+ body = @process_body.call(body, uri, opts)
203
+ end
204
+ body
205
+ end
206
+
192
207
  # Fetch a URI and save it to disk
193
208
  def process_url(url, info)
194
209
  @http_connections ||= {}
@@ -1,3 +1,3 @@
1
1
  module Staticizer
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,83 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: staticizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Conor Hunt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-03 00:00:00.000000000 Z
11
+ date: 2015-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: webmock
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: aws-sdk
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description: A tool to create a static version of a website for hosting on S3. Can
@@ -89,7 +89,7 @@ executables:
89
89
  extensions: []
90
90
  extra_rdoc_files: []
91
91
  files:
92
- - .gitignore
92
+ - ".gitignore"
93
93
  - Gemfile
94
94
  - LICENSE.txt
95
95
  - README.md
@@ -112,17 +112,17 @@ require_paths:
112
112
  - lib
113
113
  required_ruby_version: !ruby/object:Gem::Requirement
114
114
  requirements:
115
- - - '>='
115
+ - - ">="
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  requirements:
120
- - - '>='
120
+ - - ">="
121
121
  - !ruby/object:Gem::Version
122
122
  version: '0'
123
123
  requirements: []
124
124
  rubyforge_project:
125
- rubygems_version: 2.1.9
125
+ rubygems_version: 2.4.5
126
126
  signing_key:
127
127
  specification_version: 4
128
128
  summary: A tool to create a static version of a website for hosting on S3.