staticizer 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: feebd7a95ad16f567cc466fdea5fd1a79b08c141
4
- data.tar.gz: 93ddf8a175711ccc60ea989b77f2e62d679f2ef0
3
+ metadata.gz: 8809f12f85e54642f56afc7fc537c5d20ab64db1
4
+ data.tar.gz: 12746fd0c57dc04ee204b67d635e0033b21cc96c
5
5
  SHA512:
6
- metadata.gz: f474de0b20080124bb67ea2aa34a186451cd8c53bd3a959a0a47fca49b214b21e8f61e5ce84cb95cac0f32a99c05ec777a0fcf1473223d2c487aed7609ef8bd7
7
- data.tar.gz: d34e17b145c7e5b80710a29fa6d39c8abcbdddf27e40dd4ee81b7f4ddb33125eb26f2a5d467d16e85c10ab4607b0500d21b8c33021fe8e7dbeddbc6ca568f6a9
6
+ metadata.gz: 81f7e9035328e62306c3e31abe6a8969e8b876c944af2589cecf1a3e43879469e26374a1fd143fa15b9e370fe6b8f2fc9916459a2c64f6fe4a34fa186f8aceb5
7
+ data.tar.gz: 44e25794a99861646943913ae3b530657febafa835652b86975dcfc39f2071a643c137df85338dd2a53907f0494daa44f10d028429a4a3b9516cf739db3eb158
data/README.md CHANGED
@@ -9,13 +9,14 @@ website. If the website goes down this backup would be available
9
9
  with reduced functionality.
10
10
 
11
11
  S3 and Route 53 provide a great way to host a static emergency backup for a website.
12
- See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
13
- . In our experience it works very well and is incredibly cheap at less than US$1 a month (depending on the size of the website).
14
-
15
- We tried using exsisting tools httrack/wget to crawl and create a static version
16
- of the site to upload to S3, but we found that they did not work well with S3 hosting.
17
- We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
18
- the existing site. This way when the site goes down incoming links from Google search
12
+ See this article - http://aws.typepad.com/aws/2013/02/create-a-backup-website-using-route-53-dns-failover-and-s3-website-hosting.html
13
+ . In our experience it works well and is incredibly cheap. Our average sized website
14
+ with a few hundred pages and assets costs less than US$1 a month.
15
+
16
+ We tried using existing tools httrack/wget to crawl and create a static version
17
+ of the site to upload to S3, but we found that they did not work well with S3 hosting.
18
+ We wanted the site uploaded to S3 to respond to the *exact* same URLs (where possible) as
19
+ the existing site. This way when the site goes down incoming links from Google search
19
20
  results etc. will still work.
20
21
 
21
22
  ## TODO
@@ -87,6 +88,21 @@ This will only crawl urls in the domain squaremill.com
87
88
  s = Staticizer::Crawler.new("http://squaremill.com", :output_dir => "/tmp/crawl")
88
89
  s.crawl
89
90
 
91
+
92
+ ### Crawl a website and make all pages contain 'noindex' meta tag
93
+
94
+ s = Staticizer::Crawler.new("http://squaremill.com",
95
+ :output_dir => "/tmp/crawl",
96
+ :process_body => lambda {|body, uri, opts|
97
+ # not the best regex, but it will do for our use
98
+ body = body.gsub(/<meta\s+name=['"]robots[^>]+>/i,'')
99
+ body = body.gsub(/<head>/i,"<head>\n<meta name='robots' content='noindex'>")
100
+ body
101
+ }
102
+ )
103
+ s.crawl
104
+
105
+
90
106
  ### Crawl a website and rewrite all non www urls to www
91
107
 
92
108
  s = Staticizer::Crawler.new("http://squaremill.com",
@@ -55,7 +55,7 @@ module Staticizer
55
55
  end
56
56
  end
57
57
 
58
- begin
58
+ begin
59
59
  parser.parse!(args)
60
60
  initial_page = ARGV.pop
61
61
  raise ArgumentError, "Need to specify an initial URL to start the crawl" unless initial_page
@@ -32,6 +32,11 @@ module Staticizer
32
32
  uri = URI.parse(initial_page)
33
33
  @opts[:valid_domains] ||= [uri.host]
34
34
  end
35
+
36
+ if @opts[:process_body]
37
+ @process_body = @opts[:process_body]
38
+ end
39
+
35
40
  add_url(initial_page)
36
41
  end
37
42
 
@@ -132,6 +137,7 @@ module Staticizer
132
137
  end
133
138
 
134
139
  body = response.respond_to?(:read_body) ? response.read_body : response
140
+ body = process_body(body, uri, {})
135
141
  outfile = File.join(current, "/#{filename}")
136
142
  if filename == ""
137
143
  indexfile = File.join(outfile, "/index.html")
@@ -158,9 +164,11 @@ module Staticizer
158
164
  opts[:content_type] = response['content-type'] rescue "text/html"
159
165
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
160
166
  if response.respond_to?(:read_body)
161
- @s3_bucket.objects[key].write(response.read_body, opts)
167
+ body = process_body(response.read_body, uri, opts)
168
+ @s3_bucket.objects[key].write(body, opts)
162
169
  else
163
- @s3_bucket.objects[key].write(response, opts)
170
+ body = process_body(response, uri, opts)
171
+ @s3_bucket.objects[key].write(body, opts)
164
172
  end
165
173
  end
166
174
 
@@ -189,6 +197,13 @@ module Staticizer
189
197
  save_page(body, url)
190
198
  end
191
199
 
200
+ def process_body(body, uri, opts)
201
+ if @process_body
202
+ body = @process_body.call(body, uri, opts)
203
+ end
204
+ body
205
+ end
206
+
192
207
  # Fetch a URI and save it to disk
193
208
  def process_url(url, info)
194
209
  @http_connections ||= {}
@@ -1,3 +1,3 @@
1
1
  module Staticizer
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,83 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: staticizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Conor Hunt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-03 00:00:00.000000000 Z
11
+ date: 2015-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: webmock
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: aws-sdk
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description: A tool to create a static version of a website for hosting on S3. Can
@@ -89,7 +89,7 @@ executables:
89
89
  extensions: []
90
90
  extra_rdoc_files: []
91
91
  files:
92
- - .gitignore
92
+ - ".gitignore"
93
93
  - Gemfile
94
94
  - LICENSE.txt
95
95
  - README.md
@@ -112,17 +112,17 @@ require_paths:
112
112
  - lib
113
113
  required_ruby_version: !ruby/object:Gem::Requirement
114
114
  requirements:
115
- - - '>='
115
+ - - ">="
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  requirements:
120
- - - '>='
120
+ - - ">="
121
121
  - !ruby/object:Gem::Version
122
122
  version: '0'
123
123
  requirements: []
124
124
  rubyforge_project:
125
- rubygems_version: 2.1.9
125
+ rubygems_version: 2.4.5
126
126
  signing_key:
127
127
  specification_version: 4
128
128
  summary: A tool to create a static version of a website for hosting on S3.