webpage-archivist 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ # 0.0.2
2
+
3
+ - replace websnap + wkhtmltoimage with PhantomJS
4
+ - replace mini_magick with custom code
5
+ - WebpageArchivist#fetch_webpages now takes pages instead of ids
6
+
@@ -11,8 +11,8 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
11
11
 
12
12
  * An SQL database supported by Sequel[http://sequel.rubyforge.org/]
13
13
  * Git[http://git-scm.com/]
14
- * ImageMagick[http://www.imagemagick.org/script/index.php]
15
- * wkhtmltoimage[http://code.google.com/p/wkhtmltopdf/] (work but not so well on Mac OS, prefer Linux for real usage)
14
+ * GraphicsMagick[http://www.graphicsmagick.org/]
15
+ * PhantomJS[http://code.google.com/p/phantomjs]
16
16
 
17
17
  = Installation
18
18
 
@@ -40,7 +40,8 @@ Basic configuration is done through environment variables:
40
40
  * +ARCHIVIST_ASSETS_PATH+ : path to store the assets, default to +./archivist_assets+
41
41
  * +ARCHIVIST_SNAPSHOTS_PATH+ : path to store the thumbnail, default to +./archivist_snapshots+
42
42
  * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of elements requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), default to 20
43
- * +IMAGE_MAGICK_PATH+ : path to ImageMagick executables if they aren't in the path
43
+ * +PHANTOMJS_PATH+ : path to the PhantomJS executable if it isn't in the path
44
+ * +GRAPHICS_MAGICK_PATH+ : path to the directory containing the GraphicsMagick executable (+gm+) if it isn't in the path
44
45
 
45
46
  Configuration for snapshoting is done through the WebpageArchivist::Snapshoter class.
46
47
 
@@ -6,16 +6,16 @@ module WebpageArchivist::Fetcher
6
6
 
7
7
  SEMAPHORE = Mutex.new
8
8
 
9
- # Fetch several webpages, return an hash indexed by the ids holding the corresponding Instances or http result codes
9
+ # Fetch several webpages, return a hash indexed by the webpages holding the corresponding Instances or http result codes
10
10
  # (may be existing instances if the pages haven't changed)
11
- def self.fetch_webpages ids
12
- if ids.empty?
11
+ def self.fetch_webpages webpages
12
+ if webpages.empty?
13
13
  []
14
14
  else
15
15
  SEMAPHORE.synchronize do
16
16
  @fetcher_watcher = FetcherWatcher.new
17
17
  EventMachine.run do
18
- WebpageArchivist::Webpage.filter(:id => ids).each do |webpage|
18
+ webpages.each do |webpage|
19
19
  @fetcher_watcher.add_request WebpageRequest.new(webpage, @fetcher_watcher)
20
20
  end
21
21
  @fetcher_watcher.wait
@@ -23,7 +23,7 @@ module WebpageArchivist::Fetcher
23
23
 
24
24
  result = {}
25
25
  @fetcher_watcher.requests.each do |webpage_request|
26
- result[webpage_request.webpage.id] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
26
+ result[webpage_request.webpage] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
27
27
  end
28
28
  result
29
29
  end
@@ -1,7 +1,6 @@
1
1
  # Contains monkey patches
2
2
  require 'addressable/uri'
3
3
  require 'grit'
4
- require 'mini_magick'
5
4
 
6
5
  module Addressable
7
6
 
@@ -37,20 +36,6 @@ class Grit::Repo
37
36
 
38
37
  end
39
38
 
40
- # Patching mini magic so we can specify the path to the image magick installation
41
- module MiniMagick
42
-
43
- class CommandBuilder
44
-
45
- alias :parent_command :command
46
-
47
- def command
48
- "#{ENV['IMAGE_MAGICK_PATH'] ? "#{ENV['IMAGE_MAGICK_PATH']}/" : ''}#{parent_command}"
49
- end
50
-
51
- end
52
- end
53
-
54
39
  class Dir
55
40
 
56
41
  # Create a dir if it does not exist
@@ -0,0 +1,22 @@
1
+ var page = new WebPage(),
2
+ address, output, size;
3
+
4
+ if (phantom.args.length != 4) {
5
+ console.log('Usage: rasterize.js URL filename width height');
6
+ phantom.exit();
7
+ } else {
8
+ address = phantom.args[0];
9
+ output = phantom.args[1];
10
+ page.viewportSize = { width: phantom.args[2], height: phantom.args[3]};
11
+ page.clipRect = { top: 0, left: 0, width: phantom.args[2], height: phantom.args[3]};
12
+ page.open(address, function (status) {
13
+ if (status !== 'success') {
14
+ console.log('Unable to load the address!');
15
+ } else {
16
+ window.setTimeout(function () {
17
+ page.render(output);
18
+ phantom.exit();
19
+ }, 200);
20
+ }
21
+ });
22
+ }
@@ -1,17 +1,19 @@
1
- require 'websnap'
2
- require 'mini_magick'
3
-
4
1
  module WebpageArchivist
5
2
 
6
3
  # Snapshot the pages and create thumbnails
7
4
  class Snapshoter
8
5
 
9
6
  SNAPSHOTS_PATH = File.expand_path(ENV['ARCHIVIST_SNAPSHOTS_PATH'] || './archivist_snapshots')
7
+
8
+ GRAPHICS_MAGICK_PATH = "#{ENV['GRAPHICS_MAGICK_PATH'] ? "#{ENV['GRAPHICS_MAGICK_PATH']}/" : ''}gm"
9
+
10
+ PHANTOMJS_PATH = ENV['PHANTOMJS_PATH'] || 'phantomjs'
11
+
10
12
  p "Archivist snapshots path is [#{SNAPSHOTS_PATH}]"
11
13
  Dir.mkdir_if_not_exist SNAPSHOTS_PATH
12
14
 
13
15
  class << self
14
- attr_accessor :width, :height, :format, :thumbnail_scale, :thumbnail_crop_width, :thumbnail_crop_height
16
+ attr_accessor :width, :height, :format, :thumbnail_scale, :thumbnail_crop_width, :thumbnail_crop_height, :quality
15
17
  end
16
18
 
17
19
  Snapshoter.width = 1024
@@ -36,8 +38,9 @@ module WebpageArchivist
36
38
 
37
39
  snapshot_path = File.join(dir_path, "#{instance.id}.#{Snapshoter.format}")
38
40
  thumbnail_path = File.join(dir_path, "#{instance.id.to_s}-small.#{Snapshoter.format}")
39
- snapshot File.new(instance.webpage.index_path), snapshot_path, thumbnail_path
40
- instance.update(:snapshot => true)
41
+ if snapshot(instance.webpage.index_path, snapshot_path, thumbnail_path)
42
+ instance.update(:snapshot => true)
43
+ end
41
44
  end
42
45
 
43
46
  # Create a snapshot of a web page
@@ -45,15 +48,25 @@ module WebpageArchivist
45
48
  # snapshot_path:: path to the snapshot file
46
49
  # thumbnail_path:: path to the thumbnail (can be nil for no thumbnail)
47
50
  def self.snapshot uri_or_file, snapshot_path, thumbnail_path = nil
48
- ::WebpageArchivist.debug "Snapshot for [#{uri_or_file.kind_of?(File) ? uri_or_file.path : uri_or_file}] on [#{snapshot_path}]" if ::WebpageArchivist.log
51
+ ::WebpageArchivist.debug "Snapshot for [#{uri_or_file}] on [#{snapshot_path}]" if ::WebpageArchivist.log
49
52
 
50
53
  if File.exists? snapshot_path
51
54
  File.delete snapshot_path
52
55
  end
53
56
 
54
- snapper = WebSnap::Snapper.new(uri_or_file)
55
- snapper.options.clear.merge!({'--height' => Snapshoter.height, '--width' => Snapshoter.width, '--format' => Snapshoter.format})
56
- snapper.to_file(snapshot_path)
57
+ # if the result is not a png we use an intermediate image as PhantomJS makes crappy jpeg images
58
+ intermediate_image = (Snapshoter.format != 'png')
59
+ real_snapshot_path = intermediate_image ? "#{snapshot_path[0..(-(File.extname(snapshot_path).length + 1))]}.png" : snapshot_path
60
+
61
+ `#{PHANTOMJS_PATH} #{File.dirname(__FILE__)}/rasterize.js '#{uri_or_file}' #{real_snapshot_path} #{Snapshoter.width} #{Snapshoter.height}`
62
+
63
+ unless File.exists?(real_snapshot_path)
64
+ return false
65
+ end
66
+
67
+ if intermediate_image
68
+ `#{GRAPHICS_MAGICK_PATH} convert -background white #{real_snapshot_path} #{snapshot_path}`
69
+ end
57
70
 
58
71
  if thumbnail_path
59
72
  ::WebpageArchivist.debug "Thumbnail of [#{snapshot_path}] on [#{thumbnail_path}]" if ::WebpageArchivist.log
@@ -62,14 +75,14 @@ module WebpageArchivist
62
75
  File.delete thumbnail_path
63
76
  end
64
77
 
65
- img = MiniMagick::Image::open(snapshot_path)
66
- img.combine_options do |c|
67
- c.crop "#{Snapshoter.thumbnail_crop_width}x#{Snapshoter.thumbnail_crop_height}+0+0"
68
- c.scale "#{Snapshoter.thumbnail_scale}%"
69
- end
70
- img.write thumbnail_path
78
+ `#{GRAPHICS_MAGICK_PATH} convert -background white -scale #{Snapshoter.thumbnail_scale}% -crop #{Snapshoter.thumbnail_crop_width}x#{Snapshoter.thumbnail_crop_height}+0+0 #{real_snapshot_path} #{thumbnail_path}`
79
+ end
80
+
81
+ if intermediate_image
82
+ File.delete real_snapshot_path
71
83
  end
72
84
 
85
+ return true
73
86
  end
74
87
 
75
88
  end
@@ -4,15 +4,6 @@ require 'iconv'
4
4
 
5
5
  module CssParser
6
6
 
7
- # Replace original implementation with one using addressable
8
- def self.convert_uris(css, base_uri)
9
- return css.gsub(URI_RX) do
10
- uri = $1.to_s
11
- uri.gsub!(/["']+/, '')
12
- "url('#{base_uri.absolutize(uri)}')"
13
- end
14
- end
15
-
16
7
  # Make the declaration accessible
17
8
  class RuleSet
18
9
 
@@ -87,6 +78,7 @@ module WebpageArchivist
87
78
  unless @expanded
88
79
  parser.each_rule_set do |rs|
89
80
  rs.expand_background_shorthand!
81
+ rs.expand_list_style_shorthand!
90
82
  end
91
83
  @expanded = true
92
84
  end
@@ -1,3 +1,3 @@
1
1
  module WebpageArchivist
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -14,9 +14,14 @@ module WebpageArchivist
14
14
  Webpage.create(:name => name, :uri => uri)
15
15
  end
16
16
 
17
- # Fetch several webpages, return an hash indexed by the ids holding the corresponding instances or http result codes
18
- def fetch_webpages ids
19
- Fetcher.fetch_webpages ids
17
+ # Fetch all webpages
18
+ def fetch_all
19
+ Fetcher.fetch_webpages list_webpages
20
+ end
21
+
22
+ # Fetch several webpages, return a hash indexed by the webpages holding the corresponding instances or http result codes
23
+ def fetch_webpages webpages
24
+ Fetcher.fetch_webpages webpages
20
25
  end
21
26
 
22
27
  # List the webpages
@@ -24,12 +29,6 @@ module WebpageArchivist
24
29
  Webpage.all
25
30
  end
26
31
 
27
- # List the instances of a webpage
28
- # id:: the webpage id
29
- def list_instances webpage_id
30
- Instance.where(:webpage_id => webpage_id)
31
- end
32
-
33
32
  # Write the full content of a webpage instance into a zip file
34
33
  # id:: the instance id
35
34
  # file:: the file to write to
@@ -23,10 +23,8 @@ Gem::Specification.new do |s|
23
23
  s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
24
24
  s.add_runtime_dependency 'nokogiri', '~> 1.5'
25
25
  s.add_runtime_dependency 'addressable', '~> 2.2.6'
26
- s.add_runtime_dependency 'css_parser', '~> 1.1.9'
26
+ s.add_runtime_dependency 'css_parser', '~> 1.2.3'
27
27
  s.add_runtime_dependency 'grit', '~> 2.4.1'
28
- s.add_runtime_dependency 'websnap', '~> 0.1.3'
29
- s.add_runtime_dependency 'mini_magick', '~> 3.3'
30
28
  s.add_runtime_dependency 'mime-types', '~> 1.16'
31
29
 
32
30
  s.add_development_dependency 'sqlite3', '~> 1.3.3'
metadata CHANGED
@@ -1,224 +1,135 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: webpage-archivist
3
- version: !ruby/object:Gem::Version
4
- hash: 29
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 1
10
- version: 0.0.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Julien Kirch
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-08-10 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
12
+ date: 2011-09-08 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
21
15
  name: andand
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: &2160802540 !ruby/object:Gem::Requirement
24
17
  none: false
25
- requirements:
18
+ requirements:
26
19
  - - ~>
27
- - !ruby/object:Gem::Version
28
- hash: 25
29
- segments:
30
- - 1
31
- - 3
32
- - 1
20
+ - !ruby/object:Gem::Version
33
21
  version: 1.3.1
34
22
  type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: sequel
38
23
  prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: *2160802540
25
+ - !ruby/object:Gem::Dependency
26
+ name: sequel
27
+ requirement: &2160802040 !ruby/object:Gem::Requirement
40
28
  none: false
41
- requirements:
29
+ requirements:
42
30
  - - ~>
43
- - !ruby/object:Gem::Version
44
- hash: 53
45
- segments:
46
- - 3
47
- - 25
48
- version: "3.25"
31
+ - !ruby/object:Gem::Version
32
+ version: '3.25'
49
33
  type: :runtime
50
- version_requirements: *id002
51
- - !ruby/object:Gem::Dependency
52
- name: eventmachine
53
34
  prerelease: false
54
- requirement: &id003 !ruby/object:Gem::Requirement
35
+ version_requirements: *2160802040
36
+ - !ruby/object:Gem::Dependency
37
+ name: eventmachine
38
+ requirement: &2160801580 !ruby/object:Gem::Requirement
55
39
  none: false
56
- requirements:
40
+ requirements:
57
41
  - - ~>
58
- - !ruby/object:Gem::Version
59
- hash: 62196357
60
- segments:
61
- - 1
62
- - 0
63
- - 0
64
- - beta
65
- - 3
42
+ - !ruby/object:Gem::Version
66
43
  version: 1.0.0.beta.3
67
44
  type: :runtime
68
- version_requirements: *id003
69
- - !ruby/object:Gem::Dependency
70
- name: em-http-request
71
45
  prerelease: false
72
- requirement: &id004 !ruby/object:Gem::Requirement
46
+ version_requirements: *2160801580
47
+ - !ruby/object:Gem::Dependency
48
+ name: em-http-request
49
+ requirement: &2160801120 !ruby/object:Gem::Requirement
73
50
  none: false
74
- requirements:
51
+ requirements:
75
52
  - - ~>
76
- - !ruby/object:Gem::Version
77
- hash: 62196363
78
- segments:
79
- - 1
80
- - 0
81
- - 0
82
- - beta
83
- - 4
53
+ - !ruby/object:Gem::Version
84
54
  version: 1.0.0.beta.4
85
55
  type: :runtime
86
- version_requirements: *id004
87
- - !ruby/object:Gem::Dependency
88
- name: nokogiri
89
56
  prerelease: false
90
- requirement: &id005 !ruby/object:Gem::Requirement
57
+ version_requirements: *2160801120
58
+ - !ruby/object:Gem::Dependency
59
+ name: nokogiri
60
+ requirement: &2160800660 !ruby/object:Gem::Requirement
91
61
  none: false
92
- requirements:
62
+ requirements:
93
63
  - - ~>
94
- - !ruby/object:Gem::Version
95
- hash: 5
96
- segments:
97
- - 1
98
- - 5
99
- version: "1.5"
64
+ - !ruby/object:Gem::Version
65
+ version: '1.5'
100
66
  type: :runtime
101
- version_requirements: *id005
102
- - !ruby/object:Gem::Dependency
103
- name: addressable
104
67
  prerelease: false
105
- requirement: &id006 !ruby/object:Gem::Requirement
68
+ version_requirements: *2160800660
69
+ - !ruby/object:Gem::Dependency
70
+ name: addressable
71
+ requirement: &2160800200 !ruby/object:Gem::Requirement
106
72
  none: false
107
- requirements:
73
+ requirements:
108
74
  - - ~>
109
- - !ruby/object:Gem::Version
110
- hash: 11
111
- segments:
112
- - 2
113
- - 2
114
- - 6
75
+ - !ruby/object:Gem::Version
115
76
  version: 2.2.6
116
77
  type: :runtime
117
- version_requirements: *id006
118
- - !ruby/object:Gem::Dependency
119
- name: css_parser
120
78
  prerelease: false
121
- requirement: &id007 !ruby/object:Gem::Requirement
79
+ version_requirements: *2160800200
80
+ - !ruby/object:Gem::Dependency
81
+ name: css_parser
82
+ requirement: &2160799740 !ruby/object:Gem::Requirement
122
83
  none: false
123
- requirements:
84
+ requirements:
124
85
  - - ~>
125
- - !ruby/object:Gem::Version
126
- hash: 1
127
- segments:
128
- - 1
129
- - 1
130
- - 9
131
- version: 1.1.9
86
+ - !ruby/object:Gem::Version
87
+ version: 1.2.3
132
88
  type: :runtime
133
- version_requirements: *id007
134
- - !ruby/object:Gem::Dependency
135
- name: grit
136
89
  prerelease: false
137
- requirement: &id008 !ruby/object:Gem::Requirement
90
+ version_requirements: *2160799740
91
+ - !ruby/object:Gem::Dependency
92
+ name: grit
93
+ requirement: &2160799280 !ruby/object:Gem::Requirement
138
94
  none: false
139
- requirements:
95
+ requirements:
140
96
  - - ~>
141
- - !ruby/object:Gem::Version
142
- hash: 29
143
- segments:
144
- - 2
145
- - 4
146
- - 1
97
+ - !ruby/object:Gem::Version
147
98
  version: 2.4.1
148
99
  type: :runtime
149
- version_requirements: *id008
150
- - !ruby/object:Gem::Dependency
151
- name: websnap
152
100
  prerelease: false
153
- requirement: &id009 !ruby/object:Gem::Requirement
154
- none: false
155
- requirements:
156
- - - ~>
157
- - !ruby/object:Gem::Version
158
- hash: 29
159
- segments:
160
- - 0
161
- - 1
162
- - 3
163
- version: 0.1.3
164
- type: :runtime
165
- version_requirements: *id009
166
- - !ruby/object:Gem::Dependency
167
- name: mini_magick
168
- prerelease: false
169
- requirement: &id010 !ruby/object:Gem::Requirement
170
- none: false
171
- requirements:
172
- - - ~>
173
- - !ruby/object:Gem::Version
174
- hash: 1
175
- segments:
176
- - 3
177
- - 3
178
- version: "3.3"
179
- type: :runtime
180
- version_requirements: *id010
181
- - !ruby/object:Gem::Dependency
101
+ version_requirements: *2160799280
102
+ - !ruby/object:Gem::Dependency
182
103
  name: mime-types
183
- prerelease: false
184
- requirement: &id011 !ruby/object:Gem::Requirement
104
+ requirement: &2160798820 !ruby/object:Gem::Requirement
185
105
  none: false
186
- requirements:
106
+ requirements:
187
107
  - - ~>
188
- - !ruby/object:Gem::Version
189
- hash: 47
190
- segments:
191
- - 1
192
- - 16
193
- version: "1.16"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.16'
194
110
  type: :runtime
195
- version_requirements: *id011
196
- - !ruby/object:Gem::Dependency
197
- name: sqlite3
198
111
  prerelease: false
199
- requirement: &id012 !ruby/object:Gem::Requirement
112
+ version_requirements: *2160798820
113
+ - !ruby/object:Gem::Dependency
114
+ name: sqlite3
115
+ requirement: &2160798360 !ruby/object:Gem::Requirement
200
116
  none: false
201
- requirements:
117
+ requirements:
202
118
  - - ~>
203
- - !ruby/object:Gem::Version
204
- hash: 29
205
- segments:
206
- - 1
207
- - 3
208
- - 3
119
+ - !ruby/object:Gem::Version
209
120
  version: 1.3.3
210
121
  type: :development
211
- version_requirements: *id012
122
+ prerelease: false
123
+ version_requirements: *2160798360
212
124
  description: An utility to archive webpages through time
213
125
  email:
214
126
  executables: []
215
-
216
127
  extensions: []
217
-
218
- extra_rdoc_files:
128
+ extra_rdoc_files:
219
129
  - README.rdoc
220
- files:
130
+ files:
221
131
  - .gitignore
132
+ - CHANGELOG.md
222
133
  - Gemfile
223
134
  - README.rdoc
224
135
  - Rakefile
@@ -234,6 +145,7 @@ files:
234
145
  - lib/webpage-archivist/migrations.rb
235
146
  - lib/webpage-archivist/models.rb
236
147
  - lib/webpage-archivist/patches.rb
148
+ - lib/webpage-archivist/rasterize.js
237
149
  - lib/webpage-archivist/snapshoter.rb
238
150
  - lib/webpage-archivist/stylesheet_document.rb
239
151
  - lib/webpage-archivist/version.rb
@@ -245,39 +157,31 @@ files:
245
157
  - webpage-archivist.gemspec
246
158
  homepage: https://github.com/archiloque/webpage-archivist
247
159
  licenses: []
248
-
249
160
  post_install_message:
250
- rdoc_options:
161
+ rdoc_options:
251
162
  - --main
252
163
  - README.rdoc
253
- require_paths:
164
+ require_paths:
254
165
  - lib
255
- required_ruby_version: !ruby/object:Gem::Requirement
166
+ required_ruby_version: !ruby/object:Gem::Requirement
256
167
  none: false
257
- requirements:
258
- - - ">="
259
- - !ruby/object:Gem::Version
260
- hash: 3
261
- segments:
262
- - 0
263
- version: "0"
264
- required_rubygems_version: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ! '>='
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ required_rubygems_version: !ruby/object:Gem::Requirement
265
173
  none: false
266
- requirements:
267
- - - ">="
268
- - !ruby/object:Gem::Version
269
- hash: 3
270
- segments:
271
- - 0
272
- version: "0"
174
+ requirements:
175
+ - - ! '>='
176
+ - !ruby/object:Gem::Version
177
+ version: '0'
273
178
  requirements: []
274
-
275
179
  rubyforge_project: webpage-archivist
276
- rubygems_version: 1.8.5
180
+ rubygems_version: 1.8.8
277
181
  signing_key:
278
182
  specification_version: 3
279
183
  summary: An utility to archive webpages through time
280
- test_files:
184
+ test_files:
281
185
  - test/crud_test.rb
282
186
  - test/files/stylesheet.css
283
187
  - test/helper.rb