webpage-archivist 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ module WebpageArchivist
2
+ VERSION = "0.0.1" # Gem version string; the gemspec reads it as WebpageArchivist::VERSION
3
+ end
@@ -0,0 +1,79 @@
1
+ require 'logger'
2
+
3
+ module WebpageArchivist
4
+
5
+ # Entry point for the Web Archivist features.
6
+ # Database configuration relies on the DATABASE_URL environment variable (the name set by test/helper.rb)
7
+ # see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html for the syntax details
8
+ class WebpageArchivist
9
+
10
+ # Add a webpage for future fetching, return the corresponding Webpage created through Webpage.create
11
+ # uri:: page uri
12
+ # name:: page name
13
+ def add_webpage uri, name
14
+ Webpage.create(:name => name, :uri => uri)
15
+ end
16
+
17
+ # Fetch several webpages, return a hash indexed by the ids holding the corresponding instances or http result codes
18
+ def fetch_webpages ids # ids:: passed as-is to Fetcher.fetch_webpages
19
+ Fetcher.fetch_webpages ids
20
+ end
21
+
22
+ # List all the webpages (Webpage.all)
23
+ def list_webpages
24
+ Webpage.all
25
+ end
26
+
27
+ # List the instances of a webpage (Instance rows filtered on webpage_id)
28
+ # webpage_id:: the webpage id
29
+ def list_instances webpage_id
30
+ Instance.where(:webpage_id => webpage_id)
31
+ end
32
+
33
+ # Write the full content of a webpage instance into a zip file (delegates to Extracter.instance_content)
34
+ # id:: the instance id
35
+ # file:: the file to write to
36
+ def extract_instance_content id, file
37
+ Extracter.instance_content id, file
38
+ end
39
+
40
+ # Purge cached elements (stylesheets, scripts, images) from the database, they are *not* deleted from the disk
41
+ # retention_period:: age in days; rows whose last_fetched is older than this are deleted
42
+ def purge_cache retention_period
43
+ purge_starting_date = DateTime.now - retention_period # DateTime minus a number subtracts days; NOTE(review): 'date' is not required in this file, presumably loaded elsewhere - confirm
44
+ Stylesheet.filter('last_fetched < ?', purge_starting_date).delete
45
+ Script.filter('last_fetched < ?', purge_starting_date).delete
46
+ Image.filter('last_fetched < ?', purge_starting_date).delete
47
+ end
48
+
49
+ # Create a snapshot of a web page by delegating to Snapshoter.snapshot
50
+ # See Snapshoter class for configuration
51
+ # instance:: passed through to Snapshoter.snapshot
52
+ # thumbnail:: passed through to Snapshoter.snapshot
53
+ # NOTE(review): this doc previously listed uri, snapshot_path and thumbnail_path (nil for no thumbnail), which do not match the signature; confirm what Snapshoter expects
54
+ def snapshot instance, thumbnail
55
+ Snapshoter.snapshot instance, thumbnail
56
+ end
57
+
58
+ end
59
+
60
+ @@log = false # flag exposed through log / log=; debug and error below do not consult it
61
+ @@logger = ::Logger.new(STDOUT) # logger shared by debug and error, writing to standard output
62
+
63
+ def self.log
64
+ @@log
65
+ end
66
+
67
+ def self.log= value
68
+ @@log = value
69
+ end
70
+
71
+ def self.debug(str)
72
+ @@logger.debug { str } # message handed to the logger in block form
73
+ end
74
+
75
+ def self.error(str)
76
+ @@logger.error { str } # message handed to the logger in block form
77
+ end
78
+
79
+ end
data/test/crud_test.rb ADDED
@@ -0,0 +1,28 @@
1
+ require_relative 'helper'
2
+
3
+ describe 'crud' do
4
+
5
+ it 'has no webpage by default' do
6
+ WebpageArchivist::DATABASE.transaction do # assertions run inside a transaction that is rolled back below
7
+
8
+ WebpageArchivist::Webpage.count.must_equal 0
9
+ @@archivist.list_webpages.count.must_equal 0 # @@archivist is assigned in test/helper.rb
10
+
11
+ raise(Sequel::Rollback) # roll the transaction back so the test leaves the database unchanged
12
+ end
13
+ end
14
+
15
+ it 'can add a webpage' do
16
+ WebpageArchivist::DATABASE.transaction do # assertions run inside a transaction that is rolled back below
17
+
18
+ webpage = @@archivist.add_webpage 'http://example.com', 'example' # arguments are (uri, name)
19
+ webpage.must_be_instance_of WebpageArchivist::Webpage
20
+ webpage.name.must_equal 'example'
21
+ webpage.uri.must_equal 'http://example.com'
22
+ @@archivist.list_webpages.count.must_equal 1
23
+
24
+ raise(Sequel::Rollback) # discard the webpage created above
25
+ end
26
+ end
27
+
28
+ end
@@ -0,0 +1,14 @@
1
+ @import url("relative.css");
2
+ @import url("http://absolute.net/stylesheet.css");
3
+
4
+ #something1 {
5
+ background-image: url("relative1.jpg")
6
+ }
7
+
8
+ #something2 {
9
+ background:#ffffff url("relative2.jpg") no-repeat right top;
10
+ }
11
+
12
+ #something3 {
13
+ background-image: url("http://absolute.net/absolute.jpg")
14
+ }
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ database_path = "sqlite://#{Dir.pwd}/webpage-archivist-test.sqlite3"
2
+
3
+ if File.exist? database_path
4
+ File.delete database_path
5
+ end
6
+
7
+ ENV['DATABASE_URL'] = database_path
8
+
9
+ require 'bundler'
10
+ Bundler.setup
11
+ require 'test/unit'
12
+ require 'minitest/spec'
13
+
14
+ require_relative '../lib/webpage-archivist'
15
+ @@archivist = WebpageArchivist::WebpageArchivist.new
@@ -0,0 +1,48 @@
1
+ require_relative 'helper'
2
+
3
+ describe 'stylesheet' do
4
+
5
+ def stylesheet_file_path
6
+ current_file = File.expand_path(File.dirname(__FILE__)) # despite the name, this is the directory containing this test file
7
+ File.join(current_file, 'files', 'stylesheet.css')
8
+ end
9
+
10
+ def base_url
11
+ Addressable::URI.parse('http://my.example.com')
12
+ end
13
+
14
+ def parse_stylesheet
15
+ WebpageArchivist::StylesheetDocument.new(IO.read(stylesheet_file_path), base_url) # built from the fixture content and the base url
16
+ end
17
+
18
+ it 'lists imports' do
19
+ imports = []
20
+ stylesheet = parse_stylesheet
21
+ stylesheet.each_import do |i|
22
+ imports << i
23
+ nil # NOTE(review): returning nil presumably leaves the url untouched, as the to_css checks below expect - confirm against StylesheetDocument
24
+ end
25
+ imports.length.must_equal 2
26
+ imports[0].must_equal 'relative.css'
27
+ imports[1].must_equal 'http://absolute.net/stylesheet.css'
28
+ stylesheet.to_css.must_include '@import url("relative.css");'
29
+ stylesheet.to_css.must_include '@import url("http://absolute.net/stylesheet.css");'
30
+ end
31
+
32
+ it 'list images' do
33
+ images = []
34
+ stylesheet = parse_stylesheet
35
+ stylesheet.each_image do |image|
36
+ images << image
37
+ nil # NOTE(review): returning nil presumably leaves the url untouched - confirm against StylesheetDocument
38
+ end
39
+ images.length.must_equal 3
40
+ images[0].must_equal 'relative1.jpg'
41
+ images[1].must_equal 'relative2.jpg'
42
+ images[2].must_equal 'http://absolute.net/absolute.jpg'
43
+ stylesheet.to_css.must_include 'url("relative1.jpg");' # NOTE(review): the fixture declaration has no trailing ';', so this relies on to_css adding one - confirm
44
+ stylesheet.to_css.must_include 'url("relative2.jpg");' # NOTE(review): in the fixture this url is followed by ' no-repeat right top;', so this only matches if to_css rewrites the declaration - confirm
45
+ stylesheet.to_css.must_include 'url("http://absolute.net/absolute.jpg");' # NOTE(review): same remark, the fixture has no trailing ';' here
46
+ end
47
+
48
+ end
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "webpage-archivist/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = 'webpage-archivist'
7
+ s.version = WebpageArchivist::VERSION # defined in webpage-archivist/version, required above
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Julien Kirch"]
10
+ s.homepage = 'https://github.com/archiloque/webpage-archivist'
11
+ s.summary = 'An utility to archive webpages through time'
12
+ s.description = s.summary # the description reuses the summary text
13
+
14
+ s.rubyforge_project = s.name # NOTE(review): rubyforge_project is deprecated in newer RubyGems - confirm before rebuilding
15
+
16
+ s.has_rdoc = true # NOTE(review): has_rdoc is deprecated in newer RubyGems - confirm before rebuilding
17
+ s.extra_rdoc_files = ['README.rdoc']
18
+ s.rdoc_options = ['--main', 'README.rdoc']
19
+
20
+ s.add_runtime_dependency 'andand', '~> 1.3.1'
21
+ s.add_runtime_dependency 'sequel', '~> 3.25'
22
+ s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
23
+ s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
24
+ s.add_runtime_dependency 'nokogiri', '~> 1.5'
25
+ s.add_runtime_dependency 'addressable', '~> 2.2.6'
26
+ s.add_runtime_dependency 'css_parser', '~> 1.1.9'
27
+ s.add_runtime_dependency 'grit', '~> 2.4.1'
28
+ s.add_runtime_dependency 'websnap', '~> 0.1.3'
29
+ s.add_runtime_dependency 'mini_magick', '~> 3.3'
30
+ s.add_runtime_dependency 'mime-types', '~> 1.16'
31
+
32
+ s.add_development_dependency 'sqlite3', '~> 1.3.3' # only needed for the test database
33
+
34
+ s.files = `git ls-files`.split("\n") # file lists come from git, so the gem has to be built from a git checkout
35
+ s.test_files = `git ls-files -- test/*`.split("\n")
36
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } # executable names are the basenames of the files under bin/
37
+ s.require_paths = ["lib"]
38
+ end
metadata ADDED
@@ -0,0 +1,284 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webpage-archivist
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Julien Kirch
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-08-10 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: andand
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 25
29
+ segments:
30
+ - 1
31
+ - 3
32
+ - 1
33
+ version: 1.3.1
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 53
45
+ segments:
46
+ - 3
47
+ - 25
48
+ version: "3.25"
49
+ type: :runtime
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: eventmachine
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ~>
58
+ - !ruby/object:Gem::Version
59
+ hash: 62196357
60
+ segments:
61
+ - 1
62
+ - 0
63
+ - 0
64
+ - beta
65
+ - 3
66
+ version: 1.0.0.beta.3
67
+ type: :runtime
68
+ version_requirements: *id003
69
+ - !ruby/object:Gem::Dependency
70
+ name: em-http-request
71
+ prerelease: false
72
+ requirement: &id004 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ hash: 62196363
78
+ segments:
79
+ - 1
80
+ - 0
81
+ - 0
82
+ - beta
83
+ - 4
84
+ version: 1.0.0.beta.4
85
+ type: :runtime
86
+ version_requirements: *id004
87
+ - !ruby/object:Gem::Dependency
88
+ name: nokogiri
89
+ prerelease: false
90
+ requirement: &id005 !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ~>
94
+ - !ruby/object:Gem::Version
95
+ hash: 5
96
+ segments:
97
+ - 1
98
+ - 5
99
+ version: "1.5"
100
+ type: :runtime
101
+ version_requirements: *id005
102
+ - !ruby/object:Gem::Dependency
103
+ name: addressable
104
+ prerelease: false
105
+ requirement: &id006 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ hash: 11
111
+ segments:
112
+ - 2
113
+ - 2
114
+ - 6
115
+ version: 2.2.6
116
+ type: :runtime
117
+ version_requirements: *id006
118
+ - !ruby/object:Gem::Dependency
119
+ name: css_parser
120
+ prerelease: false
121
+ requirement: &id007 !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ~>
125
+ - !ruby/object:Gem::Version
126
+ hash: 1
127
+ segments:
128
+ - 1
129
+ - 1
130
+ - 9
131
+ version: 1.1.9
132
+ type: :runtime
133
+ version_requirements: *id007
134
+ - !ruby/object:Gem::Dependency
135
+ name: grit
136
+ prerelease: false
137
+ requirement: &id008 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ~>
141
+ - !ruby/object:Gem::Version
142
+ hash: 29
143
+ segments:
144
+ - 2
145
+ - 4
146
+ - 1
147
+ version: 2.4.1
148
+ type: :runtime
149
+ version_requirements: *id008
150
+ - !ruby/object:Gem::Dependency
151
+ name: websnap
152
+ prerelease: false
153
+ requirement: &id009 !ruby/object:Gem::Requirement
154
+ none: false
155
+ requirements:
156
+ - - ~>
157
+ - !ruby/object:Gem::Version
158
+ hash: 29
159
+ segments:
160
+ - 0
161
+ - 1
162
+ - 3
163
+ version: 0.1.3
164
+ type: :runtime
165
+ version_requirements: *id009
166
+ - !ruby/object:Gem::Dependency
167
+ name: mini_magick
168
+ prerelease: false
169
+ requirement: &id010 !ruby/object:Gem::Requirement
170
+ none: false
171
+ requirements:
172
+ - - ~>
173
+ - !ruby/object:Gem::Version
174
+ hash: 1
175
+ segments:
176
+ - 3
177
+ - 3
178
+ version: "3.3"
179
+ type: :runtime
180
+ version_requirements: *id010
181
+ - !ruby/object:Gem::Dependency
182
+ name: mime-types
183
+ prerelease: false
184
+ requirement: &id011 !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ~>
188
+ - !ruby/object:Gem::Version
189
+ hash: 47
190
+ segments:
191
+ - 1
192
+ - 16
193
+ version: "1.16"
194
+ type: :runtime
195
+ version_requirements: *id011
196
+ - !ruby/object:Gem::Dependency
197
+ name: sqlite3
198
+ prerelease: false
199
+ requirement: &id012 !ruby/object:Gem::Requirement
200
+ none: false
201
+ requirements:
202
+ - - ~>
203
+ - !ruby/object:Gem::Version
204
+ hash: 29
205
+ segments:
206
+ - 1
207
+ - 3
208
+ - 3
209
+ version: 1.3.3
210
+ type: :development
211
+ version_requirements: *id012
212
+ description: An utility to archive webpages through time
213
+ email:
214
+ executables: []
215
+
216
+ extensions: []
217
+
218
+ extra_rdoc_files:
219
+ - README.rdoc
220
+ files:
221
+ - .gitignore
222
+ - Gemfile
223
+ - README.rdoc
224
+ - Rakefile
225
+ - lib/webpage-archivist.rb
226
+ - lib/webpage-archivist/extracter.rb
227
+ - lib/webpage-archivist/fetcher/element_request.rb
228
+ - lib/webpage-archivist/fetcher/fetcher.rb
229
+ - lib/webpage-archivist/fetcher/requests_plumber.rb
230
+ - lib/webpage-archivist/fetcher/stylesheet_request.rb
231
+ - lib/webpage-archivist/fetcher/thread-pool.rb
232
+ - lib/webpage-archivist/fetcher/webpage_request.rb
233
+ - lib/webpage-archivist/html_document.rb
234
+ - lib/webpage-archivist/migrations.rb
235
+ - lib/webpage-archivist/models.rb
236
+ - lib/webpage-archivist/patches.rb
237
+ - lib/webpage-archivist/snapshoter.rb
238
+ - lib/webpage-archivist/stylesheet_document.rb
239
+ - lib/webpage-archivist/version.rb
240
+ - lib/webpage-archivist/webpage-archivist.rb
241
+ - test/crud_test.rb
242
+ - test/files/stylesheet.css
243
+ - test/helper.rb
244
+ - test/stylesheet_test.rb
245
+ - webpage-archivist.gemspec
246
+ homepage: https://github.com/archiloque/webpage-archivist
247
+ licenses: []
248
+
249
+ post_install_message:
250
+ rdoc_options:
251
+ - --main
252
+ - README.rdoc
253
+ require_paths:
254
+ - lib
255
+ required_ruby_version: !ruby/object:Gem::Requirement
256
+ none: false
257
+ requirements:
258
+ - - ">="
259
+ - !ruby/object:Gem::Version
260
+ hash: 3
261
+ segments:
262
+ - 0
263
+ version: "0"
264
+ required_rubygems_version: !ruby/object:Gem::Requirement
265
+ none: false
266
+ requirements:
267
+ - - ">="
268
+ - !ruby/object:Gem::Version
269
+ hash: 3
270
+ segments:
271
+ - 0
272
+ version: "0"
273
+ requirements: []
274
+
275
+ rubyforge_project: webpage-archivist
276
+ rubygems_version: 1.8.5
277
+ signing_key:
278
+ specification_version: 3
279
+ summary: An utility to archive webpages through time
280
+ test_files:
281
+ - test/crud_test.rb
282
+ - test/files/stylesheet.css
283
+ - test/helper.rb
284
+ - test/stylesheet_test.rb