webpage-archivist 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ module WebpageArchivist
2
+ VERSION = "0.0.1" # gem version; the gemspec reads it as WebpageArchivist::VERSION
3
+ end
@@ -0,0 +1,79 @@
1
+ require 'logger'
2
+
3
+ module WebpageArchivist
4
+
5
+ # Entry point for the Web Archivist features.
6
+ # Database configuration relies on the DATABASE_URL environment variable,
7
+ # see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html for the connection string syntax
8
+ class WebpageArchivist
9
+
10
+ # Register a webpage for future fetching and return the Webpage created for it
11
+ # uri:: page uri
12
+ # name:: page name
13
+ def add_webpage uri, name
14
+ Webpage.create(:name => name, :uri => uri)
15
+ end
16
+
17
+ # Fetch the webpages with the given ids (delegates to Fetcher.fetch_webpages), return a hash indexed by the ids holding the corresponding instances or HTTP result codes
18
+ def fetch_webpages ids
19
+ Fetcher.fetch_webpages ids
20
+ end
21
+
22
+ # List all the webpages (Webpage.all)
23
+ def list_webpages
24
+ Webpage.all
25
+ end
26
+
27
+ # List the instances of a webpage
28
+ # webpage_id:: the webpage id
29
+ def list_instances webpage_id
30
+ Instance.where(:webpage_id => webpage_id)
31
+ end
32
+
33
+ # Write the full content of a webpage instance into a zip file (delegates to Extracter.instance_content)
34
+ # id:: the instance id
35
+ # file:: the file to write to
36
+ def extract_instance_content id, file
37
+ Extracter.instance_content id, file
38
+ end
39
+
40
+ # Purge cached elements (stylesheets, scripts and images) from the database, they are *not* deleted from the disk
41
+ # retention_period:: age in days; elements whose last_fetched is older than that are purged
42
+ def purge_cache retention_period
43
+ purge_starting_date = DateTime.now - retention_period # subtracting a number from a DateTime counts in days
44
+ Stylesheet.filter('last_fetched < ?', purge_starting_date).delete
45
+ Script.filter('last_fetched < ?', purge_starting_date).delete
46
+ Image.filter('last_fetched < ?', purge_starting_date).delete
47
+ end
48
+
49
+ # Create a snapshot of a web page
50
+ # See Snapshoter class for configuration
51
+ # instance:: what to snapshot (NOTE(review): presumably a webpage Instance, confirm against Snapshoter)
52
+ # thumbnail:: thumbnail argument (NOTE(review): presumably controls thumbnail generation, confirm against Snapshoter)
53
+ # Both arguments are passed unchanged to Snapshoter.snapshot
54
+ def snapshot instance, thumbnail
55
+ Snapshoter.snapshot instance, thumbnail
56
+ end
57
+
58
+ end
59
+
60
+ @@log = false # flag exposed through WebpageArchivist.log / log=; debug and error below do not consult it
61
+ @@logger = ::Logger.new(STDOUT) # logger shared by debug and error, writes to standard output
62
+
63
+ def self.log # current value of the log flag
64
+ @@log
65
+ end
66
+
67
+ def self.log= value # set the log flag
68
+ @@log = value
69
+ end
70
+
71
+ def self.debug(str) # log str at debug level
72
+ @@logger.debug { str }
73
+ end
74
+
75
+ def self.error(str) # log str at error level
76
+ @@logger.error { str }
77
+ end
78
+
79
+ end
data/test/crud_test.rb ADDED
@@ -0,0 +1,28 @@
1
+ require_relative 'helper'
2
+
3
+ describe 'crud' do # basic create / list checks on the archivist entry point
4
+
5
+ it 'has no webpage by default' do
6
+ WebpageArchivist::DATABASE.transaction do # run inside a transaction that is rolled back at the end
7
+
8
+ WebpageArchivist::Webpage.count.must_equal 0
9
+ @@archivist.list_webpages.count.must_equal 0 # @@archivist is created in helper.rb
10
+
11
+ raise(Sequel::Rollback) # roll the transaction back so the test leaves no rows behind
12
+ end
13
+ end
14
+
15
+ it 'can add a webpage' do
16
+ WebpageArchivist::DATABASE.transaction do # run inside a transaction that is rolled back at the end
17
+
18
+ webpage = @@archivist.add_webpage 'http://example.com', 'example' # arguments are uri, then name
19
+ webpage.must_be_instance_of WebpageArchivist::Webpage
20
+ webpage.name.must_equal 'example'
21
+ webpage.uri.must_equal 'http://example.com'
22
+ @@archivist.list_webpages.count.must_equal 1
23
+
24
+ raise(Sequel::Rollback) # undo the insert so other tests still see an empty table
25
+ end
26
+ end
27
+
28
+ end
@@ -0,0 +1,14 @@
1
+ @import url("relative.css");
2
+ @import url("http://absolute.net/stylesheet.css");
3
+
4
+ #something1 {
5
+ background-image: url("relative1.jpg")
6
+ }
7
+
8
+ #something2 {
9
+ background:#ffffff url("relative2.jpg") no-repeat right top;
10
+ }
11
+
12
+ #something3 {
13
+ background-image: url("http://absolute.net/absolute.jpg")
14
+ }
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ database_path = "sqlite://#{Dir.pwd}/webpage-archivist-test.sqlite3"
2
+
3
+ if File.exist? database_path
4
+ File.delete database_path
5
+ end
6
+
7
+ ENV['DATABASE_URL'] = database_path
8
+
9
+ require 'bundler'
10
+ Bundler.setup
11
+ require 'test/unit'
12
+ require 'minitest/spec'
13
+
14
+ require_relative '../lib/webpage-archivist'
15
+ @@archivist = WebpageArchivist::WebpageArchivist.new
@@ -0,0 +1,48 @@
1
+ require_relative 'helper'
2
+
3
+ describe 'stylesheet' do # checks StylesheetDocument against the files/stylesheet.css fixture
4
+
5
+ def stylesheet_file_path # absolute path of the CSS fixture
6
+ current_file = File.expand_path(File.dirname(__FILE__)) # directory holding this test file (despite the variable name)
7
+ File.join(current_file, 'files', 'stylesheet.css')
8
+ end
9
+
10
+ def base_url # base URL handed to StylesheetDocument
11
+ Addressable::URI.parse('http://my.example.com')
12
+ end
13
+
14
+ def parse_stylesheet # build a StylesheetDocument from the fixture content and base_url
15
+ WebpageArchivist::StylesheetDocument.new(IO.read(stylesheet_file_path), base_url)
16
+ end
17
+
18
+ it 'lists imports' do
19
+ imports = []
20
+ stylesheet = parse_stylesheet
21
+ stylesheet.each_import do |i|
22
+ imports << i # collect every import the document yields, in order
23
+ nil # NOTE(review): presumably tells each_import to leave the url unchanged; confirm in StylesheetDocument
24
+ end
25
+ imports.length.must_equal 2
26
+ imports[0].must_equal 'relative.css'
27
+ imports[1].must_equal 'http://absolute.net/stylesheet.css'
28
+ stylesheet.to_css.must_include '@import url("relative.css");' # the output must still contain both imports as written in the fixture
29
+ stylesheet.to_css.must_include '@import url("http://absolute.net/stylesheet.css");'
30
+ end
31
+
32
+ it 'list images' do
33
+ images = []
34
+ stylesheet = parse_stylesheet
35
+ stylesheet.each_image do |image|
36
+ images << image # collect every image url the document yields, in order
37
+ nil # NOTE(review): presumably tells each_image to leave the url unchanged; confirm in StylesheetDocument
38
+ end
39
+ images.length.must_equal 3
40
+ images[0].must_equal 'relative1.jpg'
41
+ images[1].must_equal 'relative2.jpg'
42
+ images[2].must_equal 'http://absolute.net/absolute.jpg'
43
+ stylesheet.to_css.must_include 'url("relative1.jpg");' # NOTE(review): no fixture declaration has ';' right after the url, so these pass only if to_css normalizes declarations; confirm
44
+ stylesheet.to_css.must_include 'url("relative2.jpg");'
45
+ stylesheet.to_css.must_include 'url("http://absolute.net/absolute.jpg");'
46
+ end
47
+
48
+ end
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__) # put lib/ on the load path so the version file below can be required
3
+ require "webpage-archivist/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = 'webpage-archivist'
7
+ s.version = WebpageArchivist::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Julien Kirch"]
10
+ s.homepage = 'https://github.com/archiloque/webpage-archivist'
11
+ s.summary = 'An utility to archive webpages through time'
12
+ s.description = s.summary # the description reuses the summary text
13
+
14
+ s.rubyforge_project = s.name # NOTE(review): deprecated in newer RubyGems releases; confirm before upgrading
15
+
16
+ s.has_rdoc = true # NOTE(review): deprecated in newer RubyGems releases; confirm before upgrading
17
+ s.extra_rdoc_files = ['README.rdoc']
18
+ s.rdoc_options = ['--main', 'README.rdoc']
19
+
20
+ s.add_runtime_dependency 'andand', '~> 1.3.1'
21
+ s.add_runtime_dependency 'sequel', '~> 3.25'
22
+ s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
23
+ s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
24
+ s.add_runtime_dependency 'nokogiri', '~> 1.5'
25
+ s.add_runtime_dependency 'addressable', '~> 2.2.6'
26
+ s.add_runtime_dependency 'css_parser', '~> 1.1.9'
27
+ s.add_runtime_dependency 'grit', '~> 2.4.1'
28
+ s.add_runtime_dependency 'websnap', '~> 0.1.3'
29
+ s.add_runtime_dependency 'mini_magick', '~> 3.3'
30
+ s.add_runtime_dependency 'mime-types', '~> 1.16'
31
+
32
+ s.add_development_dependency 'sqlite3', '~> 1.3.3' # development only; the test helper points DATABASE_URL at a sqlite file
33
+
34
+ s.files = `git ls-files`.split("\n") # file lists come from git, so the gemspec must be evaluated inside a git checkout
35
+ s.test_files = `git ls-files -- test/*`.split("\n")
36
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } # keep only the file names of bin/ entries
37
+ s.require_paths = ["lib"]
38
+ end
metadata ADDED
@@ -0,0 +1,284 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webpage-archivist
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Julien Kirch
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-08-10 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: andand
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ~>
27
+ - !ruby/object:Gem::Version
28
+ hash: 25
29
+ segments:
30
+ - 1
31
+ - 3
32
+ - 1
33
+ version: 1.3.1
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ hash: 53
45
+ segments:
46
+ - 3
47
+ - 25
48
+ version: "3.25"
49
+ type: :runtime
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: eventmachine
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ~>
58
+ - !ruby/object:Gem::Version
59
+ hash: 62196357
60
+ segments:
61
+ - 1
62
+ - 0
63
+ - 0
64
+ - beta
65
+ - 3
66
+ version: 1.0.0.beta.3
67
+ type: :runtime
68
+ version_requirements: *id003
69
+ - !ruby/object:Gem::Dependency
70
+ name: em-http-request
71
+ prerelease: false
72
+ requirement: &id004 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ hash: 62196363
78
+ segments:
79
+ - 1
80
+ - 0
81
+ - 0
82
+ - beta
83
+ - 4
84
+ version: 1.0.0.beta.4
85
+ type: :runtime
86
+ version_requirements: *id004
87
+ - !ruby/object:Gem::Dependency
88
+ name: nokogiri
89
+ prerelease: false
90
+ requirement: &id005 !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ~>
94
+ - !ruby/object:Gem::Version
95
+ hash: 5
96
+ segments:
97
+ - 1
98
+ - 5
99
+ version: "1.5"
100
+ type: :runtime
101
+ version_requirements: *id005
102
+ - !ruby/object:Gem::Dependency
103
+ name: addressable
104
+ prerelease: false
105
+ requirement: &id006 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ hash: 11
111
+ segments:
112
+ - 2
113
+ - 2
114
+ - 6
115
+ version: 2.2.6
116
+ type: :runtime
117
+ version_requirements: *id006
118
+ - !ruby/object:Gem::Dependency
119
+ name: css_parser
120
+ prerelease: false
121
+ requirement: &id007 !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ~>
125
+ - !ruby/object:Gem::Version
126
+ hash: 1
127
+ segments:
128
+ - 1
129
+ - 1
130
+ - 9
131
+ version: 1.1.9
132
+ type: :runtime
133
+ version_requirements: *id007
134
+ - !ruby/object:Gem::Dependency
135
+ name: grit
136
+ prerelease: false
137
+ requirement: &id008 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ~>
141
+ - !ruby/object:Gem::Version
142
+ hash: 29
143
+ segments:
144
+ - 2
145
+ - 4
146
+ - 1
147
+ version: 2.4.1
148
+ type: :runtime
149
+ version_requirements: *id008
150
+ - !ruby/object:Gem::Dependency
151
+ name: websnap
152
+ prerelease: false
153
+ requirement: &id009 !ruby/object:Gem::Requirement
154
+ none: false
155
+ requirements:
156
+ - - ~>
157
+ - !ruby/object:Gem::Version
158
+ hash: 29
159
+ segments:
160
+ - 0
161
+ - 1
162
+ - 3
163
+ version: 0.1.3
164
+ type: :runtime
165
+ version_requirements: *id009
166
+ - !ruby/object:Gem::Dependency
167
+ name: mini_magick
168
+ prerelease: false
169
+ requirement: &id010 !ruby/object:Gem::Requirement
170
+ none: false
171
+ requirements:
172
+ - - ~>
173
+ - !ruby/object:Gem::Version
174
+ hash: 1
175
+ segments:
176
+ - 3
177
+ - 3
178
+ version: "3.3"
179
+ type: :runtime
180
+ version_requirements: *id010
181
+ - !ruby/object:Gem::Dependency
182
+ name: mime-types
183
+ prerelease: false
184
+ requirement: &id011 !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ~>
188
+ - !ruby/object:Gem::Version
189
+ hash: 47
190
+ segments:
191
+ - 1
192
+ - 16
193
+ version: "1.16"
194
+ type: :runtime
195
+ version_requirements: *id011
196
+ - !ruby/object:Gem::Dependency
197
+ name: sqlite3
198
+ prerelease: false
199
+ requirement: &id012 !ruby/object:Gem::Requirement
200
+ none: false
201
+ requirements:
202
+ - - ~>
203
+ - !ruby/object:Gem::Version
204
+ hash: 29
205
+ segments:
206
+ - 1
207
+ - 3
208
+ - 3
209
+ version: 1.3.3
210
+ type: :development
211
+ version_requirements: *id012
212
+ description: An utility to archive webpages through time
213
+ email:
214
+ executables: []
215
+
216
+ extensions: []
217
+
218
+ extra_rdoc_files:
219
+ - README.rdoc
220
+ files:
221
+ - .gitignore
222
+ - Gemfile
223
+ - README.rdoc
224
+ - Rakefile
225
+ - lib/webpage-archivist.rb
226
+ - lib/webpage-archivist/extracter.rb
227
+ - lib/webpage-archivist/fetcher/element_request.rb
228
+ - lib/webpage-archivist/fetcher/fetcher.rb
229
+ - lib/webpage-archivist/fetcher/requests_plumber.rb
230
+ - lib/webpage-archivist/fetcher/stylesheet_request.rb
231
+ - lib/webpage-archivist/fetcher/thread-pool.rb
232
+ - lib/webpage-archivist/fetcher/webpage_request.rb
233
+ - lib/webpage-archivist/html_document.rb
234
+ - lib/webpage-archivist/migrations.rb
235
+ - lib/webpage-archivist/models.rb
236
+ - lib/webpage-archivist/patches.rb
237
+ - lib/webpage-archivist/snapshoter.rb
238
+ - lib/webpage-archivist/stylesheet_document.rb
239
+ - lib/webpage-archivist/version.rb
240
+ - lib/webpage-archivist/webpage-archivist.rb
241
+ - test/crud_test.rb
242
+ - test/files/stylesheet.css
243
+ - test/helper.rb
244
+ - test/stylesheet_test.rb
245
+ - webpage-archivist.gemspec
246
+ homepage: https://github.com/archiloque/webpage-archivist
247
+ licenses: []
248
+
249
+ post_install_message:
250
+ rdoc_options:
251
+ - --main
252
+ - README.rdoc
253
+ require_paths:
254
+ - lib
255
+ required_ruby_version: !ruby/object:Gem::Requirement
256
+ none: false
257
+ requirements:
258
+ - - ">="
259
+ - !ruby/object:Gem::Version
260
+ hash: 3
261
+ segments:
262
+ - 0
263
+ version: "0"
264
+ required_rubygems_version: !ruby/object:Gem::Requirement
265
+ none: false
266
+ requirements:
267
+ - - ">="
268
+ - !ruby/object:Gem::Version
269
+ hash: 3
270
+ segments:
271
+ - 0
272
+ version: "0"
273
+ requirements: []
274
+
275
+ rubyforge_project: webpage-archivist
276
+ rubygems_version: 1.8.5
277
+ signing_key:
278
+ specification_version: 3
279
+ summary: An utility to archive webpages through time
280
+ test_files:
281
+ - test/crud_test.rb
282
+ - test/files/stylesheet.css
283
+ - test/helper.rb
284
+ - test/stylesheet_test.rb