webpage-archivist 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/README.rdoc +50 -0
- data/Rakefile +10 -0
- data/lib/webpage-archivist.rb +20 -0
- data/lib/webpage-archivist/extracter.rb +31 -0
- data/lib/webpage-archivist/fetcher/element_request.rb +126 -0
- data/lib/webpage-archivist/fetcher/fetcher.rb +83 -0
- data/lib/webpage-archivist/fetcher/requests_plumber.rb +140 -0
- data/lib/webpage-archivist/fetcher/stylesheet_request.rb +112 -0
- data/lib/webpage-archivist/fetcher/thread-pool.rb +101 -0
- data/lib/webpage-archivist/fetcher/webpage_request.rb +197 -0
- data/lib/webpage-archivist/html_document.rb +66 -0
- data/lib/webpage-archivist/migrations.rb +93 -0
- data/lib/webpage-archivist/models.rb +190 -0
- data/lib/webpage-archivist/patches.rb +63 -0
- data/lib/webpage-archivist/snapshoter.rb +77 -0
- data/lib/webpage-archivist/stylesheet_document.rb +129 -0
- data/lib/webpage-archivist/version.rb +3 -0
- data/lib/webpage-archivist/webpage-archivist.rb +79 -0
- data/test/crud_test.rb +28 -0
- data/test/files/stylesheet.css +14 -0
- data/test/helper.rb +15 -0
- data/test/stylesheet_test.rb +48 -0
- data/webpage-archivist.gemspec +38 -0
- metadata +284 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module WebpageArchivist
|
4
|
+
|
5
|
+
# Entry point for the Web Archivist features.
|
6
|
+
# Database configuration relies on the DATABASE_URL environment variable
|
7
|
+
# see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html for the syntax detail
|
8
|
+
class WebpageArchivist
|
9
|
+
|
10
|
+
# Add a webpage for future fetching, return the corresponding Webpage
|
11
|
+
# uri:: page uri
|
12
|
+
# name:: page name
|
13
|
+
def add_webpage uri, name
|
14
|
+
Webpage.create(:name => name, :uri => uri)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Fetch several webpages, return a hash indexed by the ids, holding the corresponding instances or HTTP result codes
|
18
|
+
def fetch_webpages ids
|
19
|
+
Fetcher.fetch_webpages ids
|
20
|
+
end
|
21
|
+
|
22
|
+
# List the webpages
|
23
|
+
def list_webpages
|
24
|
+
Webpage.all
|
25
|
+
end
|
26
|
+
|
27
|
+
# List the instances of a webpage
|
28
|
+
# webpage_id:: the webpage id
|
29
|
+
def list_instances webpage_id
|
30
|
+
Instance.where(:webpage_id => webpage_id)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Write the full content of a webpage instance into a zip file
|
34
|
+
# id:: the instance id
|
35
|
+
# file:: the file to write to
|
36
|
+
def extract_instance_content id, file
|
37
|
+
Extracter.instance_content id, file
|
38
|
+
end
|
39
|
+
|
40
|
+
# Purge cached elements from the database; they are *not* deleted from the disk
|
41
|
+
# retention_period:: number of days after which the purge should start
|
42
|
+
def purge_cache retention_period
|
43
|
+
purge_starting_date = DateTime.now - retention_period
|
44
|
+
Stylesheet.filter('last_fetched < ?', purge_starting_date).delete
|
45
|
+
Script.filter('last_fetched < ?', purge_starting_date).delete
|
46
|
+
Image.filter('last_fetched < ?', purge_starting_date).delete
|
47
|
+
end
|
48
|
+
|
49
|
+
# Create a snapshot of a web page
|
50
|
+
# See Snapshoter class for configuration
|
51
|
+
# uri:: the uri to snapshot
|
52
|
+
# snapshot_path:: path to the snapshot file
|
53
|
+
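# thumbnail_path:: path to the thumbnail (can be nil for no thumbnail) -- NOTE(review): the method signature names its parameters +instance+ and +thumbnail+, not uri/snapshot_path/thumbnail_path; confirm against Snapshoter.snapshot and align this list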
|
54
|
+
def snapshot instance, thumbnail
|
55
|
+
Snapshoter.snapshot instance, thumbnail
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
@@log = false
|
61
|
+
@@logger = ::Logger.new(STDOUT)
|
62
|
+
|
63
|
+
def self.log
|
64
|
+
@@log
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.log= value
|
68
|
+
@@log = value
|
69
|
+
end
|
70
|
+
|
71
|
+
def self.debug(str)
|
72
|
+
@@logger.debug { str }
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.error(str)
|
76
|
+
@@logger.error { str }
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
data/test/crud_test.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require_relative 'helper'
|
2
|
+
|
3
|
+
describe 'crud' do
|
4
|
+
|
5
|
+
it 'has no webpage by default' do
|
6
|
+
WebpageArchivist::DATABASE.transaction do
|
7
|
+
|
8
|
+
WebpageArchivist::Webpage.count.must_equal 0
|
9
|
+
@@archivist.list_webpages.count.must_equal 0
|
10
|
+
|
11
|
+
raise(Sequel::Rollback)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'can add a webpage' do
|
16
|
+
WebpageArchivist::DATABASE.transaction do
|
17
|
+
|
18
|
+
webpage = @@archivist.add_webpage 'http://example.com', 'example'
|
19
|
+
webpage.must_be_instance_of WebpageArchivist::Webpage
|
20
|
+
webpage.name.must_equal 'example'
|
21
|
+
webpage.uri.must_equal 'http://example.com'
|
22
|
+
@@archivist.list_webpages.count.must_equal 1
|
23
|
+
|
24
|
+
raise(Sequel::Rollback)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
@import url("relative.css");
|
2
|
+
@import url("http://absolute.net/stylesheet.css");
|
3
|
+
|
4
|
+
#something1 {
|
5
|
+
background-image: url("relative1.jpg")
|
6
|
+
}
|
7
|
+
|
8
|
+
#something2 {
|
9
|
+
background:#ffffff url("relative2.jpg") no-repeat right top;
|
10
|
+
}
|
11
|
+
|
12
|
+
#something3 {
|
13
|
+
background-image: url("http://absolute.net/absolute.jpg")
|
14
|
+
}
|
data/test/helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
database_path = "sqlite://#{Dir.pwd}/webpage-archivist-test.sqlite3"
|
2
|
+
|
3
|
+
if File.exist? database_path
|
4
|
+
File.delete database_path
|
5
|
+
end
|
6
|
+
|
7
|
+
ENV['DATABASE_URL'] = database_path
|
8
|
+
|
9
|
+
require 'bundler'
|
10
|
+
Bundler.setup
|
11
|
+
require 'test/unit'
|
12
|
+
require 'minitest/spec'
|
13
|
+
|
14
|
+
require_relative '../lib/webpage-archivist'
|
15
|
+
@@archivist = WebpageArchivist::WebpageArchivist.new
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative 'helper'
|
2
|
+
|
3
|
+
describe 'stylesheet' do
|
4
|
+
|
5
|
+
def stylesheet_file_path
|
6
|
+
current_file = File.expand_path(File.dirname(__FILE__))
|
7
|
+
File.join(current_file, 'files', 'stylesheet.css')
|
8
|
+
end
|
9
|
+
|
10
|
+
def base_url
|
11
|
+
Addressable::URI.parse('http://my.example.com')
|
12
|
+
end
|
13
|
+
|
14
|
+
def parse_stylesheet
|
15
|
+
WebpageArchivist::StylesheetDocument.new(IO.read(stylesheet_file_path), base_url)
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'lists imports' do
|
19
|
+
imports = []
|
20
|
+
stylesheet = parse_stylesheet
|
21
|
+
stylesheet.each_import do |i|
|
22
|
+
imports << i
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
imports.length.must_equal 2
|
26
|
+
imports[0].must_equal 'relative.css'
|
27
|
+
imports[1].must_equal 'http://absolute.net/stylesheet.css'
|
28
|
+
stylesheet.to_css.must_include '@import url("relative.css");'
|
29
|
+
stylesheet.to_css.must_include '@import url("http://absolute.net/stylesheet.css");'
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'list images' do
|
33
|
+
images = []
|
34
|
+
stylesheet = parse_stylesheet
|
35
|
+
stylesheet.each_image do |image|
|
36
|
+
images << image
|
37
|
+
nil
|
38
|
+
end
|
39
|
+
images.length.must_equal 3
|
40
|
+
images[0].must_equal 'relative1.jpg'
|
41
|
+
images[1].must_equal 'relative2.jpg'
|
42
|
+
images[2].must_equal 'http://absolute.net/absolute.jpg'
|
43
|
+
stylesheet.to_css.must_include 'url("relative1.jpg");'
|
44
|
+
stylesheet.to_css.must_include 'url("relative2.jpg");'
|
45
|
+
stylesheet.to_css.must_include 'url("http://absolute.net/absolute.jpg");'
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "webpage-archivist/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = 'webpage-archivist'
|
7
|
+
s.version = WebpageArchivist::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Julien Kirch"]
|
10
|
+
s.homepage = 'https://github.com/archiloque/webpage-archivist'
|
11
|
+
s.summary = 'An utility to archive webpages through time'
|
12
|
+
s.description = s.summary
|
13
|
+
|
14
|
+
s.rubyforge_project = s.name
|
15
|
+
|
16
|
+
s.has_rdoc = true
|
17
|
+
s.extra_rdoc_files = ['README.rdoc']
|
18
|
+
s.rdoc_options = ['--main', 'README.rdoc']
|
19
|
+
|
20
|
+
s.add_runtime_dependency 'andand', '~> 1.3.1'
|
21
|
+
s.add_runtime_dependency 'sequel', '~> 3.25'
|
22
|
+
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
|
23
|
+
s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
|
24
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.5'
|
25
|
+
s.add_runtime_dependency 'addressable', '~> 2.2.6'
|
26
|
+
s.add_runtime_dependency 'css_parser', '~> 1.1.9'
|
27
|
+
s.add_runtime_dependency 'grit', '~> 2.4.1'
|
28
|
+
s.add_runtime_dependency 'websnap', '~> 0.1.3'
|
29
|
+
s.add_runtime_dependency 'mini_magick', '~> 3.3'
|
30
|
+
s.add_runtime_dependency 'mime-types', '~> 1.16'
|
31
|
+
|
32
|
+
s.add_development_dependency 'sqlite3', '~> 1.3.3'
|
33
|
+
|
34
|
+
s.files = `git ls-files`.split("\n")
|
35
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
36
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
end
|
metadata
ADDED
@@ -0,0 +1,284 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webpage-archivist
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Julien Kirch
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-08-10 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: andand
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ~>
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 25
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 3
|
32
|
+
- 1
|
33
|
+
version: 1.3.1
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: sequel
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 53
|
45
|
+
segments:
|
46
|
+
- 3
|
47
|
+
- 25
|
48
|
+
version: "3.25"
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: eventmachine
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ~>
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 62196357
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
63
|
+
- 0
|
64
|
+
- beta
|
65
|
+
- 3
|
66
|
+
version: 1.0.0.beta.3
|
67
|
+
type: :runtime
|
68
|
+
version_requirements: *id003
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: em-http-request
|
71
|
+
prerelease: false
|
72
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
hash: 62196363
|
78
|
+
segments:
|
79
|
+
- 1
|
80
|
+
- 0
|
81
|
+
- 0
|
82
|
+
- beta
|
83
|
+
- 4
|
84
|
+
version: 1.0.0.beta.4
|
85
|
+
type: :runtime
|
86
|
+
version_requirements: *id004
|
87
|
+
- !ruby/object:Gem::Dependency
|
88
|
+
name: nokogiri
|
89
|
+
prerelease: false
|
90
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ~>
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
hash: 5
|
96
|
+
segments:
|
97
|
+
- 1
|
98
|
+
- 5
|
99
|
+
version: "1.5"
|
100
|
+
type: :runtime
|
101
|
+
version_requirements: *id005
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: addressable
|
104
|
+
prerelease: false
|
105
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 11
|
111
|
+
segments:
|
112
|
+
- 2
|
113
|
+
- 2
|
114
|
+
- 6
|
115
|
+
version: 2.2.6
|
116
|
+
type: :runtime
|
117
|
+
version_requirements: *id006
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: css_parser
|
120
|
+
prerelease: false
|
121
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ~>
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
hash: 1
|
127
|
+
segments:
|
128
|
+
- 1
|
129
|
+
- 1
|
130
|
+
- 9
|
131
|
+
version: 1.1.9
|
132
|
+
type: :runtime
|
133
|
+
version_requirements: *id007
|
134
|
+
- !ruby/object:Gem::Dependency
|
135
|
+
name: grit
|
136
|
+
prerelease: false
|
137
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ~>
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
hash: 29
|
143
|
+
segments:
|
144
|
+
- 2
|
145
|
+
- 4
|
146
|
+
- 1
|
147
|
+
version: 2.4.1
|
148
|
+
type: :runtime
|
149
|
+
version_requirements: *id008
|
150
|
+
- !ruby/object:Gem::Dependency
|
151
|
+
name: websnap
|
152
|
+
prerelease: false
|
153
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
154
|
+
none: false
|
155
|
+
requirements:
|
156
|
+
- - ~>
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
hash: 29
|
159
|
+
segments:
|
160
|
+
- 0
|
161
|
+
- 1
|
162
|
+
- 3
|
163
|
+
version: 0.1.3
|
164
|
+
type: :runtime
|
165
|
+
version_requirements: *id009
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: mini_magick
|
168
|
+
prerelease: false
|
169
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
170
|
+
none: false
|
171
|
+
requirements:
|
172
|
+
- - ~>
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
hash: 1
|
175
|
+
segments:
|
176
|
+
- 3
|
177
|
+
- 3
|
178
|
+
version: "3.3"
|
179
|
+
type: :runtime
|
180
|
+
version_requirements: *id010
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: mime-types
|
183
|
+
prerelease: false
|
184
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ~>
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
hash: 47
|
190
|
+
segments:
|
191
|
+
- 1
|
192
|
+
- 16
|
193
|
+
version: "1.16"
|
194
|
+
type: :runtime
|
195
|
+
version_requirements: *id011
|
196
|
+
- !ruby/object:Gem::Dependency
|
197
|
+
name: sqlite3
|
198
|
+
prerelease: false
|
199
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
200
|
+
none: false
|
201
|
+
requirements:
|
202
|
+
- - ~>
|
203
|
+
- !ruby/object:Gem::Version
|
204
|
+
hash: 29
|
205
|
+
segments:
|
206
|
+
- 1
|
207
|
+
- 3
|
208
|
+
- 3
|
209
|
+
version: 1.3.3
|
210
|
+
type: :development
|
211
|
+
version_requirements: *id012
|
212
|
+
description: An utility to archive webpages through time
|
213
|
+
email:
|
214
|
+
executables: []
|
215
|
+
|
216
|
+
extensions: []
|
217
|
+
|
218
|
+
extra_rdoc_files:
|
219
|
+
- README.rdoc
|
220
|
+
files:
|
221
|
+
- .gitignore
|
222
|
+
- Gemfile
|
223
|
+
- README.rdoc
|
224
|
+
- Rakefile
|
225
|
+
- lib/webpage-archivist.rb
|
226
|
+
- lib/webpage-archivist/extracter.rb
|
227
|
+
- lib/webpage-archivist/fetcher/element_request.rb
|
228
|
+
- lib/webpage-archivist/fetcher/fetcher.rb
|
229
|
+
- lib/webpage-archivist/fetcher/requests_plumber.rb
|
230
|
+
- lib/webpage-archivist/fetcher/stylesheet_request.rb
|
231
|
+
- lib/webpage-archivist/fetcher/thread-pool.rb
|
232
|
+
- lib/webpage-archivist/fetcher/webpage_request.rb
|
233
|
+
- lib/webpage-archivist/html_document.rb
|
234
|
+
- lib/webpage-archivist/migrations.rb
|
235
|
+
- lib/webpage-archivist/models.rb
|
236
|
+
- lib/webpage-archivist/patches.rb
|
237
|
+
- lib/webpage-archivist/snapshoter.rb
|
238
|
+
- lib/webpage-archivist/stylesheet_document.rb
|
239
|
+
- lib/webpage-archivist/version.rb
|
240
|
+
- lib/webpage-archivist/webpage-archivist.rb
|
241
|
+
- test/crud_test.rb
|
242
|
+
- test/files/stylesheet.css
|
243
|
+
- test/helper.rb
|
244
|
+
- test/stylesheet_test.rb
|
245
|
+
- webpage-archivist.gemspec
|
246
|
+
homepage: https://github.com/archiloque/webpage-archivist
|
247
|
+
licenses: []
|
248
|
+
|
249
|
+
post_install_message:
|
250
|
+
rdoc_options:
|
251
|
+
- --main
|
252
|
+
- README.rdoc
|
253
|
+
require_paths:
|
254
|
+
- lib
|
255
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
256
|
+
none: false
|
257
|
+
requirements:
|
258
|
+
- - ">="
|
259
|
+
- !ruby/object:Gem::Version
|
260
|
+
hash: 3
|
261
|
+
segments:
|
262
|
+
- 0
|
263
|
+
version: "0"
|
264
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
265
|
+
none: false
|
266
|
+
requirements:
|
267
|
+
- - ">="
|
268
|
+
- !ruby/object:Gem::Version
|
269
|
+
hash: 3
|
270
|
+
segments:
|
271
|
+
- 0
|
272
|
+
version: "0"
|
273
|
+
requirements: []
|
274
|
+
|
275
|
+
rubyforge_project: webpage-archivist
|
276
|
+
rubygems_version: 1.8.5
|
277
|
+
signing_key:
|
278
|
+
specification_version: 3
|
279
|
+
summary: An utility to archive webpages through time
|
280
|
+
test_files:
|
281
|
+
- test/crud_test.rb
|
282
|
+
- test/files/stylesheet.css
|
283
|
+
- test/helper.rb
|
284
|
+
- test/stylesheet_test.rb
|