webpage-archivist 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/README.rdoc +50 -0
- data/Rakefile +10 -0
- data/lib/webpage-archivist.rb +20 -0
- data/lib/webpage-archivist/extracter.rb +31 -0
- data/lib/webpage-archivist/fetcher/element_request.rb +126 -0
- data/lib/webpage-archivist/fetcher/fetcher.rb +83 -0
- data/lib/webpage-archivist/fetcher/requests_plumber.rb +140 -0
- data/lib/webpage-archivist/fetcher/stylesheet_request.rb +112 -0
- data/lib/webpage-archivist/fetcher/thread-pool.rb +101 -0
- data/lib/webpage-archivist/fetcher/webpage_request.rb +197 -0
- data/lib/webpage-archivist/html_document.rb +66 -0
- data/lib/webpage-archivist/migrations.rb +93 -0
- data/lib/webpage-archivist/models.rb +190 -0
- data/lib/webpage-archivist/patches.rb +63 -0
- data/lib/webpage-archivist/snapshoter.rb +77 -0
- data/lib/webpage-archivist/stylesheet_document.rb +129 -0
- data/lib/webpage-archivist/version.rb +3 -0
- data/lib/webpage-archivist/webpage-archivist.rb +79 -0
- data/test/crud_test.rb +28 -0
- data/test/files/stylesheet.css +14 -0
- data/test/helper.rb +15 -0
- data/test/stylesheet_test.rb +48 -0
- data/webpage-archivist.gemspec +38 -0
- metadata +284 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module WebpageArchivist
|
4
|
+
|
5
|
+
# Entry point for the Web Archivist features.
|
6
|
+
# Database configuration relies on the DATABASE_URL environment variable
|
7
|
+
# see http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html for the syntax details
|
8
|
+
class WebpageArchivist
|
9
|
+
|
10
|
+
# Add a webpage for future fetching, return the corresponding Webpage
|
11
|
+
# uri:: page uri
|
12
|
+
# name:: page name
|
13
|
+
def add_webpage uri, name
|
14
|
+
Webpage.create(:name => name, :uri => uri)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Fetch several webpages and return a hash, indexed by the ids, holding the corresponding instances or HTTP result codes
|
18
|
+
def fetch_webpages ids
|
19
|
+
Fetcher.fetch_webpages ids
|
20
|
+
end
|
21
|
+
|
22
|
+
# List the webpages
|
23
|
+
def list_webpages
|
24
|
+
Webpage.all
|
25
|
+
end
|
26
|
+
|
27
|
+
# List the instances of a webpage
|
28
|
+
# webpage_id:: the webpage id
|
29
|
+
def list_instances webpage_id
|
30
|
+
Instance.where(:webpage_id => webpage_id)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Write the full content of a webpage instance into a zip file
|
34
|
+
# id:: the instance id
|
35
|
+
# file:: the file to write to
|
36
|
+
def extract_instance_content id, file
|
37
|
+
Extracter.instance_content id, file
|
38
|
+
end
|
39
|
+
|
40
|
+
# Purge cached elements from the database; they are *not* deleted from the disk
|
41
|
+
# retention_period:: retention period in days; elements last fetched before (now - retention_period) are purged
|
42
|
+
def purge_cache retention_period
|
43
|
+
purge_starting_date = DateTime.now - retention_period
|
44
|
+
Stylesheet.filter('last_fetched < ?', purge_starting_date).delete
|
45
|
+
Script.filter('last_fetched < ?', purge_starting_date).delete
|
46
|
+
Image.filter('last_fetched < ?', purge_starting_date).delete
|
47
|
+
end
|
48
|
+
|
49
|
+
# Create a snapshot of a web page
|
50
|
+
# See Snapshoter class for configuration
|
51
|
+
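# instance:: what to snapshot (previously documented as a uri; NOTE(review): confirm the expected type against Snapshoter.snapshot)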
|
52
|
+
# snapshot_path:: path to the snapshot file (NOTE(review): not a parameter of this method; confirm how Snapshoter.snapshot determines it)
|
53
|
+
# thumbnail:: path to the thumbnail (can be nil for no thumbnail)
|
54
|
+
def snapshot instance, thumbnail
|
55
|
+
Snapshoter.snapshot instance, thumbnail
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
@@log = false
|
61
|
+
@@logger = ::Logger.new(STDOUT)
|
62
|
+
|
63
|
+
def self.log
|
64
|
+
@@log
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.log= value
|
68
|
+
@@log = value
|
69
|
+
end
|
70
|
+
|
71
|
+
def self.debug(str)
|
72
|
+
@@logger.debug { str }
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.error(str)
|
76
|
+
@@logger.error { str }
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
data/test/crud_test.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require_relative 'helper'
|
2
|
+
|
3
|
+
describe 'crud' do
|
4
|
+
|
5
|
+
it 'has no webpage by default' do
|
6
|
+
WebpageArchivist::DATABASE.transaction do
|
7
|
+
|
8
|
+
WebpageArchivist::Webpage.count.must_equal 0
|
9
|
+
@@archivist.list_webpages.count.must_equal 0
|
10
|
+
|
11
|
+
raise(Sequel::Rollback)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'can add a webpage' do
|
16
|
+
WebpageArchivist::DATABASE.transaction do
|
17
|
+
|
18
|
+
webpage = @@archivist.add_webpage 'http://example.com', 'example'
|
19
|
+
webpage.must_be_instance_of WebpageArchivist::Webpage
|
20
|
+
webpage.name.must_equal 'example'
|
21
|
+
webpage.uri.must_equal 'http://example.com'
|
22
|
+
@@archivist.list_webpages.count.must_equal 1
|
23
|
+
|
24
|
+
raise(Sequel::Rollback)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
@import url("relative.css");
|
2
|
+
@import url("http://absolute.net/stylesheet.css");
|
3
|
+
|
4
|
+
#something1 {
|
5
|
+
background-image: url("relative1.jpg")
|
6
|
+
}
|
7
|
+
|
8
|
+
#something2 {
|
9
|
+
background:#ffffff url("relative2.jpg") no-repeat right top;
|
10
|
+
}
|
11
|
+
|
12
|
+
#something3 {
|
13
|
+
background-image: url("http://absolute.net/absolute.jpg")
|
14
|
+
}
|
data/test/helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
database_path = "sqlite://#{Dir.pwd}/webpage-archivist-test.sqlite3"
|
2
|
+
|
3
|
+
if File.exist? database_path
|
4
|
+
File.delete database_path
|
5
|
+
end
|
6
|
+
|
7
|
+
ENV['DATABASE_URL'] = database_path
|
8
|
+
|
9
|
+
require 'bundler'
|
10
|
+
Bundler.setup
|
11
|
+
require 'test/unit'
|
12
|
+
require 'minitest/spec'
|
13
|
+
|
14
|
+
require_relative '../lib/webpage-archivist'
|
15
|
+
@@archivist = WebpageArchivist::WebpageArchivist.new
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative 'helper'
|
2
|
+
|
3
|
+
describe 'stylesheet' do
|
4
|
+
|
5
|
+
def stylesheet_file_path
|
6
|
+
current_file = File.expand_path(File.dirname(__FILE__))
|
7
|
+
File.join(current_file, 'files', 'stylesheet.css')
|
8
|
+
end
|
9
|
+
|
10
|
+
def base_url
|
11
|
+
Addressable::URI.parse('http://my.example.com')
|
12
|
+
end
|
13
|
+
|
14
|
+
def parse_stylesheet
|
15
|
+
WebpageArchivist::StylesheetDocument.new(IO.read(stylesheet_file_path), base_url)
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'lists imports' do
|
19
|
+
imports = []
|
20
|
+
stylesheet = parse_stylesheet
|
21
|
+
stylesheet.each_import do |i|
|
22
|
+
imports << i
|
23
|
+
nil
|
24
|
+
end
|
25
|
+
imports.length.must_equal 2
|
26
|
+
imports[0].must_equal 'relative.css'
|
27
|
+
imports[1].must_equal 'http://absolute.net/stylesheet.css'
|
28
|
+
stylesheet.to_css.must_include '@import url("relative.css");'
|
29
|
+
stylesheet.to_css.must_include '@import url("http://absolute.net/stylesheet.css");'
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'list images' do
|
33
|
+
images = []
|
34
|
+
stylesheet = parse_stylesheet
|
35
|
+
stylesheet.each_image do |image|
|
36
|
+
images << image
|
37
|
+
nil
|
38
|
+
end
|
39
|
+
images.length.must_equal 3
|
40
|
+
images[0].must_equal 'relative1.jpg'
|
41
|
+
images[1].must_equal 'relative2.jpg'
|
42
|
+
images[2].must_equal 'http://absolute.net/absolute.jpg'
|
43
|
+
stylesheet.to_css.must_include 'url("relative1.jpg");'
|
44
|
+
stylesheet.to_css.must_include 'url("relative2.jpg");'
|
45
|
+
stylesheet.to_css.must_include 'url("http://absolute.net/absolute.jpg");'
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "webpage-archivist/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = 'webpage-archivist'
|
7
|
+
s.version = WebpageArchivist::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Julien Kirch"]
|
10
|
+
s.homepage = 'https://github.com/archiloque/webpage-archivist'
|
11
|
+
s.summary = 'An utility to archive webpages through time'
|
12
|
+
s.description = s.summary
|
13
|
+
|
14
|
+
s.rubyforge_project = s.name
|
15
|
+
|
16
|
+
s.has_rdoc = true
|
17
|
+
s.extra_rdoc_files = ['README.rdoc']
|
18
|
+
s.rdoc_options = ['--main', 'README.rdoc']
|
19
|
+
|
20
|
+
s.add_runtime_dependency 'andand', '~> 1.3.1'
|
21
|
+
s.add_runtime_dependency 'sequel', '~> 3.25'
|
22
|
+
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
|
23
|
+
s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
|
24
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.5'
|
25
|
+
s.add_runtime_dependency 'addressable', '~> 2.2.6'
|
26
|
+
s.add_runtime_dependency 'css_parser', '~> 1.1.9'
|
27
|
+
s.add_runtime_dependency 'grit', '~> 2.4.1'
|
28
|
+
s.add_runtime_dependency 'websnap', '~> 0.1.3'
|
29
|
+
s.add_runtime_dependency 'mini_magick', '~> 3.3'
|
30
|
+
s.add_runtime_dependency 'mime-types', '~> 1.16'
|
31
|
+
|
32
|
+
s.add_development_dependency 'sqlite3', '~> 1.3.3'
|
33
|
+
|
34
|
+
s.files = `git ls-files`.split("\n")
|
35
|
+
s.test_files = `git ls-files -- test/*`.split("\n")
|
36
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
end
|
metadata
ADDED
@@ -0,0 +1,284 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webpage-archivist
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Julien Kirch
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-08-10 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: andand
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ~>
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 25
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 3
|
32
|
+
- 1
|
33
|
+
version: 1.3.1
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: sequel
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 53
|
45
|
+
segments:
|
46
|
+
- 3
|
47
|
+
- 25
|
48
|
+
version: "3.25"
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: eventmachine
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ~>
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 62196357
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
63
|
+
- 0
|
64
|
+
- beta
|
65
|
+
- 3
|
66
|
+
version: 1.0.0.beta.3
|
67
|
+
type: :runtime
|
68
|
+
version_requirements: *id003
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: em-http-request
|
71
|
+
prerelease: false
|
72
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
hash: 62196363
|
78
|
+
segments:
|
79
|
+
- 1
|
80
|
+
- 0
|
81
|
+
- 0
|
82
|
+
- beta
|
83
|
+
- 4
|
84
|
+
version: 1.0.0.beta.4
|
85
|
+
type: :runtime
|
86
|
+
version_requirements: *id004
|
87
|
+
- !ruby/object:Gem::Dependency
|
88
|
+
name: nokogiri
|
89
|
+
prerelease: false
|
90
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ~>
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
hash: 5
|
96
|
+
segments:
|
97
|
+
- 1
|
98
|
+
- 5
|
99
|
+
version: "1.5"
|
100
|
+
type: :runtime
|
101
|
+
version_requirements: *id005
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: addressable
|
104
|
+
prerelease: false
|
105
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 11
|
111
|
+
segments:
|
112
|
+
- 2
|
113
|
+
- 2
|
114
|
+
- 6
|
115
|
+
version: 2.2.6
|
116
|
+
type: :runtime
|
117
|
+
version_requirements: *id006
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: css_parser
|
120
|
+
prerelease: false
|
121
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ~>
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
hash: 1
|
127
|
+
segments:
|
128
|
+
- 1
|
129
|
+
- 1
|
130
|
+
- 9
|
131
|
+
version: 1.1.9
|
132
|
+
type: :runtime
|
133
|
+
version_requirements: *id007
|
134
|
+
- !ruby/object:Gem::Dependency
|
135
|
+
name: grit
|
136
|
+
prerelease: false
|
137
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ~>
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
hash: 29
|
143
|
+
segments:
|
144
|
+
- 2
|
145
|
+
- 4
|
146
|
+
- 1
|
147
|
+
version: 2.4.1
|
148
|
+
type: :runtime
|
149
|
+
version_requirements: *id008
|
150
|
+
- !ruby/object:Gem::Dependency
|
151
|
+
name: websnap
|
152
|
+
prerelease: false
|
153
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
154
|
+
none: false
|
155
|
+
requirements:
|
156
|
+
- - ~>
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
hash: 29
|
159
|
+
segments:
|
160
|
+
- 0
|
161
|
+
- 1
|
162
|
+
- 3
|
163
|
+
version: 0.1.3
|
164
|
+
type: :runtime
|
165
|
+
version_requirements: *id009
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: mini_magick
|
168
|
+
prerelease: false
|
169
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
170
|
+
none: false
|
171
|
+
requirements:
|
172
|
+
- - ~>
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
hash: 1
|
175
|
+
segments:
|
176
|
+
- 3
|
177
|
+
- 3
|
178
|
+
version: "3.3"
|
179
|
+
type: :runtime
|
180
|
+
version_requirements: *id010
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: mime-types
|
183
|
+
prerelease: false
|
184
|
+
requirement: &id011 !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ~>
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
hash: 47
|
190
|
+
segments:
|
191
|
+
- 1
|
192
|
+
- 16
|
193
|
+
version: "1.16"
|
194
|
+
type: :runtime
|
195
|
+
version_requirements: *id011
|
196
|
+
- !ruby/object:Gem::Dependency
|
197
|
+
name: sqlite3
|
198
|
+
prerelease: false
|
199
|
+
requirement: &id012 !ruby/object:Gem::Requirement
|
200
|
+
none: false
|
201
|
+
requirements:
|
202
|
+
- - ~>
|
203
|
+
- !ruby/object:Gem::Version
|
204
|
+
hash: 29
|
205
|
+
segments:
|
206
|
+
- 1
|
207
|
+
- 3
|
208
|
+
- 3
|
209
|
+
version: 1.3.3
|
210
|
+
type: :development
|
211
|
+
version_requirements: *id012
|
212
|
+
description: An utility to archive webpages through time
|
213
|
+
email:
|
214
|
+
executables: []
|
215
|
+
|
216
|
+
extensions: []
|
217
|
+
|
218
|
+
extra_rdoc_files:
|
219
|
+
- README.rdoc
|
220
|
+
files:
|
221
|
+
- .gitignore
|
222
|
+
- Gemfile
|
223
|
+
- README.rdoc
|
224
|
+
- Rakefile
|
225
|
+
- lib/webpage-archivist.rb
|
226
|
+
- lib/webpage-archivist/extracter.rb
|
227
|
+
- lib/webpage-archivist/fetcher/element_request.rb
|
228
|
+
- lib/webpage-archivist/fetcher/fetcher.rb
|
229
|
+
- lib/webpage-archivist/fetcher/requests_plumber.rb
|
230
|
+
- lib/webpage-archivist/fetcher/stylesheet_request.rb
|
231
|
+
- lib/webpage-archivist/fetcher/thread-pool.rb
|
232
|
+
- lib/webpage-archivist/fetcher/webpage_request.rb
|
233
|
+
- lib/webpage-archivist/html_document.rb
|
234
|
+
- lib/webpage-archivist/migrations.rb
|
235
|
+
- lib/webpage-archivist/models.rb
|
236
|
+
- lib/webpage-archivist/patches.rb
|
237
|
+
- lib/webpage-archivist/snapshoter.rb
|
238
|
+
- lib/webpage-archivist/stylesheet_document.rb
|
239
|
+
- lib/webpage-archivist/version.rb
|
240
|
+
- lib/webpage-archivist/webpage-archivist.rb
|
241
|
+
- test/crud_test.rb
|
242
|
+
- test/files/stylesheet.css
|
243
|
+
- test/helper.rb
|
244
|
+
- test/stylesheet_test.rb
|
245
|
+
- webpage-archivist.gemspec
|
246
|
+
homepage: https://github.com/archiloque/webpage-archivist
|
247
|
+
licenses: []
|
248
|
+
|
249
|
+
post_install_message:
|
250
|
+
rdoc_options:
|
251
|
+
- --main
|
252
|
+
- README.rdoc
|
253
|
+
require_paths:
|
254
|
+
- lib
|
255
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
256
|
+
none: false
|
257
|
+
requirements:
|
258
|
+
- - ">="
|
259
|
+
- !ruby/object:Gem::Version
|
260
|
+
hash: 3
|
261
|
+
segments:
|
262
|
+
- 0
|
263
|
+
version: "0"
|
264
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
265
|
+
none: false
|
266
|
+
requirements:
|
267
|
+
- - ">="
|
268
|
+
- !ruby/object:Gem::Version
|
269
|
+
hash: 3
|
270
|
+
segments:
|
271
|
+
- 0
|
272
|
+
version: "0"
|
273
|
+
requirements: []
|
274
|
+
|
275
|
+
rubyforge_project: webpage-archivist
|
276
|
+
rubygems_version: 1.8.5
|
277
|
+
signing_key:
|
278
|
+
specification_version: 3
|
279
|
+
summary: An utility to archive webpages through time
|
280
|
+
test_files:
|
281
|
+
- test/crud_test.rb
|
282
|
+
- test/files/stylesheet.css
|
283
|
+
- test/helper.rb
|
284
|
+
- test/stylesheet_test.rb
|