webpage-archivist 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/README.rdoc +50 -0
- data/Rakefile +10 -0
- data/lib/webpage-archivist.rb +20 -0
- data/lib/webpage-archivist/extracter.rb +31 -0
- data/lib/webpage-archivist/fetcher/element_request.rb +126 -0
- data/lib/webpage-archivist/fetcher/fetcher.rb +83 -0
- data/lib/webpage-archivist/fetcher/requests_plumber.rb +140 -0
- data/lib/webpage-archivist/fetcher/stylesheet_request.rb +112 -0
- data/lib/webpage-archivist/fetcher/thread-pool.rb +101 -0
- data/lib/webpage-archivist/fetcher/webpage_request.rb +197 -0
- data/lib/webpage-archivist/html_document.rb +66 -0
- data/lib/webpage-archivist/migrations.rb +93 -0
- data/lib/webpage-archivist/models.rb +190 -0
- data/lib/webpage-archivist/patches.rb +63 -0
- data/lib/webpage-archivist/snapshoter.rb +77 -0
- data/lib/webpage-archivist/stylesheet_document.rb +129 -0
- data/lib/webpage-archivist/version.rb +3 -0
- data/lib/webpage-archivist/webpage-archivist.rb +79 -0
- data/test/crud_test.rb +28 -0
- data/test/files/stylesheet.css +14 -0
- data/test/helper.rb +15 -0
- data/test/stylesheet_test.rb +48 -0
- data/webpage-archivist.gemspec +38 -0
- metadata +284 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
+A utility to archive webpages through time.
+
+Takes snapshots and makes incremental backups of webpage assets so you can follow the pages' evolution through time.
+
+* Assets are stored in a git repository to simplify incremental storage and easy retrieval
+* Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
+* The list of webpages and archive instances is stored in an SQL database
+* Some caching data is stored in the same database
+
+= Required tools:
+
+* An SQL database supported by Sequel[http://sequel.rubyforge.org/]
+* Git[http://git-scm.com/]
+* ImageMagick[http://www.imagemagick.org/script/index.php]
+* wkhtmltoimage[http://code.google.com/p/wkhtmltopdf/] (works, though not so well, on Mac OS; prefer Linux for real usage)
+
+= Installation
+
+* Install the required tools
+* Install the gem
+* All configuration items have default values; have a look below if you want to customize them (the default database configuration requires the sqlite3 gem)
+* Use it! All the required files and the database structure will be created on first call
+
+= API
+
+The public API is provided by WebpageArchivist::WebpageArchivist, example:
+
+  require 'webpage-archivist'
+  archivist = WebpageArchivist::WebpageArchivist.new
+  webpage = archivist.add_webpage('http://www.nytimes.com/', 'The New York Times')
+  archivist.fetch_webpages [webpage.id]
+
+Models are available in the lib/webpage-archivist/models.rb file; have a look at the Sequel[http://sequel.rubyforge.org/] API if you want to query them.
+
+= Configuration
+
+Basic configuration is done through environment variables:
+
+* +DATABASE_URL+ : database URL, defaults to <tt>sqlite://#{Dir.pwd}/webpage-archivist.sqlite3</tt>; the syntax is described here[http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html], remember to add the required database gem
+* +ARCHIVIST_ASSETS_PATH+ : path where the assets are stored, defaults to +./archivist_assets+
+* +ARCHIVIST_SNAPSHOTS_PATH+ : path where the snapshots and thumbnails are stored, defaults to +./archivist_snapshots+
+* +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
+* +IMAGE_MAGICK_PATH+ : path to the ImageMagick executables if they aren't in the PATH
+
+Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
+
+To enable debugging use
+
+  WebpageArchivist.log = true
+
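For orientation, the pieces described in the README combine into one end-to-end session. This is a minimal sketch based only on the README above: the paths and page URL are illustrative, the default sqlite3-backed database is assumed, and the environment variables are set before the gem is required on the assumption they are read at load time.

  # Minimal sketch; paths and the page URL are illustrative.
  ENV['DATABASE_URL'] = "sqlite://#{Dir.pwd}/webpage-archivist.sqlite3"
  ENV['ARCHIVIST_ASSETS_PATH'] = './archivist_assets'
  ENV['ARCHIVIST_SNAPSHOTS_PATH'] = './archivist_snapshots'

  require 'webpage-archivist'

  WebpageArchivist.log = true # enable debugging output

  archivist = WebpageArchivist::WebpageArchivist.new
  webpage = archivist.add_webpage('http://www.nytimes.com/', 'The New York Times')
  archivist.fetch_webpages [webpage.id]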
data/Rakefile
ADDED
data/lib/webpage-archivist.rb
ADDED
@@ -0,0 +1,20 @@
+require_relative 'webpage-archivist/migrations'
+
+require_relative 'webpage-archivist/patches'
+
+require_relative 'webpage-archivist/html_document'
+require_relative 'webpage-archivist/stylesheet_document'
+
+require_relative 'webpage-archivist/webpage-archivist'
+
+require_relative 'webpage-archivist/models'
+
+require_relative 'webpage-archivist/extracter'
+
+require_relative 'webpage-archivist/snapshoter'
+
+require_relative 'webpage-archivist/fetcher/fetcher'
+require_relative 'webpage-archivist/fetcher/requests_plumber'
+require_relative 'webpage-archivist/fetcher/element_request'
+require_relative 'webpage-archivist/fetcher/stylesheet_request'
+require_relative 'webpage-archivist/fetcher/webpage_request'
data/lib/webpage-archivist/extracter.rb
ADDED
@@ -0,0 +1,31 @@
+require 'addressable/uri'
+
+module WebpageArchivist
+
+  # Extract webpages' content
+  module Extracter
+
+    # Write the full content of a webpage instance into a zip file
+    # id:: the instance id
+    # file:: the file to write to
+    def self.instance_content id, file
+      @instance = Instance.filter(:id => id).first
+      unless @instance
+        raise "Instance [#{id}] not found"
+      end
+      repository = @instance.webpage.repository
+      commit = repository.log.find { |l| l.message == @instance.commit_timestamp }
+
+      unless file.end_with? '.zip'
+        file << '.zip'
+      end
+
+      if commit
+        repository.archive_zip commit.id, file
+      else
+        raise "Version [#{@instance.commit_timestamp}] not found"
+      end
+    end
+
+  end
+end
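Extracter.instance_content above resolves an instance, finds the git commit whose message matches the instance's commit timestamp, and archives that commit as a zip. A hedged usage sketch; the instance id and file name are hypothetical:

  require 'webpage-archivist'

  # Export the assets of instance 42 as a zip file; '.zip' is appended
  # automatically when missing. Raises if the instance or commit is absent.
  WebpageArchivist::Extracter.instance_content 42, 'nytimes-backup'
  # => writes nytimes-backup.zip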
data/lib/webpage-archivist/fetcher/element_request.rb
ADDED
@@ -0,0 +1,126 @@
+module WebpageArchivist::Fetcher
+
+  # Requesting a webpage element
+  class ElementRequest
+
+    attr_reader :element, :uri
+
+    attr_accessor :status
+
+    def initialize request_element_type, uri, plumber
+      ::WebpageArchivist.debug "Creating request [#{uri}]" if ::WebpageArchivist.log
+
+      @requesters = []
+      @request_element_type = request_element_type
+      @uri = uri
+      @element = request_element_type.filter(:uri => uri).first
+      @status = :fetching
+      @plumber = plumber
+
+      @requesters_notified = false
+    end
+
+    # Start the request
+    # Not in initialize so we can register the request before executing it and throttle the number of connections
+    # retries:: number of retries in case of error
+    def start retries = 3
+      ::WebpageArchivist.debug "Starting request [#{uri}]" if ::WebpageArchivist.log
+      head = {'accept-encoding' => 'gzip, compressed'}
+      if element
+        head['If-Modified-Since'] = element.last_modified
+        @plumber.register_file_name element.file_name
+      end
+
+      # Don't overflow the servers or they will kick us out
+      http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30, :head => head
+      http.callback do
+        if ([500, 503].include? http.response_header.status) && (retries > 0)
+          start(retries - 1)
+        else
+          will_process_response http
+        end
+      end
+      http.errback do
+        ::WebpageArchivist.debug "[#{@uri}] errback" if ::WebpageArchivist.log
+        if retries > 0
+          start(retries - 1)
+        else
+          will_process_response http
+        end
+      end
+    end
+
+    # Call process_response and ensure management is done
+    def will_process_response http
+      begin
+        process_response http
+      rescue Exception => e
+        ::WebpageArchivist.error e if ::WebpageArchivist.log
+        notify_requesters
+      ensure
+        @plumber.request_ended
+      end
+    end
+
+    # Add a requester to be notified when the request is over
+    def add_requester requester
+      @requesters << requester
+    end
+
+    # Process the response
+    def process_response http
+      result_code = http.response_header.status
+      ::WebpageArchivist.debug "[#{@uri}] returned #{result_code}" if ::WebpageArchivist.log
+
+      if [304, 408, 0].include? result_code
+        # Not changed or connection error
+        if element
+          element.update(:last_fetched => DateTime.now)
+        end
+        content_not_changed http
+      elsif result_code == 200
+        if element
+          element.update(:last_fetched => DateTime.now,
+                         :last_modified => http.response_header.last_modified || DateTime.now.rfc2822)
+        else
+          extension = @request_element_type.extention(@uri, http.response_header[EventMachine::HttpClient::CONTENT_TYPE])
+          @element = @request_element_type.create(:webpage => @plumber.webpage,
+                                                  :uri => @uri,
+                                                  :file_hash => @plumber.get_hash(@request_element_type, @uri, extension),
+                                                  :extension => extension,
+                                                  :last_fetched => DateTime.now,
+                                                  :last_modified => @plumber.last_modified(http))
+        end
+        save_content_end_request http
+      else
+        self.status = :over
+        notify_requesters
+      end
+    end
+
+    # Content has not changed since last fetch
+    def content_not_changed http
+      self.status = :over
+      notify_requesters
+    end
+
+    # Content has changed: save the content and end the request
+    def save_content_end_request http
+      ::WebpageArchivist.debug "[#{@uri}] writing content to #{element.file_name}" if ::WebpageArchivist.log
+
+      element.save_content http.response
+      self.status = :over
+      notify_requesters
+    end
+
+    def notify_requesters
+      unless @requesters_notified
+        ::WebpageArchivist.debug "[#{@uri}] notify #{@requesters.length}" if ::WebpageArchivist.log
+        @requesters.each { |r| r.request_over(@uri) }
+        @requesters_notified = true
+      end
+    end
+
+  end
+
+end
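ElementRequest#start leans on em-http-request's two completion paths: callback fires for any HTTP response (including 5xx), errback only for transport failures, and both paths retry by calling start again with a decremented counter. A standalone sketch of the same pattern, mirroring the options the gem itself passes to get; the URL is illustrative:

  require 'eventmachine'
  require 'em-http-request'

  # Retry on 500/503 responses and on transport errors, up to `retries` attempts.
  def fetch uri, retries = 3
    http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30
    http.callback do
      if ([500, 503].include? http.response_header.status) && (retries > 0)
        fetch(uri, retries - 1)
      else
        puts "[#{uri}] finished with status #{http.response_header.status}"
        EventMachine.stop
      end
    end
    http.errback do
      if retries > 0
        fetch(uri, retries - 1)
      else
        puts "[#{uri}] gave up after transport errors"
        EventMachine.stop
      end
    end
  end

  EventMachine.run { fetch 'http://example.com/' }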
data/lib/webpage-archivist/fetcher/fetcher.rb
ADDED
@@ -0,0 +1,83 @@
+require 'eventmachine'
+require_relative 'thread-pool'
+
+# Module in charge of fetching pages
+module WebpageArchivist::Fetcher
+
+  SEMAPHORE = Mutex.new
+
+  # Fetch several webpages, return a hash indexed by the ids holding the corresponding Instances or HTTP result codes
+  # (may be existing instances if the pages haven't changed)
+  def self.fetch_webpages ids
+    if ids.empty?
+      []
+    else
+      SEMAPHORE.synchronize do
+        @fetcher_watcher = FetcherWatcher.new
+        EventMachine.run do
+          WebpageArchivist::Webpage.filter(:id => ids).each do |webpage|
+            @fetcher_watcher.add_request WebpageRequest.new(webpage, @fetcher_watcher)
+          end
+          @fetcher_watcher.wait
+        end
+
+        result = {}
+        @fetcher_watcher.requests.each do |webpage_request|
+          result[webpage_request.webpage.id] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
+        end
+        result
+      end
+    end
+  end
+
+  # Wait for callbacks for webpages' requests, stop EventMachine when all requests are over and manage the snapshot generation
+  class FetcherWatcher
+
+    attr_reader :requests
+
+    def initialize
+      @waiting_requests = 0
+      @status = :starting
+      @requests = []
+      @thread_pool = Pool.new 1
+    end
+
+    # Start to wait
+    def wait
+      @status = :waiting
+      if @waiting_requests == 0
+        end_watcher
+      end
+    end
+
+    # Add a request to wait for
+    def add_request request
+      @waiting_requests += 1
+      @requests << request
+    end
+
+    # A request is over
+    # request:: the request
+    # ok:: indicates if the request went ok, in which case ask for a snapshot
+    def end_request request, ok
+      @waiting_requests -= 1
+      if ok && request.instance
+        @thread_pool.schedule do
+          ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
+        end
+      end
+      if (@status == :waiting) && (@waiting_requests <= 0)
+        end_watcher
+      end
+    end
+
+    # End the watch
+    def end_watcher
+      EM.stop
+      @thread_pool.shutdown
+    end
+
+
+  end
+
+end
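Fetcher.fetch_webpages above returns a hash keyed by webpage id whose values are either Instance records or bare HTTP result codes. A sketch of consuming that result; the ids are illustrative:

  # Distinguish archived instances from failed fetches.
  results = WebpageArchivist::Fetcher.fetch_webpages [1, 2]
  results.each do |id, value|
    if value.is_a? WebpageArchivist::Instance
      puts "webpage #{id}: archived as instance #{value.id}"
    else
      puts "webpage #{id}: fetch failed with result code #{value}"
    end
  end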
data/lib/webpage-archivist/fetcher/requests_plumber.rb
ADDED
@@ -0,0 +1,140 @@
+require 'digest/sha1'
+require 'andand'
+
+module WebpageArchivist::Fetcher
+  # Contains the plumbing for the fetching code
+  class RequestsPlumber
+
+    MAX_RUNNING_REQUESTS = (ENV['ARCHIVIST_MAX_RUNNING_REQUESTS'].andand.to_i || 20)
+
+    attr_reader :requests_files
+
+    @@next_tick = false
+    @@waiting_requests = []
+    @@running_requests = 0
+
+    def initialize webpage_request
+      @requests = {}
+      @requests_hashes = Set.new
+      @requests_files = Set.new
+      @webpage_request = webpage_request
+      @@waiting_requests << webpage_request
+      RequestsPlumber.new_request
+    end
+
+    # The page being fetched
+    def webpage
+      @webpage_request.webpage
+    end
+
+    # Access an element request by its uri
+    def [] key
+      @requests[key]
+    end
+
+    # Request an element to be fetched
+    # When the fetch is over, request_over will be called on the requester with the uri
+    # requester:: the element doing the request
+    # request_element_type:: the type of the requested element
+    # uri:: the requested uri
+    def request_element requester, request_element_type, uri
+      ::WebpageArchivist.debug "Requesting [#{uri}] for [#{requester.uri}]" if ::WebpageArchivist.log
+
+      if request = @requests[uri]
+        if request.status == :over
+          ::WebpageArchivist.debug "Request already done" if ::WebpageArchivist.log
+          requester.request_over uri
+        else
+          ::WebpageArchivist.debug "Adding to requesters" if ::WebpageArchivist.log
+          request.add_requester requester
+        end
+      else
+        ::WebpageArchivist.debug "Creating new request" if ::WebpageArchivist.log
+        if request_element_type == WebpageArchivist::Stylesheet
+          request = StyleSheetRequest.new(uri, self)
+        else
+          request = ElementRequest.new(request_element_type, uri, self)
+        end
+        @requests[uri] = request
+        request.add_requester requester
+
+        @@waiting_requests << request
+
+        # try registering for the next tick
+        RequestsPlumber.new_request
+      end
+    end
+
+    # Notify that a request has ended so another one can start
+    def request_ended
+      @@running_requests -= 1
+      unless @@waiting_requests.empty?
+        RequestsPlumber.new_request
+      end
+    end
+
+    # Get the hash corresponding to a uri and make sure there is no collision
+    def get_hash type, uri, extension
+      file_hash = Digest::SHA1.hexdigest(uri)
+      if @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
+        i = 0
+        begin
+          file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
+          i += 1
+        end while @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
+      end
+
+      @requests_files << "#{file_hash}#{extension}"
+      file_hash
+    end
+
+    # Register a filename so it is considered part of the webpage
+    def register_file_name file_name
+      @requests_files << file_name
+    end
+
+    # Get the charset of a response, may be nil
+    def response_charset http
+      type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
+      if type
+        match = /.+;\s*charset=(.+)/i.match(type)
+        if match
+          match[1].upcase
+        end
+      end
+    end
+
+    def last_modified http
+      http.response_header.last_modified || DateTime.now.rfc2822
+    end
+
+    private
+
+    # Called when a new request has been added
+    # Register a callback for the next tick
+    def self.new_request
+      if (!@@next_tick) && (@@running_requests < MAX_RUNNING_REQUESTS)
+        EventMachine.next_tick do
+          RequestsPlumber.next_tick
+        end
+        @@next_tick = true
+      end
+    end
+
+    # Start the first waiting request
+    def self.next_tick
+      unless @@waiting_requests.empty?
+        @@waiting_requests.shift.start
+        @@running_requests += 1
+      end
+
+      @@next_tick = false
+
+      unless @@waiting_requests.empty?
+        RequestsPlumber.new_request
+      end
+    end
+
+  end
+
+end
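RequestsPlumber#get_hash above derives each asset's file name from the SHA1 of its uri and, on a collision with an already-used digest, rehashes the uri with an appended counter until a free digest is found. The scheme in isolation, as a self-contained sketch using only the standard library:

  require 'digest/sha1'
  require 'set'

  # Standalone illustration of the get_hash collision scheme.
  taken = Set.new
  get_hash = lambda do |uri|
    file_hash = Digest::SHA1.hexdigest(uri)
    i = 0
    while taken.include?(file_hash)
      file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
      i += 1
    end
    taken << file_hash
    file_hash
  end

  first  = get_hash.call('http://example.com/style.css')
  second = get_hash.call('http://example.com/style.css') # forced collision
  puts first == second # => false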