webpage-archivist 0.0.1
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/README.rdoc +50 -0
- data/Rakefile +10 -0
- data/lib/webpage-archivist.rb +20 -0
- data/lib/webpage-archivist/extracter.rb +31 -0
- data/lib/webpage-archivist/fetcher/element_request.rb +126 -0
- data/lib/webpage-archivist/fetcher/fetcher.rb +83 -0
- data/lib/webpage-archivist/fetcher/requests_plumber.rb +140 -0
- data/lib/webpage-archivist/fetcher/stylesheet_request.rb +112 -0
- data/lib/webpage-archivist/fetcher/thread-pool.rb +101 -0
- data/lib/webpage-archivist/fetcher/webpage_request.rb +197 -0
- data/lib/webpage-archivist/html_document.rb +66 -0
- data/lib/webpage-archivist/migrations.rb +93 -0
- data/lib/webpage-archivist/models.rb +190 -0
- data/lib/webpage-archivist/patches.rb +63 -0
- data/lib/webpage-archivist/snapshoter.rb +77 -0
- data/lib/webpage-archivist/stylesheet_document.rb +129 -0
- data/lib/webpage-archivist/version.rb +3 -0
- data/lib/webpage-archivist/webpage-archivist.rb +79 -0
- data/test/crud_test.rb +28 -0
- data/test/files/stylesheet.css +14 -0
- data/test/helper.rb +15 -0
- data/test/stylesheet_test.rb +48 -0
- data/webpage-archivist.gemspec +38 -0
- metadata +284 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@

A utility to archive webpages through time.

Takes snapshots and makes incremental backups of webpage assets so you can follow pages' evolution through time.

* Assets are stored in a git repository to simplify incremental storage and allow easy retrieval
* Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
* The list of webpages and archived instances is stored in an SQL database
* Some caching data is stored in the same database

= Required tools

* An SQL database supported by Sequel[http://sequel.rubyforge.org/]
* Git[http://git-scm.com/]
* ImageMagick[http://www.imagemagick.org/script/index.php]
* wkhtmltoimage[http://code.google.com/p/wkhtmltopdf/] (works, though not so well, on Mac OS; prefer Linux for real usage)

= Installation

* Install the required tools
* Install the gem
* All configuration items have default values; have a look below if you want to customize them (the default database configuration requires the sqlite3 gem)
* Use it! All the required files and the database structure will be created on first call

= API

The public API is provided by WebpageArchivist::WebpageArchivist, for example:

  require 'webpage-archivist'
  archivist = WebpageArchivist::WebpageArchivist.new
  webpage = archivist.add_webpage('http://www.nytimes.com/', 'The New York Times')
  archivist.fetch_webpages [webpage.id]

Models are available in the lib/webpage-archivist/models.rb file; have a look at the Sequel[http://sequel.rubyforge.org/] API if you want to query them.

= Configuration

Basic configuration is done through environment variables:

* +DATABASE_URL+ : database URL, defaults to <tt>sqlite://#{Dir.pwd}/webpage-archivist.sqlite3</tt>; the syntax is described here[http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html], remember to add the required database gem
* +ARCHIVIST_ASSETS_PATH+ : path where the assets are stored, defaults to +./archivist_assets+
* +ARCHIVIST_SNAPSHOTS_PATH+ : path where the snapshots and thumbnails are stored, defaults to +./archivist_snapshots+
* +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not critical since requests are run through EventMachine[http://rubyeventmachine.com/]), defaults to 20
* +IMAGE_MAGICK_PATH+ : path to the ImageMagick executables if they aren't in the PATH
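For example, the variables can be set before the gem is required (the values below are only illustrative):

  ENV['DATABASE_URL']             = 'sqlite:///var/archivist/webpage-archivist.sqlite3'
  ENV['ARCHIVIST_ASSETS_PATH']    = '/var/archivist/assets'
  ENV['ARCHIVIST_SNAPSHOTS_PATH'] = '/var/archivist/snapshots'

  require 'webpage-archivist'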
Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.

To enable debugging use

  WebpageArchivist.log = true

data/Rakefile
ADDED

data/lib/webpage-archivist.rb
ADDED
@@ -0,0 +1,20 @@

require_relative 'webpage-archivist/migrations'

require_relative 'webpage-archivist/patches'

require_relative 'webpage-archivist/html_document'
require_relative 'webpage-archivist/stylesheet_document'

require_relative 'webpage-archivist/webpage-archivist'

require_relative 'webpage-archivist/models'

require_relative 'webpage-archivist/extracter'

require_relative 'webpage-archivist/snapshoter'

require_relative 'webpage-archivist/fetcher/fetcher'
require_relative 'webpage-archivist/fetcher/requests_plumber'
require_relative 'webpage-archivist/fetcher/element_request'
require_relative 'webpage-archivist/fetcher/stylesheet_request'
require_relative 'webpage-archivist/fetcher/webpage_request'
data/lib/webpage-archivist/extracter.rb
ADDED
@@ -0,0 +1,31 @@

require 'addressable/uri'

module WebpageArchivist

  # Extract webpages' content
  module Extracter

    # Write the full content of a webpage instance into a zip file
    # id:: the instance id
    # file:: the file to write to
    def self.instance_content id, file
      @instance = Instance.filter(:id => id).first
      unless @instance
        raise "Instance [#{id}] not found"
      end
      repository = @instance.webpage.repository
      commit = repository.log.find { |l| l.message == @instance.commit_timestamp }

      unless file.end_with? '.zip'
        file << '.zip'
      end

      if commit
        repository.archive_zip commit.id, file
      else
        raise "Version [#{@instance.commit_timestamp}] not found"
      end
    end

  end
end
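Hypothetical usage of the extracter (the instance id and target file name are made up):

  require 'webpage-archivist'

  # Writes the instance's assets to nytimes-snapshot.zip ('.zip' is appended when missing)
  WebpageArchivist::Extracter.instance_content 42, 'nytimes-snapshot'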
data/lib/webpage-archivist/fetcher/element_request.rb
ADDED
@@ -0,0 +1,126 @@

require 'em-http'

module WebpageArchivist::Fetcher

  # Requesting a single element of a webpage
  class ElementRequest

    attr_reader :element, :uri

    attr_accessor :status

    def initialize request_element_type, uri, plumber
      ::WebpageArchivist.debug "Creating request [#{uri}]" if ::WebpageArchivist.log

      @requesters = []
      @request_element_type = request_element_type
      @uri = uri
      @element = request_element_type.filter(:uri => uri).first
      @status = :fetching
      @plumber = plumber

      @requesters_notified = false
    end

    # Start the request
    # Not in initialize so we can register the request before executing it and throttle the number of connections
    # retries:: number of retries in case of error
    def start retries = 3
      ::WebpageArchivist.debug "Starting request [#{uri}]" if ::WebpageArchivist.log
      head = {'accept-encoding' => 'gzip, compressed'}
      if element
        head['If-Modified-Since'] = element.last_modified
        @plumber.register_file_name element.file_name
      end

      # Don't overflow the servers or they will kick us out
      http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30, :head => head
      http.callback do
        if ([500, 503].include? http.response_header.status) && (retries > 0)
          start(retries - 1)
        else
          will_process_response http
        end
      end
      http.errback do
        ::WebpageArchivist.debug "[#{@uri}] errback" if ::WebpageArchivist.log
        if retries > 0
          start(retries - 1)
        else
          will_process_response http
        end
      end
    end

    # Call process_response and ensure management is done
    def will_process_response http
      begin
        process_response http
      rescue Exception => e
        ::WebpageArchivist.error e if ::WebpageArchivist.log
        notify_requesters
      ensure
        @plumber.request_ended
      end
    end

    # Add a requester to be notified when the request is over
    def add_requester requester
      @requesters << requester
    end

    # Process the response
    def process_response http
      result_code = http.response_header.status
      ::WebpageArchivist.debug "[#{@uri}] returned #{result_code}" if ::WebpageArchivist.log

      if [304, 408, 0].include? result_code
        # Not changed or connection error
        if element
          element.update(:last_fetched => DateTime.now)
        end
        content_not_changed http
      elsif result_code == 200
        if element
          element.update(:last_fetched => DateTime.now,
                         :last_modified => http.response_header.last_modified || DateTime.now.rfc2822)
        else
          extension = @request_element_type.extention(@uri, http.response_header[EventMachine::HttpClient::CONTENT_TYPE])
          @element = @request_element_type.create(:webpage => @plumber.webpage,
                                                  :uri => @uri,
                                                  :file_hash => @plumber.get_hash(@request_element_type, @uri, extension),
                                                  :extension => extension,
                                                  :last_fetched => DateTime.now,
                                                  :last_modified => @plumber.last_modified(http))
        end
        save_content_end_request http
      else
        self.status = :over
        notify_requesters
      end
    end

    # Content has not changed since last fetch
    def content_not_changed http
      self.status = :over
      notify_requesters
    end

    # Content has changed: save the content and end the request
    def save_content_end_request http
      ::WebpageArchivist.debug "[#{@uri}] writing content to #{element.file_name}" if ::WebpageArchivist.log

      element.save_content http.response
      self.status = :over
      notify_requesters
    end

    def notify_requesters
      unless @requesters_notified
        ::WebpageArchivist.debug "[#{@uri}] notify #{@requesters.length}" if ::WebpageArchivist.log
        @requesters.each { |r| r.request_over(@uri) }
        @requesters_notified = true
      end
    end

  end

end
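A minimal standalone sketch of the conditional fetch ElementRequest performs (the url and date below are illustrative); a 304 response means the archived copy is still current:

  require 'eventmachine'
  require 'em-http'

  EventMachine.run do
    head = {'If-Modified-Since' => 'Sat, 01 Jan 2011 00:00:00 GMT'}
    http = EventMachine::HttpRequest.new('http://www.example.com/').get :head => head
    http.callback do
      if http.response_header.status == 304
        puts 'Not modified: keep the archived copy'
      else
        puts "Fetched #{http.response.length} bytes"
      end
      EventMachine.stop
    end
    http.errback { EventMachine.stop }
  end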
data/lib/webpage-archivist/fetcher/fetcher.rb
ADDED
@@ -0,0 +1,83 @@

require 'eventmachine'
require_relative 'thread-pool'

# Module in charge of fetching pages
module WebpageArchivist::Fetcher

  SEMAPHORE = Mutex.new

  # Fetch several webpages, return a hash indexed by the ids holding the corresponding Instances or http result codes
  # (may be existing instances if the pages haven't changed)
  def self.fetch_webpages ids
    if ids.empty?
      []
    else
      SEMAPHORE.synchronize do
        @fetcher_watcher = FetcherWatcher.new
        EventMachine.run do
          WebpageArchivist::Webpage.filter(:id => ids).each do |webpage|
            @fetcher_watcher.add_request WebpageRequest.new(webpage, @fetcher_watcher)
          end
          @fetcher_watcher.wait
        end

        result = {}
        @fetcher_watcher.requests.each do |webpage_request|
          result[webpage_request.webpage.id] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
        end
        result
      end
    end
  end

  # Wait for callbacks for webpages' requests, stop event machine when all requests are over and manage the snapshot generation
  class FetcherWatcher

    attr_reader :requests

    def initialize
      @waiting_requests = 0
      @status = :starting
      @requests = []
      @thread_pool = Pool.new 1
    end

    # Start to wait
    def wait
      @status = :waiting
      if @waiting_requests == 0
        end_watcher
      end
    end

    # Add a request to wait for
    def add_request request
      @waiting_requests += 1
      @requests << request
    end

    # A request is over
    # request:: the request
    # ok:: indicates if the request went ok, in this case ask for a snapshot
    def end_request request, ok
      @waiting_requests -= 1
      if ok && request.instance
        @thread_pool.schedule do
          ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
        end
      end
      if (@status == :waiting) && (@waiting_requests <= 0)
        end_watcher
      end
    end

    # End the watch
    def end_watcher
      EM.stop
      @thread_pool.shutdown
    end

  end

end
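Assuming the top-level WebpageArchivist::WebpageArchivist#fetch_webpages delegates here and returns this hash unchanged (the webpage ids below are made up), the result can be consumed like this:

  require 'webpage-archivist'

  archivist = WebpageArchivist::WebpageArchivist.new
  results = archivist.fetch_webpages [1, 2]

  results.each do |id, result|
    if result.is_a? WebpageArchivist::Instance
      puts "Webpage #{id}: archived as instance #{result.id}"
    else
      puts "Webpage #{id}: fetch failed with status #{result}"
    end
  end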
data/lib/webpage-archivist/fetcher/requests_plumber.rb
ADDED
@@ -0,0 +1,140 @@

require 'digest/sha1'
require 'andand'
require 'set'

module WebpageArchivist::Fetcher
  # Contains the plumbing for the fetching code
  class RequestsPlumber

    MAX_RUNNING_REQUESTS = (ENV['ARCHIVIST_MAX_RUNNING_REQUESTS'].andand.to_i || 20)

    attr_reader :requests_files

    @@next_tick = false
    @@waiting_requests = []
    @@running_requests = 0

    def initialize webpage_request
      @requests = {}
      @requests_hashes = Set.new
      @requests_files = Set.new
      @webpage_request = webpage_request
      @@waiting_requests << webpage_request
      RequestsPlumber.new_request
    end

    # The page being fetched
    def webpage
      @webpage_request.webpage
    end

    # Access an element request by its uri
    def [] key
      @requests[key]
    end

    # Request an element to be fetched
    # When the fetch is over, request_over will be called on the requester with the uri
    # requester:: the element doing the request
    # request_element_type:: the type of the requested element
    # uri:: the requested uri
    def request_element requester, request_element_type, uri
      ::WebpageArchivist.debug "Requesting [#{uri}] for [#{requester.uri}]" if ::WebpageArchivist.log

      if request = @requests[uri]
        if request.status == :over
          ::WebpageArchivist.debug "Request already done" if ::WebpageArchivist.log
          requester.request_over uri
        else
          ::WebpageArchivist.debug "Adding to requesters" if ::WebpageArchivist.log
          request.add_requester requester
        end
      else
        ::WebpageArchivist.debug "Creating new request" if ::WebpageArchivist.log
        if request_element_type == WebpageArchivist::Stylesheet
          request = StyleSheetRequest.new(uri, self)
        else
          request = ElementRequest.new(request_element_type, uri, self)
        end
        @requests[uri] = request
        request.add_requester requester

        @@waiting_requests << request

        # try registering for the next tick
        RequestsPlumber.new_request
      end
    end

    # Notify that a request has ended so another one can be started
    def request_ended
      @@running_requests -= 1
      unless @@waiting_requests.empty?
        RequestsPlumber.new_request
      end
    end

    # Get the hash corresponding to a uri and make sure there is no collision
    def get_hash type, uri, extension
      file_hash = Digest::SHA1.hexdigest(uri)
      if @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
        i = 0
        begin
          file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
          i += 1
        end while @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
      end

      @requests_files << "#{file_hash}#{extension}"
      file_hash
    end

    # Register a filename so it is considered part of the webpage
    def register_file_name file_name
      @requests_files << file_name
    end

    # Get the charset of a response, may be nil
    def response_charset http
      type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
      if type
        match = /.+;\s*charset=(.+)/i.match(type)
        if match
          match[1].upcase
        end
      end
    end

    def last_modified http
      http.response_header.last_modified || DateTime.now.rfc2822
    end

    private

    # Called when a new request has been added
    # Register a callback for the next tick
    def self.new_request
      if (!@@next_tick) && (@@running_requests < MAX_RUNNING_REQUESTS)
        EventMachine.next_tick do
          RequestsPlumber.next_tick
        end
        @@next_tick = true
      end
    end

    # Start the first waiting request
    def self.next_tick
      unless @@waiting_requests.empty?
        @@waiting_requests.shift.start
        @@running_requests += 1
      end

      @@next_tick = false

      unless @@waiting_requests.empty?
        RequestsPlumber.new_request
      end
    end

  end

end
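A standalone sketch of the collision-avoidance scheme get_hash uses (the uris below are illustrative): the SHA1 of the uri names the stored file, and on a collision an integer suffix is appended to the hashed string until a free digest is found.

  require 'digest/sha1'
  require 'set'

  def unique_hash uri, taken
    file_hash = Digest::SHA1.hexdigest(uri)
    i = 0
    while taken.include?(file_hash)
      file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
      i += 1
    end
    taken << file_hash
    file_hash
  end

  taken = Set.new
  puts unique_hash('http://www.example.com/style.css', taken)
  taken << Digest::SHA1.hexdigest('http://www.example.com/other.css')
  puts unique_hash('http://www.example.com/other.css', taken)  # gets a suffixed digest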