webpage-archivist 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ .DS_Store
2
+ *.gem
3
+ *.sqlite3
4
+ .bundle
5
+ Gemfile.lock
6
+ pkg/*
7
+ webpages/*
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "http://www.rubygems.org"
2
+
3
+ gemspec
data/README.rdoc ADDED
@@ -0,0 +1,50 @@
1
+ A utility to archive webpages through time.
2
+
3
+ Takes snapshots and makes incremental backups of webpage assets so you can follow the pages' evolution through time.
4
+
5
+ * Assets are stored in a git repository to simplify incremental storage and easy retrieval
6
+ * Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
7
+ * List of webpages and archives instances are stored in an SQL database
8
+ * Some caching data is stored in the same database
9
+
10
+ = Required tools:
11
+
12
+ * An SQL database supported by Sequel[http://sequel.rubyforge.org/]
13
+ * Git[http://git-scm.com/]
14
+ * ImageMagick[http://www.imagemagick.org/script/index.php]
15
+ * wkhtmltoimage[http://code.google.com/p/wkhtmltopdf/] (works, but not so well on Mac OS; prefer Linux for real usage)
16
+
17
+ = Installation
18
+
19
+ * Install the required tools
20
+ * Install the gem
21
+ * All configuration items have default values, have a look below if you want to customize them (the default database configuration requires the sqlite3 gem)
22
+ * Use it !: all the required files and database structure will be created at first call
23
+
24
+ = API
25
+
26
+ The public API is provided by WebpageArchivist::WebpageArchivist, example:
27
+
28
+ require 'webpage-archivist'
29
+ archivist = WebpageArchivist::WebpageArchivist.new
30
+ webpage = archivist.add_webpage('http://www.nytimes.com/' , 'The New York Times')
31
+ archivist.fetch_webpages [webpage.id]
32
+
33
+ Models are available in the lib/webpage-archivist/models.rb file, have a look at the Sequel[http://sequel.rubyforge.org/] API if you want to query them.
34
+
35
+ = Configuration
36
+
37
+ Basic configuration is done through environment variables:
38
+
39
+ * +DATABASE_URL+ : database url, default to <tt>sqlite://#{Dir.pwd}/webpage-archivist.sqlite3</tt> syntax is described here[http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html], remember to add the required database gem
40
+ * +ARCHIVIST_ASSETS_PATH+ : path to store the assets, default to +./archivist_assets+
41
+ * +ARCHIVIST_SNAPSHOTS_PATH+ : path to store the thumbnail, default to +./archivist_snapshots+
42
+ * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), default to 20
43
+ * +IMAGE_MAGICK_PATH+ : path to ImageMagick executables if they aren't in the path
44
+
45
+ Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
46
+
47
+ To enable debugging use
48
+
49
+ WebpageArchivist.log= true
50
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require 'rake'
require 'rake/testtask'

# Runs every test/*_test.rb file; also wired up as the default task.
desc "Run all tests"
Rake::TestTask.new do |task|
  task.test_files = FileList['test/*_test.rb']
  task.verbose = true
end

task :default => :test
@@ -0,0 +1,20 @@
1
+ require_relative 'webpage-archivist/migrations'
2
+
3
+ require_relative 'webpage-archivist/patches'
4
+
5
+ require_relative 'webpage-archivist/html_document'
6
+ require_relative 'webpage-archivist/stylesheet_document'
7
+
8
+ require_relative 'webpage-archivist/webpage-archivist'
9
+
10
+ require_relative 'webpage-archivist/models'
11
+
12
+ require_relative 'webpage-archivist/extracter'
13
+
14
+ require_relative 'webpage-archivist/snapshoter'
15
+
16
+ require_relative 'webpage-archivist/fetcher/fetcher'
17
+ require_relative 'webpage-archivist/fetcher/requests_plumber'
18
+ require_relative 'webpage-archivist/fetcher/element_request'
19
+ require_relative 'webpage-archivist/fetcher/stylesheet_request'
20
+ require_relative 'webpage-archivist/fetcher/webpage_request'
@@ -0,0 +1,31 @@
1
+ require 'addressable/uri'
2
+
3
module WebpageArchivist

  # Extracts archived webpage content from the git repository.
  module Extracter

    # Write the full content of a webpage instance into a zip file.
    # id::   the instance id
    # file:: path of the zip file to write; a '.zip' extension is appended
    #        when missing
    # Raises a RuntimeError when the instance or the corresponding git
    # commit cannot be found.
    def self.instance_content id, file
      # Use a local instead of a module-level @instance: ivar state on a
      # singleton method would leak between calls and is not thread-safe.
      instance = Instance.filter(:id => id).first
      unless instance
        raise "Instance [#{id}] not found"
      end
      repository = instance.webpage.repository
      # Commits carry the instance's commit timestamp as their message
      commit = repository.log.find { |l| l.message == instance.commit_timestamp }

      # Build the target name without mutating the caller's string
      # (the original appended with << in place)
      file = "#{file}.zip" unless file.end_with?('.zip')

      if commit
        repository.archive_zip commit.id, file
      else
        raise "Version [#{instance.commit_timestamp}] not found"
      end
    end

  end
end
@@ -0,0 +1,126 @@
1
module WebpageArchivist::Fetcher

  # Requests a single webpage element and persists its content,
  # honouring If-Modified-Since so unchanged content is not re-saved.
  class ElementRequest

    attr_reader :element, :uri

    # :fetching while in flight, :over once the request has finished
    attr_accessor :status

    # request_element_type:: model class of the element being requested
    # uri::                  uri to fetch
    # plumber::              RequestsPlumber coordinating the page's requests
    def initialize request_element_type, uri, plumber
      ::WebpageArchivist.debug "Creating request [#{uri}]" if ::WebpageArchivist.log

      @requesters = []
      @request_element_type = request_element_type
      @uri = uri
      # Existing element for this uri, if it has been fetched before
      @element = request_element_type.filter(:uri => uri).first
      @status = :fetching
      @plumber = plumber

      @requesters_notified = false
    end

    # Start the request
    # Not in initialize so we can register the request before executing it and throttle the number of connections
    # retries:: number of retries in case of error
    def start retries = 3
      ::WebpageArchivist.debug "Starting request [#{uri}]" if ::WebpageArchivist.log
      head = {'accept-encoding' => 'gzip, compressed'}
      if element
        # Conditional fetch: ask the server only for content newer than ours
        head['If-Modified-Since'] = element.last_modified
        @plumber.register_file_name element.file_name
      end

      # Don't overflow the servers or they will kick us out
      http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30, :head => head
      http.callback do
        # Retry transient server errors before giving up
        if ([500, 503].include? http.response_header.status) && (retries > 0)
          start(retries - 1)
        else
          will_process_response http
        end
      end
      http.errback do
        ::WebpageArchivist.debug "[#{@uri}] errback" if ::WebpageArchivist.log
        if retries > 0
          start(retries - 1)
        else
          will_process_response http
        end
      end
    end

    # Call process_response and ensure management is done
    def will_process_response http
      begin
        process_response http
      rescue Exception => e
        # NOTE(review): rescuing Exception also swallows SystemExit and
        # signals; kept as-is to preserve behavior, but StandardError is
        # probably what was intended
        ::WebpageArchivist.error e if ::WebpageArchivist.log
        notify_requesters
      ensure
        # Always free the connection slot so the plumber can start another one
        @plumber.request_ended
      end
    end

    # Add a requester to be notified when the request is over
    def add_requester requester
      @requesters << requester
    end

    # Process the response
    def process_response http
      result_code = http.response_header.status
      ::WebpageArchivist.debug "[#{@uri}] returned #{result_code}" if ::WebpageArchivist.log

      if [304, 408, 0].include? result_code
        # Not changed or connection error
        if element
          element.update(:last_fetched => DateTime.now)
        end
        content_not_changed http
      elsif result_code == 200
        if element
          element.update(:last_fetched => DateTime.now,
                         :last_modified => http.response_header.last_modified || DateTime.now.rfc2822)
        else
          # First fetch: create the element with a collision-free file hash
          extension = @request_element_type.extention(@uri, http.response_header[EventMachine::HttpClient::CONTENT_TYPE])
          @element = @request_element_type.create(:webpage => @plumber.webpage,
                                                  :uri => @uri,
                                                  :file_hash => @plumber.get_hash(@request_element_type, @uri, extension),
                                                  :extension => extension,
                                                  :last_fetched => DateTime.now,
                                                  :last_modified => @plumber.last_modified(http))
        end
        save_content_end_request http
      else
        # Other status codes: give up on this element
        self.status = :over
        notify_requesters
      end
    end

    # Content has not changed since last fetch
    def content_not_changed http
      self.status = :over
      notify_requesters
    end

    # Content has changed: save the content and end the request
    def save_content_end_request http
      ::WebpageArchivist.debug "[#{@uri}] writing content to #{element.file_name}" if ::WebpageArchivist.log

      element.save_content http.response
      self.status = :over
      notify_requesters
    end

    # Notify all registered requesters that the request is over, at most once
    def notify_requesters
      unless @requesters_notified
        ::WebpageArchivist.debug "[#{@uri}] notify #{@requesters.length}" if ::WebpageArchivist.log
        @requesters.each { |r| r.request_over(@uri) }
        # Fix: this used to assign @requester_notified (typo, singular), so
        # the guard above never tripped and requesters could be notified
        # several times
        @requesters_notified = true
      end
    end

  end

end
@@ -0,0 +1,83 @@
1
+ require 'eventmachine'
2
+ require_relative 'thread-pool'
3
+
4
# Module in charge of fetching pages
module WebpageArchivist::Fetcher

  # Serializes fetch runs: only one EventMachine reactor at a time
  SEMAPHORE = Mutex.new

  # Fetch several webpages, return a hash indexed by the ids holding the corresponding Instances or http result codes
  # (may be existing instances if the pages haven't changed)
  def self.fetch_webpages ids
    if ids.empty?
      # Fix: the empty case used to return [] while the non-empty case
      # returns a Hash; keep the documented contract uniform for callers
      {}
    else
      SEMAPHORE.synchronize do
        # Local variable instead of a module-level ivar: no state leaks
        # between runs
        fetcher_watcher = FetcherWatcher.new
        EventMachine.run do
          WebpageArchivist::Webpage.filter(:id => ids).each do |webpage|
            fetcher_watcher.add_request WebpageRequest.new(webpage, fetcher_watcher)
          end
          fetcher_watcher.wait
        end

        result = {}
        fetcher_watcher.requests.each do |webpage_request|
          result[webpage_request.webpage.id] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
        end
        result
      end
    end
  end

  # Wait for callbacks for webpages' requests, stop event machine when all requests are over and manage the snapshot generation
  class FetcherWatcher

    attr_reader :requests

    def initialize
      @waiting_requests = 0
      @status = :starting
      @requests = []
      # Single worker: snapshots are generated one at a time
      @thread_pool = Pool.new 1
    end

    # Start to wait; ends immediately when no request is pending
    def wait
      @status = :waiting
      if @waiting_requests == 0
        end_watcher
      end
    end

    # Add a request to wait for
    def add_request request
      @waiting_requests += 1
      @requests << request
    end

    # A request is over
    # request:: the request
    # ok:: indicates if the request went ok, in this case ask for a snapshot
    def end_request request, ok
      @waiting_requests -= 1
      if ok && request.instance
        @thread_pool.schedule do
          ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
        end
      end
      # Only stop once wait has been called, so requests added while the
      # run is being set up are all accounted for
      if (@status == :waiting) && (@waiting_requests <= 0)
        end_watcher
      end
    end

    # End the watch: stop the reactor and drain the snapshot pool
    def end_watcher
      EM.stop
      @thread_pool.shutdown
    end

  end

end
@@ -0,0 +1,140 @@
1
+ require 'digest/sha1'
2
+ require 'andand'
3
+
4
module WebpageArchivist::Fetcher
  # Contains the plumbing for the fetching code:
  # - deduplicates element requests within a page fetch
  # - throttles the number of requests running in parallel (class-wide)
  # - allocates collision-free file hashes for fetched content
  class RequestsPlumber

    # Upper bound on requests running in parallel across all plumbers.
    # nil.andand.to_i is nil, so the || 20 default applies when unset.
    MAX_RUNNING_REQUESTS = (ENV['ARCHIVIST_MAX_RUNNING_REQUESTS'].andand.to_i || 20)

    # File names belonging to the webpage being fetched
    attr_reader :requests_files

    # Class-wide scheduling state shared by every plumber instance.
    # NOTE(review): class variables are shared across the whole inheritance
    # tree; fine as long as RequestsPlumber is never subclassed.
    @@next_tick = false
    @@waiting_requests = []
    @@running_requests = 0

    # webpage_request:: the WebpageRequest this plumber coordinates; it is
    #                   queued for execution right away
    def initialize webpage_request
      @requests = {}
      # NOTE(review): Set is used here but 'set' is not required in this
      # file — presumably required elsewhere in the gem; confirm
      @requests_hashes = Set.new
      @requests_files = Set.new
      @webpage_request = webpage_request
      @@waiting_requests << webpage_request
      RequestsPlumber.new_request
    end

    # The page being fetched
    def webpage
      @webpage_request.webpage
    end

    # Access an element request by its uri
    def [] key
      @requests[key]
    end

    # Request an element to be fetched
    # When the fetch is called, request_over will be called on the requester with the uri
    # requester:: the element doing the request
    # request_element_type:: the type of the requested element
    # uri:: the requested uri
    def request_element requester, request_element_type, uri
      ::WebpageArchivist.debug "Requesting [#{uri}] for [#{requester.uri}]" if ::WebpageArchivist.log

      if request = @requests[uri]
        # Already requested by another element of this page
        if request.status == :over
          ::WebpageArchivist.debug "Request already done" if ::WebpageArchivist.log
          requester.request_over uri
        else
          ::WebpageArchivist.debug "Adding to requesters" if ::WebpageArchivist.log
          request.add_requester requester
        end
      else
        ::WebpageArchivist.debug "Creating new request" if ::WebpageArchivist.log
        # Stylesheets need dedicated handling (nested resources)
        if request_element_type == WebpageArchivist::Stylesheet
          request = StyleSheetRequest.new(uri, self)
        else
          request = ElementRequest.new(request_element_type, uri, self)
        end
        @requests[uri] = request
        request.add_requester requester

        @@waiting_requests << request

        # try registering for the next tick
        RequestsPlumber.new_request
      end
    end

    # Notify that a request has ended so another one can be started
    def request_ended
      @@running_requests -= 1
      unless @@waiting_requests.empty?
        RequestsPlumber.new_request
      end
    end

    # Get the hash corresponding to an uri and make sure there is no collision
    # (checked against both this fetch's pending hashes and the database)
    def get_hash type, uri, extension
      file_hash = Digest::SHA1.hexdigest(uri)
      if @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
        # Collision: salt the uri with an increasing counter until unique
        i = 0
        begin
          file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
          i += 1
        end while @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
      end

      @requests_files << "#{file_hash}#{extension}"
      file_hash
    end

    # Register a filename so it is considered part of the webpage
    def register_file_name file_name
      @requests_files << file_name
    end

    # Get the charset of a response, may be nil
    def response_charset http
      type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
      if type
        # e.g. "text/html; charset=utf-8" => "UTF-8"
        match = /.+;\s*charset=(.+)/i.match(type)
        if match
          match[1].upcase
        end
      end
    end

    # Last-Modified header of the response, falling back to now
    def last_modified http
      http.response_header.last_modified || DateTime.now.rfc2822
    end

    private

    # Called when a new request has been added
    # Register a callback for the next tick, unless one is already pending
    # or the parallel-request budget is exhausted
    def self.new_request
      if (!@@next_tick) && (@@running_requests < MAX_RUNNING_REQUESTS)
        EventMachine.next_tick do
          RequestsPlumber.next_tick
        end
        @@next_tick = true
      end
    end

    # Start the first waiting request
    def self.next_tick
      unless @@waiting_requests.empty?
        @@waiting_requests.shift.start
        @@running_requests += 1
      end

      @@next_tick = false

      # More work pending: schedule another tick (budget permitting)
      unless @@waiting_requests.empty?
        RequestsPlumber.new_request
      end
    end

  end

end