webpage-archivist 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,7 @@
+ .DS_Store
+ *.gem
+ *.sqlite3
+ .bundle
+ Gemfile.lock
+ pkg/*
+ webpages/*
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source "http://www.rubygems.org"
+
+ gemspec
data/README.rdoc ADDED
@@ -0,0 +1,50 @@
+ A utility to archive webpages through time.
+
+ Takes snapshots and makes incremental backups of webpages' assets so you can follow the pages' evolution through time.
+
+ * Assets are stored in a git repository to simplify incremental storage and easy retrieval
+ * Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
+ * The list of webpages and archive instances is stored in an SQL database
+ * Some caching data is stored in the same database
+
+ = Required tools
+
+ * An SQL database supported by Sequel[http://sequel.rubyforge.org/]
+ * Git[http://git-scm.com/]
+ * ImageMagick[http://www.imagemagick.org/script/index.php]
+ * wkhtmltoimage[http://code.google.com/p/wkhtmltopdf/] (works on Mac OS, but not well; prefer Linux for real usage)
+
+ = Installation
+
+ * Install the required tools
+ * Install the gem
+ * All configuration items have default values; have a look below if you want to customize them (the default database configuration requires the sqlite3 gem)
+ * Use it! All the required files and the database structure will be created on first call
+
+ = API
+
+ The public API is provided by WebpageArchivist::WebpageArchivist, for example:
+
+   require 'webpage-archivist'
+   archivist = WebpageArchivist::WebpageArchivist.new
+   webpage = archivist.add_webpage('http://www.nytimes.com/', 'The New York Times')
+   archivist.fetch_webpages [webpage.id]
+
+ Models are available in the lib/webpage-archivist/models.rb file; have a look at the Sequel[http://sequel.rubyforge.org/] API if you want to query them.
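+
+ For example, a minimal query sketch (it assumes the +Webpage+ model from models.rb together with a +name+ column and an +instances+ association):
+
+   require 'webpage-archivist'
+
+   # List each archived webpage with its number of stored instances
+   WebpageArchivist::Webpage.each do |webpage|
+     puts "#{webpage.name}: #{webpage.instances.count} instances"
+   end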
+
+ = Configuration
+
+ Basic configuration is done through environment variables:
+
+ * +DATABASE_URL+ : database URL, defaults to <tt>sqlite://#{Dir.pwd}/webpage-archivist.sqlite3</tt>; the syntax is described here[http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html]; remember to add the required database gem
+ * +ARCHIVIST_ASSETS_PATH+ : path to store the assets, defaults to +./archivist_assets+
+ * +ARCHIVIST_SNAPSHOTS_PATH+ : path to store the snapshots and thumbnails, defaults to +./archivist_snapshots+
+ * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
+ * +IMAGE_MAGICK_PATH+ : path to the ImageMagick executables if they are not on the PATH
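+
+ For example, a minimal sketch switching to PostgreSQL and custom paths (the values are placeholders; set the variables before requiring the gem so they are picked up at load time):
+
+   ENV['DATABASE_URL'] = 'postgres://user:password@localhost/webpage_archivist'
+   ENV['ARCHIVIST_ASSETS_PATH'] = '/var/data/archivist/assets'
+   ENV['ARCHIVIST_SNAPSHOTS_PATH'] = '/var/data/archivist/snapshots'
+
+   require 'webpage-archivist'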
+
+ Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
+
+ To enable debugging, use
+
+   WebpageArchivist.log = true
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require 'rake'
+ require 'rake/testtask'
+
+ desc "Run all tests"
+ Rake::TestTask.new do |t|
+   t.test_files = FileList['test/*_test.rb']
+   t.verbose = true
+ end
+
+ task :default => :test
data/lib/webpage-archivist.rb ADDED
@@ -0,0 +1,20 @@
+ require_relative 'webpage-archivist/migrations'
+
+ require_relative 'webpage-archivist/patches'
+
+ require_relative 'webpage-archivist/html_document'
+ require_relative 'webpage-archivist/stylesheet_document'
+
+ require_relative 'webpage-archivist/webpage-archivist'
+
+ require_relative 'webpage-archivist/models'
+
+ require_relative 'webpage-archivist/extracter'
+
+ require_relative 'webpage-archivist/snapshoter'
+
+ require_relative 'webpage-archivist/fetcher/fetcher'
+ require_relative 'webpage-archivist/fetcher/requests_plumber'
+ require_relative 'webpage-archivist/fetcher/element_request'
+ require_relative 'webpage-archivist/fetcher/stylesheet_request'
+ require_relative 'webpage-archivist/fetcher/webpage_request'
data/lib/webpage-archivist/extracter.rb ADDED
@@ -0,0 +1,31 @@
+ require 'addressable/uri'
+
+ module WebpageArchivist
+
+   # Extract webpages' archived content
+   module Extracter
+
+     # Write the full content of a webpage instance into a zip file
+     # id:: the instance id
+     # file:: the file to write to
+     def self.instance_content id, file
+       @instance = Instance.filter(:id => id).first
+       unless @instance
+         raise "Instance [#{id}] not found"
+       end
+       repository = @instance.webpage.repository
+       commit = repository.log.find { |l| l.message == @instance.commit_timestamp }
+
+       unless file.end_with? '.zip'
+         file << '.zip'
+       end
+
+       if commit
+         repository.archive_zip commit.id, file
+       else
+         raise "Version [#{@instance.commit_timestamp}] not found"
+       end
+     end
+
+   end
+ end
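+
+ # Usage sketch (it assumes at least one instance has already been archived;
+ # the target filename is arbitrary, and '.zip' is appended if missing):
+ #
+ #   instance = WebpageArchivist::Instance.order(:id).last
+ #   WebpageArchivist::Extracter.instance_content(instance.id, "export-#{instance.id}.zip")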
data/lib/webpage-archivist/fetcher/element_request.rb ADDED
@@ -0,0 +1,126 @@
+ module WebpageArchivist::Fetcher
+
+   # Requesting a webpage element
+   class ElementRequest
+
+     attr_reader :element, :uri
+
+     attr_accessor :status
+
+     def initialize request_element_type, uri, plumber
+       ::WebpageArchivist.debug "Creating request [#{uri}]" if ::WebpageArchivist.log
+
+       @requesters = []
+       @request_element_type = request_element_type
+       @uri = uri
+       @element = request_element_type.filter(:uri => uri).first
+       @status = :fetching
+       @plumber = plumber
+
+       @requesters_notified = false
+     end
+
+     # Start the request
+     # Not in initialize so we can register the request before executing it and throttle the number of connections
+     # retries:: number of retries in case of error
+     def start retries = 3
+       ::WebpageArchivist.debug "Starting request [#{uri}]" if ::WebpageArchivist.log
+       head = {'accept-encoding' => 'gzip, compressed'}
+       if element
+         head['If-Modified-Since'] = element.last_modified
+         @plumber.register_file_name element.file_name
+       end
+
+       # Don't overflow the servers or they will kick us out
+       http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30, :head => head
+       http.callback do
+         if ([500, 503].include? http.response_header.status) && (retries > 0)
+           start(retries - 1)
+         else
+           will_process_response http
+         end
+       end
+       http.errback do
+         ::WebpageArchivist.debug "[#{@uri}] errback" if ::WebpageArchivist.log
+         if retries > 0
+           start(retries - 1)
+         else
+           will_process_response http
+         end
+       end
+     end
+
+     # Call process_response and ensure management is done
+     def will_process_response http
+       begin
+         process_response http
+       rescue Exception => e
+         ::WebpageArchivist.error e if ::WebpageArchivist.log
+         notify_requesters
+       ensure
+         @plumber.request_ended
+       end
+     end
+
+     # Add a requester to be notified when the request is over
+     def add_requester requester
+       @requesters << requester
+     end
+
+     # Process the response
+     def process_response http
+       result_code = http.response_header.status
+       ::WebpageArchivist.debug "[#{@uri}] returned #{result_code}" if ::WebpageArchivist.log
+
+       if [304, 408, 0].include? result_code
+         # Not changed or connection error
+         if element
+           element.update(:last_fetched => DateTime.now)
+         end
+         content_not_changed http
+       elsif result_code == 200
+         if element
+           element.update(:last_fetched => DateTime.now,
+                          :last_modified => http.response_header.last_modified || DateTime.now.rfc2822)
+         else
+           extension = @request_element_type.extention(@uri, http.response_header[EventMachine::HttpClient::CONTENT_TYPE])
+           @element = @request_element_type.create(:webpage => @plumber.webpage,
+                                                   :uri => @uri,
+                                                   :file_hash => @plumber.get_hash(@request_element_type, @uri, extension),
+                                                   :extension => extension,
+                                                   :last_fetched => DateTime.now,
+                                                   :last_modified => @plumber.last_modified(http))
+         end
+         save_content_end_request http
+       else
+         self.status = :over
+         notify_requesters
+       end
+     end
+
+     # Content has not changed since last fetch
+     def content_not_changed http
+       self.status = :over
+       notify_requesters
+     end
+
+     # Content has changed: save the content and end the request
+     def save_content_end_request http
+       ::WebpageArchivist.debug "[#{@uri}] writing content to #{element.file_name}" if ::WebpageArchivist.log
+
+       element.save_content http.response
+       self.status = :over
+       notify_requesters
+     end
+
+     def notify_requesters
+       unless @requesters_notified
+         ::WebpageArchivist.debug "[#{@uri}] notify #{@requesters.length}" if ::WebpageArchivist.log
+         @requesters.each { |r| r.request_over(@uri) }
+         @requesters_notified = true
+       end
+     end
+
+   end
+
+ end
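+
+ # Lifecycle sketch (internal plumbing, normally driven by RequestsPlumber; the
+ # element type, plumber and requester below are placeholders):
+ #
+ #   request = ElementRequest.new(element_type, uri, plumber)
+ #   request.add_requester page_request  # page_request must respond to request_over(uri)
+ #   request.start                       # must run inside the EventMachine reactor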
data/lib/webpage-archivist/fetcher/fetcher.rb ADDED
@@ -0,0 +1,83 @@
+ require 'eventmachine'
+ require_relative 'thread-pool'
+
+ # Module in charge of fetching pages
+ module WebpageArchivist::Fetcher
+
+   SEMAPHORE = Mutex.new
+
+   # Fetch several webpages, return a hash indexed by the ids holding the corresponding Instances or http result codes
+   # (may be existing instances if the pages haven't changed)
+   def self.fetch_webpages ids
+     if ids.empty?
+       []
+     else
+       SEMAPHORE.synchronize do
+         @fetcher_watcher = FetcherWatcher.new
+         EventMachine.run do
+           WebpageArchivist::Webpage.filter(:id => ids).each do |webpage|
+             @fetcher_watcher.add_request WebpageRequest.new(webpage, @fetcher_watcher)
+           end
+           @fetcher_watcher.wait
+         end
+
+         result = {}
+         @fetcher_watcher.requests.each do |webpage_request|
+           result[webpage_request.webpage.id] = webpage_request.instance ? webpage_request.instance : webpage_request.result_code
+         end
+         result
+       end
+     end
+   end
+
+   # Wait for callbacks for webpages' requests, stop EventMachine when all requests are over and manage the snapshot generation
+   class FetcherWatcher
+
+     attr_reader :requests
+
+     def initialize
+       @waiting_requests = 0
+       @status = :starting
+       @requests = []
+       @thread_pool = Pool.new 1
+     end
+
+     # Start to wait
+     def wait
+       @status = :waiting
+       if @waiting_requests == 0
+         end_watcher
+       end
+     end
+
+     # Add a request to wait for
+     def add_request request
+       @waiting_requests += 1
+       @requests << request
+     end
+
+     # A request is over
+     # request:: the request
+     # ok:: indicates if the request went ok, in this case ask for a snapshot
+     def end_request request, ok
+       @waiting_requests -= 1
+       if ok && request.instance
+         @thread_pool.schedule do
+           ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
+         end
+       end
+       if (@status == :waiting) && (@waiting_requests <= 0)
+         end_watcher
+       end
+     end
+
+     # End the watch
+     def end_watcher
+       EM.stop
+       @thread_pool.shutdown
+     end
+
+   end
+
+ end
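+
+ # Result-handling sketch (the ids are placeholders; as documented above, each
+ # value is either an Instance or an http result code):
+ #
+ #   WebpageArchivist::Fetcher.fetch_webpages([1, 2]).each do |id, result|
+ #     if result.is_a?(WebpageArchivist::Instance)
+ #       puts "webpage #{id} archived as instance #{result.id}"
+ #     else
+ #       puts "webpage #{id} failed with code #{result}"
+ #     end
+ #   end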
data/lib/webpage-archivist/fetcher/requests_plumber.rb ADDED
@@ -0,0 +1,140 @@
+ require 'digest/sha1'
+ require 'set'
+ require 'andand'
+
+ module WebpageArchivist::Fetcher
+   # Contains the plumbing for the fetching code
+   class RequestsPlumber
+
+     MAX_RUNNING_REQUESTS = (ENV['ARCHIVIST_MAX_RUNNING_REQUESTS'].andand.to_i || 20)
+
+     attr_reader :requests_files
+
+     @@next_tick = false
+     @@waiting_requests = []
+     @@running_requests = 0
+
+     def initialize webpage_request
+       @requests = {}
+       @requests_hashes = Set.new
+       @requests_files = Set.new
+       @webpage_request = webpage_request
+       @@waiting_requests << webpage_request
+       RequestsPlumber.new_request
+     end
+
+     # The page being fetched
+     def webpage
+       @webpage_request.webpage
+     end
+
+     # Access an element request by its uri
+     def [] key
+       @requests[key]
+     end
+
+     # Request an element to be fetched
+     # When the fetch is over, request_over will be called on the requester with the uri
+     # requester:: the element doing the request
+     # request_element_type:: the type of the requested element
+     # uri:: the requested uri
+     def request_element requester, request_element_type, uri
+       ::WebpageArchivist.debug "Requesting [#{uri}] for [#{requester.uri}]" if ::WebpageArchivist.log
+
+       if request = @requests[uri]
+         if request.status == :over
+           ::WebpageArchivist.debug "Request already done" if ::WebpageArchivist.log
+           requester.request_over uri
+         else
+           ::WebpageArchivist.debug "Adding to requesters" if ::WebpageArchivist.log
+           request.add_requester requester
+         end
+       else
+         ::WebpageArchivist.debug "Creating new request" if ::WebpageArchivist.log
+         if request_element_type == WebpageArchivist::Stylesheet
+           request = StyleSheetRequest.new(uri, self)
+         else
+           request = ElementRequest.new(request_element_type, uri, self)
+         end
+         @requests[uri] = request
+         request.add_requester requester
+
+         @@waiting_requests << request
+
+         # try registering for the next tick
+         RequestsPlumber.new_request
+       end
+     end
+
+     # Notify that a request has ended so another one can start
+     def request_ended
+       @@running_requests -= 1
+       unless @@waiting_requests.empty?
+         RequestsPlumber.new_request
+       end
+     end
+
+     # Get the hash corresponding to an uri and make sure there is no collision
+     def get_hash type, uri, extension
+       file_hash = Digest::SHA1.hexdigest(uri)
+       if @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
+         i = 0
+         begin
+           file_hash = Digest::SHA1.hexdigest("#{uri}#{i}")
+           i += 1
+         end while @requests_hashes.include?(file_hash) || (type.filter(:file_hash => file_hash).count > 0)
+       end
+
+       # remember the hash so later requests on this page can detect collisions
+       @requests_hashes << file_hash
+       @requests_files << "#{file_hash}#{extension}"
+       file_hash
+     end
+
+     # Register a filename so it is considered part of the webpage
+     def register_file_name file_name
+       @requests_files << file_name
+     end
+
+     # Get the charset of a response, may be nil
+     def response_charset http
+       type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
+       if type
+         match = /.+;\s*charset=(.+)/i.match(type)
+         if match
+           match[1].upcase
+         end
+       end
+     end
+
+     def last_modified http
+       http.response_header.last_modified || DateTime.now.rfc2822
+     end
+
+     private
+
+     # Called when a new request has been added
+     # Register a callback for the next tick
+     def self.new_request
+       if (!@@next_tick) && (@@running_requests < MAX_RUNNING_REQUESTS)
+         EventMachine.next_tick do
+           RequestsPlumber.next_tick
+         end
+         @@next_tick = true
+       end
+     end
+
+     # Start the first waiting request
+     def self.next_tick
+       unless @@waiting_requests.empty?
+         @@waiting_requests.shift.start
+         @@running_requests += 1
+       end
+
+       @@next_tick = false
+
+       unless @@waiting_requests.empty?
+         RequestsPlumber.new_request
+       end
+     end
+
+   end
+
+ end
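+
+ # Standalone sketch of the filename scheme used by get_hash: each asset uri
+ # maps to a SHA-1 hash, with a counter mixed in until the hash is free (the
+ # uri is a placeholder; the real code also checks the database):
+ #
+ #   require 'digest/sha1'
+ #   require 'set'
+ #
+ #   taken = Set.new
+ #   uri = 'http://example.com/style.css'
+ #   hash = Digest::SHA1.hexdigest(uri)
+ #   i = 0
+ #   while taken.include?(hash)
+ #     hash = Digest::SHA1.hexdigest("#{uri}#{i}")
+ #     i += 1
+ #   end
+ #   taken << hash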