webpage-archivist 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/README.rdoc +50 -0
- data/Rakefile +10 -0
- data/lib/webpage-archivist.rb +20 -0
- data/lib/webpage-archivist/extracter.rb +31 -0
- data/lib/webpage-archivist/fetcher/element_request.rb +126 -0
- data/lib/webpage-archivist/fetcher/fetcher.rb +83 -0
- data/lib/webpage-archivist/fetcher/requests_plumber.rb +140 -0
- data/lib/webpage-archivist/fetcher/stylesheet_request.rb +112 -0
- data/lib/webpage-archivist/fetcher/thread-pool.rb +101 -0
- data/lib/webpage-archivist/fetcher/webpage_request.rb +197 -0
- data/lib/webpage-archivist/html_document.rb +66 -0
- data/lib/webpage-archivist/migrations.rb +93 -0
- data/lib/webpage-archivist/models.rb +190 -0
- data/lib/webpage-archivist/patches.rb +63 -0
- data/lib/webpage-archivist/snapshoter.rb +77 -0
- data/lib/webpage-archivist/stylesheet_document.rb +129 -0
- data/lib/webpage-archivist/version.rb +3 -0
- data/lib/webpage-archivist/webpage-archivist.rb +79 -0
- data/test/crud_test.rb +28 -0
- data/test/files/stylesheet.css +14 -0
- data/test/helper.rb +15 -0
- data/test/stylesheet_test.rb +48 -0
- data/webpage-archivist.gemspec +38 -0
- metadata +284 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'andand'

module WebpageArchivist::Fetcher

  # A request specific for stylesheet as a stylesheet can reference images or other stylesheets.
  # Lifecycle: fetch the css -> issue sub-requests for each @import / image it references ->
  # once all sub-requests are over, rewrite the references to local file names and save.
  # NOTE(review): relies on Set, Addressable and the ElementRequest base class being
  # loaded elsewhere (no require here) — confirm against lib/webpage-archivist.rb.
  class StyleSheetRequest < ElementRequest

    # uri:: the stylesheet's uri
    # plumber:: the RequestsPlumber coordinating all requests for the page
    def initialize uri, plumber
      super WebpageArchivist::Stylesheet, uri, plumber
      # Parsed once so relative references can be absolutized against it below.
      @parsed_uri = Addressable::URI.parse(uri)
      # Number of sub-requests (imports/images) still in flight.
      @waiting_requests = 0
    end

    # Called when the server says the stylesheet itself has not changed.
    # We still walk its previously stored content: referenced elements may have changed
    # even though the css text did not. Without stored content there is nothing to do.
    def content_not_changed http
      @modified = false
      if element.andand.last_content
        create_sub_requests http
      else
        self.status = :over
        notify_requesters
      end
    end

    # Called with the fresh response when the stylesheet content has changed.
    def save_content_end_request http
      @modified = true
      create_sub_requests http
    end

    # Don't directly save the content but first retrieve any possible other stylesheet and image
    def create_sub_requests http
      if @modified
        # Fresh content: parse the http response with the charset the server announced.
        charset = @plumber.response_charset(http)
        @stylesheet = WebpageArchivist::StylesheetDocument.new(http.response, @uri, charset)
      else
        # Unchanged: re-parse the content and charset stored on the model.
        @stylesheet = WebpageArchivist::StylesheetDocument.new(element.last_content, @uri, element.last_charset)
      end

      # Request each distinct imported stylesheet. The block's return value (the
      # absolutized uri) is presumably used by StylesheetDocument to rewrite the
      # @import reference in place — confirm against stylesheet_document.rb.
      already = Set.new
      @stylesheet.each_import do |i|
        unless already.include? i
          already.add i
          i = @parsed_uri.absolutize(i)
          @plumber.request_element self, WebpageArchivist::Stylesheet, i
          @waiting_requests += 1
          i
        end
      end

      # Same for each distinct referenced image.
      already = Set.new
      @stylesheet.each_image do |i|
        unless already.include? i
          already.add i
          i = @parsed_uri.absolutize(i)
          @plumber.request_element self, WebpageArchivist::Image, i
          @waiting_requests += 1
          i
        end
      end

      # Persist the (absolutized) css and its charset so a future 304 can reuse them.
      if @modified
        element.update(:last_content => @stylesheet.to_css, :last_charset => @stylesheet.charset)
      end

      # Status must be set before the zero-check: request_over tests it too.
      self.status = :fetching_requests

      # No external resource -> end here
      if @waiting_requests == 0
        after_requests
      end

    end

    # Called by a request when it is over
    # uri:: the request uri
    def request_over uri
      @waiting_requests -= 1
      ::WebpageArchivist.debug "Request over for [#{@uri}] on [#{uri}], missing #{@waiting_requests}" if ::WebpageArchivist.log

      # The status guard avoids firing after_requests while create_sub_requests
      # is still issuing sub-requests (counter may transiently hit zero).
      if (@waiting_requests == 0) && (status == :fetching_requests)
        after_requests
      end
    end

    # Process the response once all the elements have been fetched
    def after_requests
      ::WebpageArchivist.debug "After requests [#{@uri}]" if ::WebpageArchivist.log
      if @modified
        # Rewrite each reference to the fetched element's local file name;
        # nil (element missing/failed) presumably leaves the reference as-is —
        # confirm against StylesheetDocument#each_import.
        @stylesheet.each_import do |i|
          if e = @plumber[i].andand.element
            e.file_name
          else
            nil
          end
        end

        @stylesheet.each_image do |i|
          if e = @plumber[i].andand.element
            e.file_name
          else
            nil
          end
        end

        element.save_content @stylesheet.to_css
      end
      self.status = :over
      notify_requesters
    end

  end

end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# Ruby Thread Pool
# ================
# A thread pool is useful when you wish to do some work in threads but do not
# know how much work there will be in advance: spawning one thread per task is
# potentially expensive, so we start a fixed set of workers and hand work off
# to them through a Queue as it becomes available.

require 'thread'

# `Pool` supports three operations:
#
# - `.new(size)` creates a thread pool of a given size
# - `#schedule(*args, &job)` schedules a new job to be executed
# - `#shutdown` shuts down all threads (after letting them finish working, of course)
class Pool

  # Create `size` worker threads. Each worker stores its index in the
  # thread-local `:id`, in case a job needs to know which thread it runs in.
  #
  # Which size is best is workload-dependent: too low and you lose
  # concurrency, too high and Ruby spends its time switching between threads.
  def initialize(size)
    # `@size` is needed later to shut the pool down; `@jobs` is the heart of
    # the pool, the queue workers block on.
    @size = size
    @jobs = Queue.new

    @pool = Array.new(@size) do |i|
      Thread.new do
        Thread.current[:id] = i

        # `catch(:exit)` provides graceful shutdown: a worker terminates as
        # soon as it runs a job doing `throw :exit` (see #shutdown).
        catch(:exit) do
          loop do
            # Block until a piece of work is available, then run it.
            job, args = @jobs.pop
            begin
              job.call(*args)
            rescue StandardError => e
              # A failing job must not kill the worker thread — otherwise the
              # pool silently loses capacity and #shutdown can stall waiting
              # for jobs no one will ever run. `throw :exit` is not an
              # exception, so shutdown is unaffected by this rescue.
              warn "Pool worker #{Thread.current[:id]}: #{e.class}: #{e.message}"
            end
          end
        end
      end
    end
  end

  # Schedule a piece of work. The job is not run immediately; it is put on the
  # work queue and executed once a worker thread is free. `args` are passed to
  # the block when it runs.
  def schedule(*args, &block)
    @jobs << [block, args]
  end

  # Graceful shutdown: enqueue one `throw :exit` job per worker so each thread
  # exits cleanly after finishing the work already queued ahead of it, then
  # wait for them all. Does not return until every worker thread has exited.
  def shutdown
    @size.times do
      schedule { throw :exit }
    end

    @pool.map(&:join)
  end
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'addressable/uri'
require 'em-http'
require 'andand'

module WebpageArchivist::Fetcher

  # Requesting a webpage.
  # Fetches the page (honouring If-Modified-Since), issues sub-requests for its
  # stylesheets / scripts / images through the RequestsPlumber, rewrites the
  # references to local file names once everything is fetched, then commits.
  class WebpageRequest

    attr_reader :webpage, :result_code, :instance, :uri, :status

    # Create a request
    # webpage:: the Webpage we want to fetch
    # fetcher_watcher:: to be notified when the request is over
    def initialize webpage, fetcher_watcher
      @webpage = webpage
      @fetcher_watcher = fetcher_watcher

      @uri = Addressable::URI.parse(webpage.uri)

      # Number of element sub-requests still in flight.
      @waiting_requests = 0

      @status = :fetching

      @plumber = RequestsPlumber.new self
    end

    # Start the request
    # Not in initialize so we can throttle the number of connection
    # retries:: number of retries in case of error
    def start retries = 3
      # NOTE(review): 'compressed' is a nonstandard accept-encoding token
      # (the registered one is 'compress') — kept as-is, confirm intent.
      @http = EventMachine::HttpRequest.new(@uri).get :redirects => 5, :timeout => 30, :head => {'If-Modified-Since' => webpage.last_modified, 'accept-encoding' => 'gzip, compressed'}
      @http.callback do
        # Retry transient server errors, otherwise process whatever we got.
        if ([500, 503].include? @http.response_header.status) && (retries > 0)
          start(retries - 1)
        else
          process_response_logging_errors
        end
      end
      @http.errback do
        if retries > 0
          start(retries - 1)
        else
          process_response_logging_errors
        end
      end
    end

    # Run process_response, logging (rather than propagating) failures so one
    # bad page cannot kill the EventMachine reactor. Rescues StandardError —
    # not Exception — so signals and SystemExit still propagate.
    def process_response_logging_errors
      process_response
    rescue => e
      ::WebpageArchivist.error e if ::WebpageArchivist.log
    end

    # Process the response
    def process_response
      @result_code = @http.response_header.status
      ::WebpageArchivist.debug "[#{@uri}] returned #{@result_code}" if ::WebpageArchivist.log
      # 304 = not modified; 408 / 0 (em-http's "no response") are treated the
      # same way: reuse the stored content rather than failing the snapshot.
      if [304, 408, 0].include? @result_code
        # Not changed
        ::WebpageArchivist.debug "[#{@uri}] not modified" if ::WebpageArchivist.log
        @modified = false
        process_content
      elsif result_code == 200
        ::WebpageArchivist.debug "[#{@uri}] modified" if ::WebpageArchivist.log
        @modified = true
        process_content
      else
        ::WebpageArchivist.debug "Error #{@uri} #{@result_code}" if ::WebpageArchivist.log
        @fetcher_watcher.end_request self, false
      end
    end

    # Process the content: parse the html (fresh or stored), request every
    # distinct stylesheet / script / image, absolutize links, and persist the
    # page when it changed.
    def process_content
      if @modified
        charset = @plumber.response_charset @http
        @content = WebpageArchivist::HtmlDocument.new(@http.response, @uri, charset)
      else
        @content = WebpageArchivist::HtmlDocument.new(webpage.last_content, @uri, webpage.last_encoding)
      end

      # Elements from the stylesheets
      already = Set.new
      @content.each_stylesheet do |stylesheet|
        uri = make_absolute_if_modified(stylesheet, 'href')
        unless already.include? uri
          already.add uri
          @plumber.request_element self, WebpageArchivist::Stylesheet, uri
          @waiting_requests += 1
        end
      end

      # Elements from the scripts
      already = Set.new
      @content.each_script do |script|
        uri = make_absolute_if_modified(script, 'src')
        unless already.include? uri
          already.add uri
          @plumber.request_element self, WebpageArchivist::Script, uri
          @waiting_requests += 1
        end
      end

      # Elements from the images
      already = Set.new
      @content.each_image do |img|
        uri = make_absolute_if_modified(img, 'src')
        unless already.include? uri
          already.add uri
          @plumber.request_element self, WebpageArchivist::Image, uri
          @waiting_requests += 1
        end
      end

      if @modified
        # Make links absolutes
        @content.each_link do |link|
          make_absolute_if_modified(link, 'href')
        end

        @webpage.update(
            :last_modified => @plumber.last_modified(@http),
            :last_content => @content.to_html,
            :last_charset => @content.charset)
      end

      # Status must be set before the zero-check: request_over tests it too.
      @status = :fetching_requests

      # No external resource -> end here
      if @waiting_requests == 0
        after_requests
      end

    end

    # Make an element's uri absolute
    # element:: the element
    # link_property:: the property holding the uri
    # Returns the (possibly rewritten) uri; when not modified the stored
    # content already holds absolute uris, so it is returned untouched.
    def make_absolute_if_modified element, link_property
      if @modified
        element[link_property] = uri.absolutize(element[link_property])
      else
        element[link_property]
      end
    end

    # Called by a request when it is over
    # uri:: the request uri
    def request_over uri
      @waiting_requests -= 1

      ::WebpageArchivist.debug "Request over for [#{@uri}] on [#{uri}], missing #{@waiting_requests}" if ::WebpageArchivist.log
      if (@status == :fetching_requests) && (@waiting_requests <= 0)
        after_requests
      end
    end

    # Process the response once all the elements have been fetched
    def after_requests
      ::WebpageArchivist.debug "After requests #{@uri}" if ::WebpageArchivist.log
      if @modified

        # replace elements with the local uris of the elements
        @content.each_stylesheet do |stylesheet|
          if e = @plumber[stylesheet['href']].andand.element
            stylesheet['href'] = e.file_name
          end
        end

        @content.each_script do |script|
          if e = @plumber[script['src']].andand.element
            script['src'] = e.file_name
          end
        end

        @content.each_image do |img|
          if e = @plumber[img['src']].andand.element
            img['src'] = e.file_name
          end
        end
        webpage.save_content @content.to_html
      end

      # Time (not the legacy DateTime) — produces the identical string here.
      commit_timestamp = Time.now.strftime('%Y/%m/%d %H:%M:%S')
      # store the content, clean the repo and commit
      webpage.update_repo_commit_changes @plumber.requests_files, commit_timestamp
      @instance = WebpageArchivist::Instance.create(:webpage => webpage, :commit_timestamp => commit_timestamp)

      @fetcher_watcher.end_request self, true
    end

  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'nokogiri'
require 'iconv'

module WebpageArchivist

  # Thin wrapper around a Nokogiri HTML document, exposing only the
  # traversals the archivist needs: stylesheets, scripts, images and links.
  class HtmlDocument

    attr_reader :content, :charset

    # Matches the charset announced in a content-type meta tag.
    ENCODING_REGEXP = /<meta http-equiv="content-type" content="text\/html; charset=([^"]+)"/i

    # Used to coerce raw bytes into something the regexp can safely scan.
    CONVERTER = Iconv.new('UTF-8//IGNORE//TRANSLIT', 'ASCII//IGNORE//TRANSLIT')

    # Build the document.
    # content:: the raw html
    # uri:: the document's url, handed to Nokogiri
    # charset:: the encoding; when nil it is sniffed from the meta tag
    def initialize content, uri = nil, charset = nil
      @charset = charset || self.class.sniff_charset(content)
      @content = Nokogiri::HTML(content, uri, @charset)
    end

    # Extract the charset from the content-type meta tag, upcased;
    # nil when no such tag is present.
    def self.sniff_charset html
      ENCODING_REGEXP.match(CONVERTER.iconv(html)).andand[1].andand.upcase
    end

    # Serialize the document back to html.
    def to_html
      content.to_html
    end

    # Yield each stylesheet <link> node.
    def each_stylesheet &block
      content.search('link[@type="text/css"]').each(&block)
    end

    # Yield each <script> node that has a src.
    def each_script &block
      content.search('script[@src]').each(&block)
    end

    # Yield each <img> node that has a src.
    def each_image &block
      content.search('img[@src]').each(&block)
    end

    # Yield each <a> node that has an href.
    def each_link &block
      content.search('a[@href]').each(&block)
    end

  end

end
|