webpage-archivist 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
+ require 'andand'
2
+
3
module WebpageArchivist::Fetcher
  # Request for a stylesheet.
  #
  # A stylesheet can reference other stylesheets (imports) and images, so the
  # content is not saved as soon as it is received: the referenced elements are
  # requested first, and the stylesheet is finalized once all of them are over.
  class StyleSheetRequest < ElementRequest

    # uri:: the stylesheet uri
    # plumber:: the plumber used to request the referenced elements
    def initialize uri, plumber
      super WebpageArchivist::Stylesheet, uri, plumber
      @parsed_uri = Addressable::URI.parse(uri)
      # Number of sub requests that have been issued and are not over yet
      @waiting_requests = 0
    end

    # The stylesheet did not change: when a previous content is available its
    # references are still requested, otherwise the request ends immediately.
    # http:: the http request
    def content_not_changed http
      @modified = false
      return create_sub_requests(http) if element.andand.last_content

      self.status = :over
      notify_requesters
    end

    # The stylesheet changed: its references must be requested before saving.
    # http:: the http request
    def save_content_end_request http
      @modified = true
      create_sub_requests http
    end

    # Don't directly save the content but first retrieve any possible other
    # stylesheet and image.
    # http:: the http request, only read when the content was modified
    def create_sub_requests http
      if @modified
        response_charset = @plumber.response_charset(http)
        @stylesheet = WebpageArchivist::StylesheetDocument.new(http.response, @uri, response_charset)
      else
        @stylesheet = WebpageArchivist::StylesheetDocument.new(element.last_content, @uri, element.last_charset)
      end

      # The value returned to each_import / each_image is the absolute uri, or
      # nil for a reference that was already seen.
      # NOTE(review): presumably the document uses that value to rewrite the
      # reference -- confirm against StylesheetDocument.
      seen_imports = Set.new
      @stylesheet.each_import do |import|
        enqueue_sub_request seen_imports, import, WebpageArchivist::Stylesheet
      end

      seen_images = Set.new
      @stylesheet.each_image do |image|
        enqueue_sub_request seen_images, image, WebpageArchivist::Image
      end

      element.update(:last_content => @stylesheet.to_css, :last_charset => @stylesheet.charset) if @modified

      self.status = :fetching_requests

      # No external resource -> end here
      after_requests if @waiting_requests == 0
    end

    # Called by a request when it is over
    # uri:: the request uri
    def request_over uri
      @waiting_requests -= 1
      ::WebpageArchivist.debug "Request over for [#{@uri}] on [#{uri}], missing #{@waiting_requests}" if ::WebpageArchivist.log

      after_requests if (status == :fetching_requests) && (@waiting_requests == 0)
    end

    # Process the response once all the elements have been fetched
    def after_requests
      ::WebpageArchivist.debug "After requests [#{@uri}]" if ::WebpageArchivist.log
      if @modified
        # Hand the file name of every fetched element back to the document
        @stylesheet.each_import { |import| archived_file_name import }
        @stylesheet.each_image { |image| archived_file_name image }

        element.save_content @stylesheet.to_css
      end
      self.status = :over
      notify_requesters
    end

    private

    # Request one element referenced by the stylesheet, once per reference.
    # seen:: the references already handled for the current kind of element
    # reference:: the reference as found in the stylesheet
    # element_class:: the class of the element to request
    # Returns the absolute uri, or nil when the reference was already seen.
    def enqueue_sub_request seen, reference, element_class
      return nil if seen.include? reference

      seen.add reference
      absolute = @parsed_uri.absolutize(reference)
      @plumber.request_element self, element_class, absolute
      @waiting_requests += 1
      absolute
    end

    # File name of the element the plumber holds for a reference, nil if none.
    # reference:: the reference as yielded by the stylesheet document
    def archived_file_name reference
      fetched = @plumber[reference].andand.element
      fetched ? fetched.file_name : nil
    end

  end

end
@@ -0,0 +1,101 @@
1
+ # Ruby Thread Pool
2
+ # ================
3
+ # A thread pool is useful when you wish to do some work in a thread, but do
4
+ # not know how much work you will be doing in advance. Spawning one thread
5
+ # for each task is potentially expensive, as threads are not free.
6
+ #
7
+ # In this case, it might be more beneficial to start a predefined set of
8
+ # threads and then hand off work to them as it becomes available. This is
9
+ # the pure essence of what a thread pool is: an array of threads, all just
10
+ # waiting to do some work for you!
11
+ #
12
+ # Prerequisites
13
+ # -------------
14
+
15
+ # We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
16
+ # thread pool is largely dependent on it. Thanks to this, the implementation
17
+ # becomes very simple!
18
+ require 'thread'
19
+
20
+ # Public Interface
21
+ # ----------------
22
+
23
+ # `Pool` is our thread pool class. It will allow us to do three operations:
24
+ #
25
+ # - `.new(size)` creates a thread pool of a given size
26
+ # - `#schedule(*args, &job)` schedules a new job to be executed
27
+ # - `#shutdown` shuts down all threads (after letting them finish working, of course)
28
class Pool
  # A fixed-size thread pool: `size` worker threads pull jobs from a shared
  # `Queue` and run them until they are told to exit.

  # ### initialization, or `Pool.new(size)`
  # Creating a new `Pool` involves a certain amount of work. First, however,
  # we need to define its `size`. It defines how many threads we will have
  # working internally.
  #
  # Which size is best for you is hard to answer. You do not want it to be
  # too low, as then you won't be able to do as many things concurrently.
  # However, if you make it too high Ruby will spend too much time switching
  # between threads, and that will also degrade performance!
  #
  # size:: the number of worker threads to start
  def initialize(size)
    # Before we do anything else, we need to store some information about
    # our pool. `@size` is useful later, when we want to shut our pool down,
    # and `@jobs` is the heart of our pool that allows us to schedule work.
    @size = size
    @jobs = Queue.new

    # #### Creating our pool of threads
    # Once preparation is done, it's time to create our pool of threads.
    # Each thread stores its index in a thread-local variable, in case we
    # need to know which thread a job is executing in later on.
    @pool = Array.new(@size) do |i|
      Thread.new do
        Thread.current[:id] = i

        # We start off by defining a `catch` around our worker loop. This
        # way we've provided a method for graceful shutdown of our threads.
        # Shutting down is merely a `#schedule { throw :exit }` away!
        catch(:exit) do
          # The worker thread life-cycle is very simple. We continuously wait
          # for tasks to be put into our job `Queue`. If the `Queue` is empty,
          # we will wait until it's not.
          #
          # NOTE(review): an exception raised by a job is not rescued here, so
          # it terminates the worker and is re-raised by `join` in `#shutdown`.
          loop do
            # Once we have a piece of work to be done, we will pull out the
            # information we need and get to work.
            job, args = @jobs.pop
            job.call(*args)
          end
        end
      end
    end
  end

  # ### Work scheduling

  # To schedule a piece of work to be done is to say to the `Pool` that you
  # want something done.
  #
  # args:: the arguments passed to the block when it is executed
  # block:: the job to execute
  #
  # Raises ArgumentError when no block is given: enqueueing a missing job
  # would otherwise kill the worker that picks it up.
  def schedule(*args, &block)
    raise ArgumentError, 'no block given' unless block

    # Your given task will not be run immediately; rather, it will be put
    # into the work `Queue` and executed once a thread is ready to work.
    @jobs << [block, args]
  end

  # ### Graceful shutdown

  # If you ever wish to close down your application, I took the liberty of
  # making it easy for you to wait for any currently executing jobs to finish
  # before you exit.
  def shutdown
    # A graceful shutdown involves threads exiting cleanly themselves, and
    # since we've defined a `catch`-handler around the threads' worker loop
    # it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
    # for each thread in our pool, they will all exit eventually!
    @size.times do
      schedule { throw :exit }
    end

    # And now one final thing: wait for our `throw :exit` jobs to be run on
    # all our worker threads. This call will not return until all worker threads
    # have exited.
    @pool.map(&:join)
  end
end
@@ -0,0 +1,197 @@
1
+ require 'addressable/uri'
2
+ require 'em-http'
3
+ require 'andand'
4
+
5
module WebpageArchivist::Fetcher

  # Requesting a webpage.
  #
  # The page itself is fetched first, then every stylesheet, script and image
  # it references is requested through the plumber. Once all of them are over
  # the content is saved, the repository is committed and the watcher is told
  # the request has ended.
  class WebpageRequest

    attr_reader :webpage, :result_code, :instance, :uri, :status

    # Create a request
    # webpage:: the Webpage we want to fetch
    # fetcher_watcher:: to be notified when the request is over
    def initialize webpage, fetcher_watcher
      @webpage = webpage
      @fetcher_watcher = fetcher_watcher

      @uri = Addressable::URI.parse(webpage.uri)

      # Number of element requests that have been issued and are not over yet
      @waiting_requests = 0

      @status = :fetching

      @plumber = RequestsPlumber.new self
    end

    # Start the request
    # Not in initialize so we can throttle the number of connection
    # retries:: number of retries in case of error
    def start retries = 3
      @http = EventMachine::HttpRequest.new(@uri).get :redirects => 5, :timeout => 30, :head => {'If-Modified-Since' => webpage.last_modified, 'accept-encoding' => 'gzip, compressed'}
      @http.callback do
        # Server side errors are retried while some retries are left
        if ([500, 503].include? @http.response_header.status) && (retries > 0)
          start(retries - 1)
        else
          process_response_safely
        end
      end
      @http.errback do
        if retries > 0
          start(retries - 1)
        else
          process_response_safely
        end
      end
    end

    # Process the response
    def process_response
      @result_code = @http.response_header.status
      ::WebpageArchivist.debug "[#{@uri}] returned #{@result_code}" if ::WebpageArchivist.log
      if [304, 408, 0].include? @result_code
        # Not changed
        ::WebpageArchivist.debug "[#{@uri}] not modified" if ::WebpageArchivist.log
        @modified = false
        if webpage.last_content
          process_content
        else
          # Nothing was stored by a previous fetch, so there is no content to
          # process: end the request as a failure instead of raising while
          # parsing a nil content.
          ::WebpageArchivist.debug "[#{@uri}] not modified but no previous content" if ::WebpageArchivist.log
          @fetcher_watcher.end_request self, false
        end
      elsif result_code == 200
        ::WebpageArchivist.debug "[#{@uri}] modified" if ::WebpageArchivist.log
        @modified = true
        process_content
      else
        ::WebpageArchivist.debug "Error #{@uri} #{@result_code}" if ::WebpageArchivist.log
        @fetcher_watcher.end_request self, false
      end
    end

    # Process the content
    def process_content
      if @modified
        charset = @plumber.response_charset @http
        @content = WebpageArchivist::HtmlDocument.new(@http.response, @uri, charset)
      else
        # The charset is stored as :last_charset when the page is updated
        @content = WebpageArchivist::HtmlDocument.new(webpage.last_content, @uri, webpage.last_charset)
      end

      # Elements from the stylesheets
      request_elements :each_stylesheet, 'href', WebpageArchivist::Stylesheet

      # Elements from the scripts
      request_elements :each_script, 'src', WebpageArchivist::Script

      # Elements from the images
      request_elements :each_image, 'src', WebpageArchivist::Image

      if @modified
        # Make links absolutes
        @content.each_link do |link|
          make_absolute_if_modified(link, 'href')
        end

        @webpage.update(
            :last_modified => @plumber.last_modified(@http),
            :last_content => @content.to_html,
            :last_charset => @content.charset)
      end

      @status = :fetching_requests

      # No external resource -> end here
      after_requests if @waiting_requests == 0
    end

    # Make an element's uri absolute
    # element:: the element
    # link_property:: the property holding the uri
    # Returns the absolute uri when the page was modified, else the value
    # currently held by the property.
    def make_absolute_if_modified element, link_property
      if @modified
        element[link_property] = uri.absolutize(element[link_property])
      else
        element[link_property]
      end
    end

    # Called by a request when it is over
    # uri:: the request uri
    def request_over uri
      @waiting_requests -= 1

      ::WebpageArchivist.debug "Request over for [#{@uri}] on [#{uri}], missing #{@waiting_requests}" if ::WebpageArchivist.log
      after_requests if (@status == :fetching_requests) && (@waiting_requests <= 0)
    end

    # Process the response once all the elements have been fetched
    def after_requests
      ::WebpageArchivist.debug "After requests #{@uri}" if ::WebpageArchivist.log
      if @modified
        # replace elements with the local uris of the elements
        localize_elements :each_stylesheet, 'href'
        localize_elements :each_script, 'src'
        localize_elements :each_image, 'src'

        webpage.save_content @content.to_html
      end

      # Time is a core class, so no extra require is needed for the timestamp
      commit_timestamp = Time.now.strftime('%Y/%m/%d %H:%M:%S')
      # store the content, clean the repo and commit
      webpage.update_repo_commit_changes @plumber.requests_files, commit_timestamp
      @instance = WebpageArchivist::Instance.create(:webpage => webpage, :commit_timestamp => commit_timestamp)

      @fetcher_watcher.end_request self, true
    end

    private

    # Process the response, logging any error instead of letting it escape
    # from the http callbacks. Only StandardError is rescued so that signals
    # and exit requests are not swallowed.
    # NOTE(review): when an error is rescued here the watcher is not told the
    # request ended -- confirm whether end_request should be called.
    def process_response_safely
      process_response
    rescue StandardError => e
      ::WebpageArchivist.error e if ::WebpageArchivist.log
    end

    # Request every element of one kind referenced by the content, once per
    # distinct uri.
    # iterator:: name of the content method yielding the nodes
    # link_property:: the node property holding the uri
    # element_class:: the class of the elements to request
    def request_elements iterator, link_property, element_class
      already = Set.new
      @content.public_send(iterator) do |node|
        element_uri = make_absolute_if_modified(node, link_property)
        unless already.include? element_uri
          already.add element_uri
          @plumber.request_element self, element_class, element_uri
          @waiting_requests += 1
        end
      end
    end

    # Point every node of one kind to the file of the element fetched for it;
    # nodes without a fetched element are left untouched.
    # iterator:: name of the content method yielding the nodes
    # link_property:: the node property holding the uri
    def localize_elements iterator, link_property
      @content.public_send(iterator) do |node|
        fetched = @plumber[node[link_property]].andand.element
        node[link_property] = fetched.file_name if fetched
      end
    end

  end

end
@@ -0,0 +1,66 @@
1
+ require 'nokogiri'
2
+ require 'iconv'
3
+
4
module WebpageArchivist

  # API around Nokogiri: wraps a parsed html document and gives access to the
  # nodes referencing external elements.
  class HtmlDocument

    attr_reader :content, :charset

    ENCODING_REGEXP = /<meta http-equiv="content-type" content="text\/html; charset=([^"]+)"/i

    CONVERTER = Iconv.new('UTF-8//IGNORE//TRANSLIT', 'ASCII//IGNORE//TRANSLIT')

    # Create document
    # content:: the content
    # uri:: the uri handed to the parser (optional)
    # charset:: the charset of the content; when missing it is looked up in
    #           the content-type meta tag, and stays nil if none is found
    def initialize content, uri = nil, charset = nil
      @charset = charset || begin
        declared = ENCODING_REGEXP.match(CONVERTER.iconv(content))
        declared && declared[1] && declared[1].upcase
      end
      @content = Nokogiri::HTML(content, uri, @charset)
    end

    # Convert document to html
    def to_html
      content.to_html
    end

    # Call a block for each stylesheet
    # Block call parameter will be the stylesheet node
    def each_stylesheet &block
      visit_nodes 'link[@type="text/css"]', block
    end

    # Call a block for each script
    # Block call parameter will be the script node
    def each_script &block
      visit_nodes 'script[@src]', block
    end

    # Call a block for each image
    # Block call parameter will be the image node
    def each_image &block
      visit_nodes 'img[@src]', block
    end

    # Call a block for each link
    # Block call parameter will be the link node
    def each_link &block
      visit_nodes 'a[@href]', block
    end

    private

    # Call the visitor with every node of the document matching the selector
    # selector:: the search expression
    # visitor:: the block to call with each matching node
    def visit_nodes selector, visitor
      content.search(selector).each { |node| visitor.call node }
    end

  end

end