webpage-archivist 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +6 -2
- data/README.rdoc +20 -2
- data/lib/webpage-archivist/fetcher/fetcher.rb +20 -14
- data/lib/webpage-archivist/migrations.rb +1 -0
- data/lib/webpage-archivist/models.rb +12 -0
- data/lib/webpage-archivist/version.rb +1 -1
- data/webpage-archivist.gemspec +4 -4
- metadata +27 -28
- data/lib/webpage-archivist/fetcher/thread-pool.rb +0 -101
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
# 0.0.3
|
2
|
+
|
3
|
+
- clear migration list after playing them
|
4
|
+
- use EventMachine.defer instead of a custom thread pool
|
5
|
+
|
1
6
|
# 0.0.2
|
2
7
|
|
3
8
|
- replace websnap + wkhtmltoimage by PhantomJS
|
4
9
|
- replace mini_magick by custom code
|
5
|
-
- WebpageArchivist#fetch_webpages
|
6
|
-
|
10
|
+
- WebpageArchivist#fetch_webpages now takes pages instead of ids
|
data/README.rdoc
CHANGED
@@ -5,7 +5,7 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
|
|
5
5
|
* Assets are stored in a git repository to simplify incremental storage and easy retrieval
|
6
6
|
* Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
|
7
7
|
* List of webpages and archives instances are stored in an SQL database
|
8
|
-
* Some caching data are stored in the same
|
8
|
+
* Some caching data are stored in the same database
|
9
9
|
|
10
10
|
= Required tools:
|
11
11
|
|
@@ -42,10 +42,28 @@ Basic configuration is done through environment variables:
|
|
42
42
|
* +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
|
43
43
|
* +PHANTOMJS_PATH+: path to PhantomJS executable if it isn't in the path
|
44
44
|
* +GRAPHICS_MAGICK_PATH+ : path to GraphicsMagick executable if it isn't in the path
|
45
|
-
|
45
|
+
* +BACKGROUND_THREAD_POOL_SIZE+: EventMachine pool size for background tasks like taking the snapshots (defaults to 20)
|
46
46
|
Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
|
47
47
|
|
48
48
|
To enable debugging use
|
49
49
|
|
50
50
|
WebpageArchivist.log= true
|
51
51
|
|
52
|
+
|
53
|
+
= Connect to the database / run migrations
|
54
|
+
|
55
|
+
The database connection is available as <tt>WebpageArchivist::DATABASE</tt> and if you want to run your own migrations use
|
56
|
+
|
57
|
+
require 'webpage-archivist/migrations'
|
58
|
+
WebpageArchivist::Migrations.migration 'create table foo' do
|
59
|
+
WebpageArchivist::DATABASE.create_table :foos do
|
60
|
+
primary_key :id
|
61
|
+
# ...
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
WebpageArchivist::Migrations.new.run
|
66
|
+
|
67
|
+
This way your migrations will be run when the corresponding class is loaded.
|
68
|
+
|
69
|
+
= Released under the MIT license
|
@@ -1,11 +1,14 @@
|
|
1
1
|
require 'eventmachine'
|
2
|
-
require_relative 'thread-pool'
|
3
2
|
|
4
3
|
# Module in charge of fetching pages
|
5
4
|
module WebpageArchivist::Fetcher
|
6
5
|
|
7
6
|
SEMAPHORE = Mutex.new
|
8
7
|
|
8
|
+
if ENV['BACKGROUND_THREAD_POOL_SIZE']
|
9
|
+
EventMachine.threadpool_size= ENV['BACKGROUND_THREAD_POOL_SIZE'].to_i
|
10
|
+
end
|
11
|
+
|
9
12
|
# Fetch several webpages, return a hash indexed by the webpages holding the corresponding Instances or HTTP result codes
|
10
13
|
# (may be existing instances if the pages haven't changed)
|
11
14
|
def self.fetch_webpages webpages
|
@@ -39,33 +42,37 @@ module WebpageArchivist::Fetcher
|
|
39
42
|
@waiting_requests = 0
|
40
43
|
@status = :starting
|
41
44
|
@requests = []
|
42
|
-
|
45
|
+
end
|
46
|
+
|
47
|
+
# Add a request to wait for
|
48
|
+
def add_request request
|
49
|
+
@waiting_requests += 1
|
50
|
+
@requests << request
|
43
51
|
end
|
44
52
|
|
45
53
|
# Start to wait
|
46
54
|
def wait
|
47
55
|
@status = :waiting
|
48
|
-
if @waiting_requests
|
56
|
+
if @waiting_requests <= 0
|
49
57
|
end_watcher
|
50
58
|
end
|
51
59
|
end
|
52
60
|
|
53
|
-
# Add a request to wait for
|
54
|
-
def add_request request
|
55
|
-
@waiting_requests += 1
|
56
|
-
@requests << request
|
57
|
-
end
|
58
|
-
|
59
61
|
# A request is over
|
60
62
|
# request:: the request
|
61
63
|
# ok:: indicates if the request went ok, in this case ask for a snapshot
|
62
64
|
def end_request request, ok
|
63
|
-
@waiting_requests -= 1
|
64
65
|
if ok && request.instance
|
65
|
-
|
66
|
-
|
67
|
-
|
66
|
+
operation = proc { ::WebpageArchivist::Snapshoter.snapshot_instance(request.instance) }
|
67
|
+
callback = proc { end_request_inner }
|
68
|
+
EM.defer(operation, callback)
|
69
|
+
else
|
70
|
+
end_request_inner
|
68
71
|
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def end_request_inner
|
75
|
+
@waiting_requests -= 1
|
69
76
|
if (@status == :waiting) && (@waiting_requests <= 0)
|
70
77
|
end_watcher
|
71
78
|
end
|
@@ -74,7 +81,6 @@ module WebpageArchivist::Fetcher
|
|
74
81
|
# End the watch
|
75
82
|
def end_watcher
|
76
83
|
EM.stop
|
77
|
-
@thread_pool.shutdown
|
78
84
|
end
|
79
85
|
|
80
86
|
|
@@ -95,6 +95,18 @@ module WebpageArchivist
|
|
95
95
|
validates_presence [:webpage_id, :commit_timestamp]
|
96
96
|
end
|
97
97
|
|
98
|
+
def small_snapshot_path
|
99
|
+
if snapshot
|
100
|
+
"#{webpage.id}/#{self.id}-small.#{Snapshoter.format}"
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def snapshot_path
|
105
|
+
if snapshot
|
106
|
+
"#{webpage.id}/#{self.id}.#{Snapshoter.format}"
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
98
110
|
end
|
99
111
|
|
100
112
|
module ElementWithContent
|
data/webpage-archivist.gemspec
CHANGED
@@ -18,14 +18,14 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.rdoc_options = ['--main', 'README.rdoc']
|
19
19
|
|
20
20
|
s.add_runtime_dependency 'andand', '~> 1.3.1'
|
21
|
-
s.add_runtime_dependency 'sequel', '~> 3.
|
22
|
-
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.
|
23
|
-
s.add_runtime_dependency 'em-http-request', '~> 1.0.0
|
21
|
+
s.add_runtime_dependency 'sequel', '~> 3.28'
|
22
|
+
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.4'
|
23
|
+
s.add_runtime_dependency 'em-http-request', '~> 1.0.0'
|
24
24
|
s.add_runtime_dependency 'nokogiri', '~> 1.5'
|
25
25
|
s.add_runtime_dependency 'addressable', '~> 2.2.6'
|
26
26
|
s.add_runtime_dependency 'css_parser', '~> 1.2.3'
|
27
27
|
s.add_runtime_dependency 'grit', '~> 2.4.1'
|
28
|
-
s.add_runtime_dependency 'mime-types', '~> 1.
|
28
|
+
s.add_runtime_dependency 'mime-types', '~> 1.17.2'
|
29
29
|
|
30
30
|
s.add_development_dependency 'sqlite3', '~> 1.3.3'
|
31
31
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage-archivist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-11-01 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: andand
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153126980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,43 +21,43 @@ dependencies:
|
|
21
21
|
version: 1.3.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153126980
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: sequel
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153124860 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: '3.
|
32
|
+
version: '3.28'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153124860
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: eventmachine
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153124200 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: 1.0.0.beta.
|
43
|
+
version: 1.0.0.beta.4
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153124200
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: em-http-request
|
49
|
-
requirement: &
|
49
|
+
requirement: &2153123320 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.0.0
|
54
|
+
version: 1.0.0
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2153123320
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: nokogiri
|
60
|
-
requirement: &
|
60
|
+
requirement: &2153122200 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '1.5'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2153122200
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: addressable
|
71
|
-
requirement: &
|
71
|
+
requirement: &2153121340 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 2.2.6
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2153121340
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: css_parser
|
82
|
-
requirement: &
|
82
|
+
requirement: &2153120740 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 1.2.3
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2153120740
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: grit
|
93
|
-
requirement: &
|
93
|
+
requirement: &2153120160 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ~>
|
@@ -98,21 +98,21 @@ dependencies:
|
|
98
98
|
version: 2.4.1
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2153120160
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: mime-types
|
104
|
-
requirement: &
|
104
|
+
requirement: &2153119520 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ~>
|
108
108
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
109
|
+
version: 1.17.2
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2153119520
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: sqlite3
|
115
|
-
requirement: &
|
115
|
+
requirement: &2153118740 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ~>
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.3.3
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *2153118740
|
124
124
|
description: An utility to archive webpages through time
|
125
125
|
email:
|
126
126
|
executables: []
|
@@ -139,7 +139,6 @@ files:
|
|
139
139
|
- lib/webpage-archivist/fetcher/fetcher.rb
|
140
140
|
- lib/webpage-archivist/fetcher/requests_plumber.rb
|
141
141
|
- lib/webpage-archivist/fetcher/stylesheet_request.rb
|
142
|
-
- lib/webpage-archivist/fetcher/thread-pool.rb
|
143
142
|
- lib/webpage-archivist/fetcher/webpage_request.rb
|
144
143
|
- lib/webpage-archivist/html_document.rb
|
145
144
|
- lib/webpage-archivist/migrations.rb
|
@@ -177,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
177
176
|
version: '0'
|
178
177
|
requirements: []
|
179
178
|
rubyforge_project: webpage-archivist
|
180
|
-
rubygems_version: 1.8.
|
179
|
+
rubygems_version: 1.8.11
|
181
180
|
signing_key:
|
182
181
|
specification_version: 3
|
183
182
|
summary: An utility to archive webpages through time
|
@@ -1,101 +0,0 @@
|
|
1
|
-
# Ruby Thread Pool
|
2
|
-
# ================
|
3
|
-
# A thread pool is useful when you wish to do some work in a thread, but do
|
4
|
-
# not know how much work you will be doing in advance. Spawning one thread
|
5
|
-
# for each task is potentially expensive, as threads are not free.
|
6
|
-
#
|
7
|
-
# In this case, it might be more beneficial to start a predefined set of
|
8
|
-
# threads and then hand off work to them as it becomes available. This is
|
9
|
-
# the pure essence of what a thread pool is: an array of threads, all just
|
10
|
-
# waiting to do some work for you!
|
11
|
-
#
|
12
|
-
# Prerequisites
|
13
|
-
# -------------
|
14
|
-
|
15
|
-
# We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
|
16
|
-
# thread pool is largely dependent on it. Thanks to this, the implementation
|
17
|
-
# becomes very simple!
|
18
|
-
require 'thread'
|
19
|
-
|
20
|
-
# Public Interface
|
21
|
-
# ----------------
|
22
|
-
|
23
|
-
# `Pool` is our thread pool class. It will allow us to do three operations:
|
24
|
-
#
|
25
|
-
# - `.new(size)` creates a thread pool of a given size
|
26
|
-
# - `#schedule(*args, &job)` schedules a new job to be executed
|
27
|
-
# - `#shutdown` shuts down all threads (after letting them finish working, of course)
|
28
|
-
class Pool
|
29
|
-
|
30
|
-
# ### initialization, or `Pool.new(size)`
|
31
|
-
# Creating a new `Pool` involves a certain amount of work. First, however,
|
32
|
-
# we need to define its’ `size`. It defines how many threads we will have
|
33
|
-
# working internally.
|
34
|
-
#
|
35
|
-
# Which size is best for you is hard to answer. You do not want it to be
|
36
|
-
# too low, as then you won’t be able to do as many things concurrently.
|
37
|
-
# However, if you make it too high Ruby will spend too much time switching
|
38
|
-
# between threads, and that will also degrade performance!
|
39
|
-
def initialize(size)
|
40
|
-
# Before we do anything else, we need to store some information about
|
41
|
-
# our pool. `@size` is useful later, when we want to shut our pool down,
|
42
|
-
# and `@jobs` is the heart of our pool that allows us to schedule work.
|
43
|
-
@size = size
|
44
|
-
@jobs = Queue.new
|
45
|
-
|
46
|
-
# #### Creating our pool of threads
|
47
|
-
# Once preparation is done, it’s time to create our pool of threads.
|
48
|
-
# Each thread store its’ index in a thread-local variable, in case we
|
49
|
-
# need to know which thread a job is executing in later on.
|
50
|
-
@pool = Array.new(@size) do |i|
|
51
|
-
Thread.new do
|
52
|
-
Thread.current[:id] = i
|
53
|
-
|
54
|
-
# We start off by defining a `catch` around our worker loop. This
|
55
|
-
# way we’ve provided a method for graceful shutdown of our threads.
|
56
|
-
# Shutting down is merely a `#schedule { throw :exit }` away!
|
57
|
-
catch(:exit) do
|
58
|
-
# The worker thread life-cycle is very simple. We continuously wait
|
59
|
-
# for tasks to be put into our job `Queue`. If the `Queue` is empty,
|
60
|
-
# we will wait until it’s not.
|
61
|
-
loop do
|
62
|
-
# Once we have a piece of work to be done, we will pull out the
|
63
|
-
# information we need and get to work.
|
64
|
-
job, args = @jobs.pop
|
65
|
-
job.call(*args)
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
# ### Work scheduling
|
73
|
-
|
74
|
-
# To schedule a piece of work to be done is to say to the `Pool` that you
|
75
|
-
# want something done.
|
76
|
-
def schedule(*args, &block)
|
77
|
-
# Your given task will not be run immediately; rather, it will be put
|
78
|
-
# into the work `Queue` and executed once a thread is ready to work.
|
79
|
-
@jobs << [block, args]
|
80
|
-
end
|
81
|
-
|
82
|
-
# ### Graceful shutdown
|
83
|
-
|
84
|
-
# If you ever wish to close down your application, I took the liberty of
|
85
|
-
# making it easy for you to wait for any currently executing jobs to finish
|
86
|
-
# before you exit.
|
87
|
-
def shutdown
|
88
|
-
# A graceful shutdown involves threads exiting cleanly themselves, and
|
89
|
-
# since we’ve defined a `catch`-handler around the threads’ worker loop
|
90
|
-
# it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
|
91
|
-
# for each thread in our pool, they will all exit eventually!
|
92
|
-
@size.times do
|
93
|
-
schedule { throw :exit }
|
94
|
-
end
|
95
|
-
|
96
|
-
# And now one final thing: wait for our `throw :exit` jobs to be run on
|
97
|
-
# all our worker threads. This call will not return until all worker threads
|
98
|
-
# have exited.
|
99
|
-
@pool.map(&:join)
|
100
|
-
end
|
101
|
-
end
|