webpage-archivist 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +6 -2
- data/README.rdoc +20 -2
- data/lib/webpage-archivist/fetcher/fetcher.rb +20 -14
- data/lib/webpage-archivist/migrations.rb +1 -0
- data/lib/webpage-archivist/models.rb +12 -0
- data/lib/webpage-archivist/version.rb +1 -1
- data/webpage-archivist.gemspec +4 -4
- metadata +27 -28
- data/lib/webpage-archivist/fetcher/thread-pool.rb +0 -101
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
# 0.0.3
|
2
|
+
|
3
|
+
- clear migration list after playing them
|
4
|
+
- use EventMachine.defer instead of a custom thread pool
|
5
|
+
|
1
6
|
# 0.0.2
|
2
7
|
|
3
8
|
- replace websnap + wkhtmltoimage by PhantomJS
|
4
9
|
- replace mini_magick by custom code
|
5
|
-
- WebpageArchivist#fetch_webpages
|
6
|
-
|
10
|
+
- WebpageArchivist#fetch_webpages now takes pages instead of ids
|
data/README.rdoc
CHANGED
@@ -5,7 +5,7 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
|
|
5
5
|
* Assets are stored in a git repository to simplify incremental storage and allow easy retrieval
|
6
6
|
* Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
|
7
7
|
* List of webpages and archives instances are stored in an SQL database
|
8
|
-
* Some caching data are stored in the same
|
8
|
+
* Some caching data are stored in the same database
|
9
9
|
|
10
10
|
= Required tools:
|
11
11
|
|
@@ -42,10 +42,28 @@ Basic configuration is done through environment variables:
|
|
42
42
|
* +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
|
43
43
|
* +PHANTOMJS_PATH+: path to the PhantomJS executable if it isn't in the path
|
44
44
|
* +GRAPHICS_MAGICK_PATH+ : path to GraphicsMagick executable if it isn't in the path
|
45
|
-
|
45
|
+
* +BACKGROUND_THREAD_POOL_SIZE+: EventMachine pool size for background tasks like taking the snapshots (defaults to 20)
|
46
46
|
Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
|
47
47
|
|
48
48
|
To enable debugging use
|
49
49
|
|
50
50
|
WebpageArchivist.log= true
|
51
51
|
|
52
|
+
|
53
|
+
= Connect to the database / run migrations
|
54
|
+
|
55
|
+
The database connection is available as <tt>WebpageArchivist::DATABASE</tt> and if you want to run your own migrations use
|
56
|
+
|
57
|
+
require 'webpage-archivist/migrations'
|
58
|
+
WebpageArchivist::Migrations.migration 'create table foo' do
|
59
|
+
WebpageArchivist::DATABASE.create_table :foos do
|
60
|
+
primary_key :id
|
61
|
+
# ...
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
WebpageArchivist::Migrations.new.run
|
66
|
+
|
67
|
+
this way your migrations will be run when the corresponding class is loaded
|
68
|
+
|
69
|
+
= Released under the MIT license
|
@@ -1,11 +1,14 @@
|
|
1
1
|
require 'eventmachine'
|
2
|
-
require_relative 'thread-pool'
|
3
2
|
|
4
3
|
# Module in charge of fetching pages
|
5
4
|
module WebpageArchivist::Fetcher
|
6
5
|
|
7
6
|
SEMAPHORE = Mutex.new
|
8
7
|
|
8
|
+
if ENV['BACKGROUND_THREAD_POOL_SIZE']
|
9
|
+
EventMachine.threadpool_size= ENV['BACKGROUND_THREAD_POOL_SIZE'].to_i
|
10
|
+
end
|
11
|
+
|
9
12
|
# Fetch several webpages and return a hash indexed by the webpages holding the corresponding Instances or http result codes
|
10
13
|
# (may be existing instances if the pages haven't changed)
|
11
14
|
def self.fetch_webpages webpages
|
@@ -39,33 +42,37 @@ module WebpageArchivist::Fetcher
|
|
39
42
|
@waiting_requests = 0
|
40
43
|
@status = :starting
|
41
44
|
@requests = []
|
42
|
-
|
45
|
+
end
|
46
|
+
|
47
|
+
# Add a request to wait for
|
48
|
+
def add_request request
|
49
|
+
@waiting_requests += 1
|
50
|
+
@requests << request
|
43
51
|
end
|
44
52
|
|
45
53
|
# Start to wait
|
46
54
|
def wait
|
47
55
|
@status = :waiting
|
48
|
-
if @waiting_requests
|
56
|
+
if @waiting_requests <= 0
|
49
57
|
end_watcher
|
50
58
|
end
|
51
59
|
end
|
52
60
|
|
53
|
-
# Add a request to wait for
|
54
|
-
def add_request request
|
55
|
-
@waiting_requests += 1
|
56
|
-
@requests << request
|
57
|
-
end
|
58
|
-
|
59
61
|
# A request is over
|
60
62
|
# request:: the request
|
61
63
|
# ok:: indicates if the request went ok, in this case ask for a snapshot
|
62
64
|
def end_request request, ok
|
63
|
-
@waiting_requests -= 1
|
64
65
|
if ok && request.instance
|
65
|
-
|
66
|
-
|
67
|
-
|
66
|
+
operation = proc { ::WebpageArchivist::Snapshoter.snapshot_instance(request.instance) }
|
67
|
+
callback = proc { end_request_inner }
|
68
|
+
EM.defer(operation, callback)
|
69
|
+
else
|
70
|
+
end_request_inner
|
68
71
|
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def end_request_inner
|
75
|
+
@waiting_requests -= 1
|
69
76
|
if (@status == :waiting) && (@waiting_requests <= 0)
|
70
77
|
end_watcher
|
71
78
|
end
|
@@ -74,7 +81,6 @@ module WebpageArchivist::Fetcher
|
|
74
81
|
# End the watch
|
75
82
|
def end_watcher
|
76
83
|
EM.stop
|
77
|
-
@thread_pool.shutdown
|
78
84
|
end
|
79
85
|
|
80
86
|
|
@@ -95,6 +95,18 @@ module WebpageArchivist
|
|
95
95
|
validates_presence [:webpage_id, :commit_timestamp]
|
96
96
|
end
|
97
97
|
|
98
|
+
def small_snapshot_path
|
99
|
+
if snapshot
|
100
|
+
"#{webpage.id}/#{self.id}-small.#{Snapshoter.format}"
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def snapshot_path
|
105
|
+
if snapshot
|
106
|
+
"#{webpage.id}/#{self.id}.#{Snapshoter.format}"
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
98
110
|
end
|
99
111
|
|
100
112
|
module ElementWithContent
|
data/webpage-archivist.gemspec
CHANGED
@@ -18,14 +18,14 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.rdoc_options = ['--main', 'README.rdoc']
|
19
19
|
|
20
20
|
s.add_runtime_dependency 'andand', '~> 1.3.1'
|
21
|
-
s.add_runtime_dependency 'sequel', '~> 3.
|
22
|
-
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.
|
23
|
-
s.add_runtime_dependency 'em-http-request', '~> 1.0.0
|
21
|
+
s.add_runtime_dependency 'sequel', '~> 3.28'
|
22
|
+
s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.4'
|
23
|
+
s.add_runtime_dependency 'em-http-request', '~> 1.0.0'
|
24
24
|
s.add_runtime_dependency 'nokogiri', '~> 1.5'
|
25
25
|
s.add_runtime_dependency 'addressable', '~> 2.2.6'
|
26
26
|
s.add_runtime_dependency 'css_parser', '~> 1.2.3'
|
27
27
|
s.add_runtime_dependency 'grit', '~> 2.4.1'
|
28
|
-
s.add_runtime_dependency 'mime-types', '~> 1.
|
28
|
+
s.add_runtime_dependency 'mime-types', '~> 1.17.2'
|
29
29
|
|
30
30
|
s.add_development_dependency 'sqlite3', '~> 1.3.3'
|
31
31
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage-archivist
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-11-01 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: andand
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153126980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,43 +21,43 @@ dependencies:
|
|
21
21
|
version: 1.3.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153126980
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: sequel
|
27
|
-
requirement: &
|
27
|
+
requirement: &2153124860 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: '3.
|
32
|
+
version: '3.28'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2153124860
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: eventmachine
|
38
|
-
requirement: &
|
38
|
+
requirement: &2153124200 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: 1.0.0.beta.
|
43
|
+
version: 1.0.0.beta.4
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153124200
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: em-http-request
|
49
|
-
requirement: &
|
49
|
+
requirement: &2153123320 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.0.0
|
54
|
+
version: 1.0.0
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2153123320
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: nokogiri
|
60
|
-
requirement: &
|
60
|
+
requirement: &2153122200 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '1.5'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2153122200
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: addressable
|
71
|
-
requirement: &
|
71
|
+
requirement: &2153121340 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 2.2.6
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2153121340
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: css_parser
|
82
|
-
requirement: &
|
82
|
+
requirement: &2153120740 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 1.2.3
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2153120740
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: grit
|
93
|
-
requirement: &
|
93
|
+
requirement: &2153120160 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ~>
|
@@ -98,21 +98,21 @@ dependencies:
|
|
98
98
|
version: 2.4.1
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2153120160
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: mime-types
|
104
|
-
requirement: &
|
104
|
+
requirement: &2153119520 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ~>
|
108
108
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
109
|
+
version: 1.17.2
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *2153119520
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: sqlite3
|
115
|
-
requirement: &
|
115
|
+
requirement: &2153118740 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ~>
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.3.3
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *2153118740
|
124
124
|
description: An utility to archive webpages through time
|
125
125
|
email:
|
126
126
|
executables: []
|
@@ -139,7 +139,6 @@ files:
|
|
139
139
|
- lib/webpage-archivist/fetcher/fetcher.rb
|
140
140
|
- lib/webpage-archivist/fetcher/requests_plumber.rb
|
141
141
|
- lib/webpage-archivist/fetcher/stylesheet_request.rb
|
142
|
-
- lib/webpage-archivist/fetcher/thread-pool.rb
|
143
142
|
- lib/webpage-archivist/fetcher/webpage_request.rb
|
144
143
|
- lib/webpage-archivist/html_document.rb
|
145
144
|
- lib/webpage-archivist/migrations.rb
|
@@ -177,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
177
176
|
version: '0'
|
178
177
|
requirements: []
|
179
178
|
rubyforge_project: webpage-archivist
|
180
|
-
rubygems_version: 1.8.
|
179
|
+
rubygems_version: 1.8.11
|
181
180
|
signing_key:
|
182
181
|
specification_version: 3
|
183
182
|
summary: An utility to archive webpages through time
|
@@ -1,101 +0,0 @@
|
|
1
|
-
# Ruby Thread Pool
|
2
|
-
# ================
|
3
|
-
# A thread pool is useful when you wish to do some work in a thread, but do
|
4
|
-
# not know how much work you will be doing in advance. Spawning one thread
|
5
|
-
# for each task is potentially expensive, as threads are not free.
|
6
|
-
#
|
7
|
-
# In this case, it might be more beneficial to start a predefined set of
|
8
|
-
# threads and then hand off work to them as it becomes available. This is
|
9
|
-
# the pure essence of what a thread pool is: an array of threads, all just
|
10
|
-
# waiting to do some work for you!
|
11
|
-
#
|
12
|
-
# Prerequisites
|
13
|
-
# -------------
|
14
|
-
|
15
|
-
# We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
|
16
|
-
# thread pool is largely dependent on it. Thanks to this, the implementation
|
17
|
-
# becomes very simple!
|
18
|
-
require 'thread'
|
19
|
-
|
20
|
-
# Public Interface
|
21
|
-
# ----------------
|
22
|
-
|
23
|
-
# `Pool` is our thread pool class. It will allow us to do three operations:
|
24
|
-
#
|
25
|
-
# - `.new(size)` creates a thread pool of a given size
|
26
|
-
# - `#schedule(*args, &job)` schedules a new job to be executed
|
27
|
-
# - `#shutdown` shuts down all threads (after letting them finish working, of course)
|
28
|
-
class Pool
|
29
|
-
|
30
|
-
# ### initialization, or `Pool.new(size)`
|
31
|
-
# Creating a new `Pool` involves a certain amount of work. First, however,
|
32
|
-
# we need to define its `size`. It defines how many threads we will have
|
33
|
-
# working internally.
|
34
|
-
#
|
35
|
-
# Which size is best for you is hard to answer. You do not want it to be
|
36
|
-
# too low, as then you won’t be able to do as many things concurrently.
|
37
|
-
# However, if you make it too high Ruby will spend too much time switching
|
38
|
-
# between threads, and that will also degrade performance!
|
39
|
-
def initialize(size)
|
40
|
-
# Before we do anything else, we need to store some information about
|
41
|
-
# our pool. `@size` is useful later, when we want to shut our pool down,
|
42
|
-
# and `@jobs` is the heart of our pool that allows us to schedule work.
|
43
|
-
@size = size
|
44
|
-
@jobs = Queue.new
|
45
|
-
|
46
|
-
# #### Creating our pool of threads
|
47
|
-
# Once preparation is done, it’s time to create our pool of threads.
|
48
|
-
# Each thread stores its index in a thread-local variable, in case we
|
49
|
-
# need to know which thread a job is executing in later on.
|
50
|
-
@pool = Array.new(@size) do |i|
|
51
|
-
Thread.new do
|
52
|
-
Thread.current[:id] = i
|
53
|
-
|
54
|
-
# We start off by defining a `catch` around our worker loop. This
|
55
|
-
# way we’ve provided a method for graceful shutdown of our threads.
|
56
|
-
# Shutting down is merely a `#schedule { throw :exit }` away!
|
57
|
-
catch(:exit) do
|
58
|
-
# The worker thread life-cycle is very simple. We continuously wait
|
59
|
-
# for tasks to be put into our job `Queue`. If the `Queue` is empty,
|
60
|
-
# we will wait until it’s not.
|
61
|
-
loop do
|
62
|
-
# Once we have a piece of work to be done, we will pull out the
|
63
|
-
# information we need and get to work.
|
64
|
-
job, args = @jobs.pop
|
65
|
-
job.call(*args)
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
# ### Work scheduling
|
73
|
-
|
74
|
-
# To schedule a piece of work to be done is to say to the `Pool` that you
|
75
|
-
# want something done.
|
76
|
-
def schedule(*args, &block)
|
77
|
-
# Your given task will not be run immediately; rather, it will be put
|
78
|
-
# into the work `Queue` and executed once a thread is ready to work.
|
79
|
-
@jobs << [block, args]
|
80
|
-
end
|
81
|
-
|
82
|
-
# ### Graceful shutdown
|
83
|
-
|
84
|
-
# If you ever wish to close down your application, I took the liberty of
|
85
|
-
# making it easy for you to wait for any currently executing jobs to finish
|
86
|
-
# before you exit.
|
87
|
-
def shutdown
|
88
|
-
# A graceful shutdown involves threads exiting cleanly themselves, and
|
89
|
-
# since we’ve defined a `catch`-handler around the threads’ worker loop
|
90
|
-
# it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
|
91
|
-
# for each thread in our pool, they will all exit eventually!
|
92
|
-
@size.times do
|
93
|
-
schedule { throw :exit }
|
94
|
-
end
|
95
|
-
|
96
|
-
# And now one final thing: wait for our `throw :exit` jobs to be run on
|
97
|
-
# all our worker threads. This call will not return until all worker threads
|
98
|
-
# have exited.
|
99
|
-
@pool.map(&:join)
|
100
|
-
end
|
101
|
-
end
|