webpage-archivist 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,10 @@
1
+ # 0.0.3
2
+
3
+ - clear migration list after playing them
4
+ - use EventMachine.defer instead of custom pool size
5
+
1
6
  # 0.0.2
2
7
 
3
8
  - replace websnap + wkhtmltoimage by PhantomJS
4
9
  - replace mini_magick by custom code
5
- - WebpageArchivist#fetch_webpages no takes pages instead of ids
6
-
10
+ - WebpageArchivist#fetch_webpages now takes pages instead of ids
@@ -5,7 +5,7 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
5
5
  * Assets are stored in a git repository to simplify incremental storage and easy retrieval
6
6
  * Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
7
7
  * List of webpages and archives instances are stored in an SQL database
8
- * Some caching data are stored in the same databse
8
+ * Some caching data are stored in the same database
9
9
 
10
10
  = Required tools:
11
11
 
@@ -42,10 +42,28 @@ Basic configuration is done through environment variables:
42
42
  * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
43
43
  * +PHANTOMJS_PATH+: path to PhantomJS executable if they aren't in the path
44
44
  * +GRAPHICS_MAGICK_PATH+ : path to GraphicsMagick executable if it isn't in the path
45
-
45
+ * +BACKGROUND_THREAD_POOL_SIZE+: EventMachine pool size for background tasks like taking the snapshots (default to 20)
46
46
  Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
47
47
 
48
48
  To enable debugging use
49
49
 
50
50
  WebpageArchivist.log= true
51
51
 
52
+
53
+ = Connect to the database / run migrations
54
+
55
+ The database connection is available as <tt>WebpageArchivist::DATABASE</tt> and if you want to run your own migrations use
56
+
57
+ require 'webpage-archivist/migrations'
58
+ WebpageArchivist::Migrations.migration 'create table foo' do
59
+ WebpageArchivist::DATABASE.create_table :foos do
60
+ primary_key :id
61
+ # ...
62
+ end
63
+ end
64
+
65
+ WebpageArchivist::Migrations.new.run
66
+
67
+ this way your migrations will be run when the corresponding class is loaded
68
+
69
+ = Released under the MIT license
@@ -1,11 +1,14 @@
1
1
  require 'eventmachine'
2
- require_relative 'thread-pool'
3
2
 
4
3
  # Module in charge of fetching pages
5
4
  module WebpageArchivist::Fetcher
6
5
 
7
6
  SEMAPHORE = Mutex.new
8
7
 
8
+ if ENV['BACKGROUND_THREAD_POOL_SIZE']
9
+ EventMachine.threadpool_size= ENV['BACKGROUND_THREAD_POOL_SIZE'].to_i
10
+ end
11
+
9
12
  # Fetch several webpages, return an hash indexed by the webpages holding the corresponding Instances or http result codes
10
13
  # (may be existing instances if the pages haven't changed)
11
14
  def self.fetch_webpages webpages
@@ -39,33 +42,37 @@ module WebpageArchivist::Fetcher
39
42
  @waiting_requests = 0
40
43
  @status = :starting
41
44
  @requests = []
42
- @thread_pool = Pool.new 1
45
+ end
46
+
47
+ # Add a request to wait for
48
+ def add_request request
49
+ @waiting_requests += 1
50
+ @requests << request
43
51
  end
44
52
 
45
53
  # Start to wait
46
54
  def wait
47
55
  @status = :waiting
48
- if @waiting_requests == 0
56
+ if @waiting_requests <= 0
49
57
  end_watcher
50
58
  end
51
59
  end
52
60
 
53
- # Add a request to wait for
54
- def add_request request
55
- @waiting_requests += 1
56
- @requests << request
57
- end
58
-
59
61
  # A request is over
60
62
  # request:: the request
61
63
  # ok:: indicates if the request went ok, in this case ask for a snapshot
62
64
  def end_request request, ok
63
- @waiting_requests -= 1
64
65
  if ok && request.instance
65
- @thread_pool.schedule do
66
- ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
67
- end
66
+ operation = proc { ::WebpageArchivist::Snapshoter.snapshot_instance(request.instance) }
67
+ callback = proc { end_request_inner }
68
+ EM.defer(operation, callback)
69
+ else
70
+ end_request_inner
68
71
  end
72
+ end
73
+
74
+ def end_request_inner
75
+ @waiting_requests -= 1
69
76
  if (@status == :waiting) && (@waiting_requests <= 0)
70
77
  end_watcher
71
78
  end
@@ -74,7 +81,6 @@ module WebpageArchivist::Fetcher
74
81
  # End the watch
75
82
  def end_watcher
76
83
  EM.stop
77
- @thread_pool.shutdown
78
84
  end
79
85
 
80
86
 
@@ -21,6 +21,7 @@ module WebpageArchivist
21
21
  end
22
22
  end
23
23
  end
24
+ @@migrations.clear
24
25
  end
25
26
 
26
27
  def self.migration(name, &block)
@@ -95,6 +95,18 @@ module WebpageArchivist
95
95
  validates_presence [:webpage_id, :commit_timestamp]
96
96
  end
97
97
 
98
+ def small_snapshot_path
99
+ if snapshot
100
+ "#{webpage.id}/#{self.id}-small.#{Snapshoter.format}"
101
+ end
102
+ end
103
+
104
+ def snapshot_path
105
+ if snapshot
106
+ "#{webpage.id}/#{self.id}.#{Snapshoter.format}"
107
+ end
108
+ end
109
+
98
110
  end
99
111
 
100
112
  module ElementWithContent
@@ -1,3 +1,3 @@
1
1
  module WebpageArchivist
2
- VERSION = "0.0.2"
2
+ VERSION = '0.0.3'
3
3
  end
@@ -18,14 +18,14 @@ Gem::Specification.new do |s|
18
18
  s.rdoc_options = ['--main', 'README.rdoc']
19
19
 
20
20
  s.add_runtime_dependency 'andand', '~> 1.3.1'
21
- s.add_runtime_dependency 'sequel', '~> 3.25'
22
- s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
23
- s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
21
+ s.add_runtime_dependency 'sequel', '~> 3.28'
22
+ s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.4'
23
+ s.add_runtime_dependency 'em-http-request', '~> 1.0.0'
24
24
  s.add_runtime_dependency 'nokogiri', '~> 1.5'
25
25
  s.add_runtime_dependency 'addressable', '~> 2.2.6'
26
26
  s.add_runtime_dependency 'css_parser', '~> 1.2.3'
27
27
  s.add_runtime_dependency 'grit', '~> 2.4.1'
28
- s.add_runtime_dependency 'mime-types', '~> 1.16'
28
+ s.add_runtime_dependency 'mime-types', '~> 1.17.2'
29
29
 
30
30
  s.add_development_dependency 'sqlite3', '~> 1.3.3'
31
31
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage-archivist
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-08 00:00:00.000000000Z
12
+ date: 2011-11-01 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: andand
16
- requirement: &2160802540 !ruby/object:Gem::Requirement
16
+ requirement: &2153126980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,43 +21,43 @@ dependencies:
21
21
  version: 1.3.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2160802540
24
+ version_requirements: *2153126980
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: sequel
27
- requirement: &2160802040 !ruby/object:Gem::Requirement
27
+ requirement: &2153124860 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
31
31
  - !ruby/object:Gem::Version
32
- version: '3.25'
32
+ version: '3.28'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2160802040
35
+ version_requirements: *2153124860
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: eventmachine
38
- requirement: &2160801580 !ruby/object:Gem::Requirement
38
+ requirement: &2153124200 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
42
42
  - !ruby/object:Gem::Version
43
- version: 1.0.0.beta.3
43
+ version: 1.0.0.beta.4
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2160801580
46
+ version_requirements: *2153124200
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: em-http-request
49
- requirement: &2160801120 !ruby/object:Gem::Requirement
49
+ requirement: &2153123320 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
53
53
  - !ruby/object:Gem::Version
54
- version: 1.0.0.beta.4
54
+ version: 1.0.0
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *2160801120
57
+ version_requirements: *2153123320
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: nokogiri
60
- requirement: &2160800660 !ruby/object:Gem::Requirement
60
+ requirement: &2153122200 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '1.5'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *2160800660
68
+ version_requirements: *2153122200
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: addressable
71
- requirement: &2160800200 !ruby/object:Gem::Requirement
71
+ requirement: &2153121340 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 2.2.6
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *2160800200
79
+ version_requirements: *2153121340
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: css_parser
82
- requirement: &2160799740 !ruby/object:Gem::Requirement
82
+ requirement: &2153120740 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ~>
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 1.2.3
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *2160799740
90
+ version_requirements: *2153120740
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: grit
93
- requirement: &2160799280 !ruby/object:Gem::Requirement
93
+ requirement: &2153120160 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ~>
@@ -98,21 +98,21 @@ dependencies:
98
98
  version: 2.4.1
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *2160799280
101
+ version_requirements: *2153120160
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: mime-types
104
- requirement: &2160798820 !ruby/object:Gem::Requirement
104
+ requirement: &2153119520 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ~>
108
108
  - !ruby/object:Gem::Version
109
- version: '1.16'
109
+ version: 1.17.2
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *2160798820
112
+ version_requirements: *2153119520
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: sqlite3
115
- requirement: &2160798360 !ruby/object:Gem::Requirement
115
+ requirement: &2153118740 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ~>
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.3.3
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *2160798360
123
+ version_requirements: *2153118740
124
124
  description: An utility to archive webpages through time
125
125
  email:
126
126
  executables: []
@@ -139,7 +139,6 @@ files:
139
139
  - lib/webpage-archivist/fetcher/fetcher.rb
140
140
  - lib/webpage-archivist/fetcher/requests_plumber.rb
141
141
  - lib/webpage-archivist/fetcher/stylesheet_request.rb
142
- - lib/webpage-archivist/fetcher/thread-pool.rb
143
142
  - lib/webpage-archivist/fetcher/webpage_request.rb
144
143
  - lib/webpage-archivist/html_document.rb
145
144
  - lib/webpage-archivist/migrations.rb
@@ -177,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
177
176
  version: '0'
178
177
  requirements: []
179
178
  rubyforge_project: webpage-archivist
180
- rubygems_version: 1.8.8
179
+ rubygems_version: 1.8.11
181
180
  signing_key:
182
181
  specification_version: 3
183
182
  summary: An utility to archive webpages through time
@@ -1,101 +0,0 @@
1
- # Ruby Thread Pool
2
- # ================
3
- # A thread pool is useful when you wish to do some work in a thread, but do
4
- # not know how much work you will be doing in advance. Spawning one thread
5
- # for each task is potentially expensive, as threads are not free.
6
- #
7
- # In this case, it might be more beneficial to start a predefined set of
8
- # threads and then hand off work to them as it becomes available. This is
9
- # the pure essence of what a thread pool is: an array of threads, all just
10
- # waiting to do some work for you!
11
- #
12
- # Prerequisites
13
- # -------------
14
-
15
- # We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
16
- # thread pool is largely dependent on it. Thanks to this, the implementation
17
- # becomes very simple!
18
- require 'thread'
19
-
20
- # Public Interface
21
- # ----------------
22
-
23
- # `Pool` is our thread pool class. It will allow us to do three operations:
24
- #
25
- # - `.new(size)` creates a thread pool of a given size
26
- # - `#schedule(*args, &job)` schedules a new job to be executed
27
- # - `#shutdown` shuts down all threads (after letting them finish working, of course)
28
- class Pool
29
-
30
- # ### initialization, or `Pool.new(size)`
31
- # Creating a new `Pool` involves a certain amount of work. First, however,
32
- # we need to define its’ `size`. It defines how many threads we will have
33
- # working internally.
34
- #
35
- # Which size is best for you is hard to answer. You do not want it to be
36
- # too low, as then you won’t be able to do as many things concurrently.
37
- # However, if you make it too high Ruby will spend too much time switching
38
- # between threads, and that will also degrade performance!
39
- def initialize(size)
40
- # Before we do anything else, we need to store some information about
41
- # our pool. `@size` is useful later, when we want to shut our pool down,
42
- # and `@jobs` is the heart of our pool that allows us to schedule work.
43
- @size = size
44
- @jobs = Queue.new
45
-
46
- # #### Creating our pool of threads
47
- # Once preparation is done, it’s time to create our pool of threads.
48
- # Each thread store its’ index in a thread-local variable, in case we
49
- # need to know which thread a job is executing in later on.
50
- @pool = Array.new(@size) do |i|
51
- Thread.new do
52
- Thread.current[:id] = i
53
-
54
- # We start off by defining a `catch` around our worker loop. This
55
- # way we’ve provided a method for graceful shutdown of our threads.
56
- # Shutting down is merely a `#schedule { throw :exit }` away!
57
- catch(:exit) do
58
- # The worker thread life-cycle is very simple. We continuously wait
59
- # for tasks to be put into our job `Queue`. If the `Queue` is empty,
60
- # we will wait until it’s not.
61
- loop do
62
- # Once we have a piece of work to be done, we will pull out the
63
- # information we need and get to work.
64
- job, args = @jobs.pop
65
- job.call(*args)
66
- end
67
- end
68
- end
69
- end
70
- end
71
-
72
- # ### Work scheduling
73
-
74
- # To schedule a piece of work to be done is to say to the `Pool` that you
75
- # want something done.
76
- def schedule(*args, &block)
77
- # Your given task will not be run immediately; rather, it will be put
78
- # into the work `Queue` and executed once a thread is ready to work.
79
- @jobs << [block, args]
80
- end
81
-
82
- # ### Graceful shutdown
83
-
84
- # If you ever wish to close down your application, I took the liberty of
85
- # making it easy for you to wait for any currently executing jobs to finish
86
- # before you exit.
87
- def shutdown
88
- # A graceful shutdown involves threads exiting cleanly themselves, and
89
- # since we’ve defined a `catch`-handler around the threads’ worker loop
90
- # it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
91
- # for each thread in our pool, they will all exit eventually!
92
- @size.times do
93
- schedule { throw :exit }
94
- end
95
-
96
- # And now one final thing: wait for our `throw :exit` jobs to be run on
97
- # all our worker threads. This call will not return until all worker threads
98
- # have exited.
99
- @pool.map(&:join)
100
- end
101
- end