webpage-archivist 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,10 @@
1
+ # 0.0.3
2
+
3
+ - clear migration list after playing them
4
+ - use EventMachine.defer instead of custom pool size
5
+
1
6
  # 0.0.2
2
7
 
3
8
  - replace websnap + wkhtmltoimage by PhantomJS
4
9
  - replace mini_magick by custom code
5
- - WebpageArchivist#fetch_webpages no takes pages instead of ids
6
-
10
+ - WebpageArchivist#fetch_webpages now takes pages instead of ids
@@ -5,7 +5,7 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
5
5
  * Assets are stored in a git repository to simplify incremental storage and easy retrieval
6
6
  * Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
7
7
  * List of webpages and archives instances are stored in an SQL database
8
- * Some caching data are stored in the same databse
8
+ * Some caching data are stored in the same database
9
9
 
10
10
  = Required tools:
11
11
 
@@ -42,10 +42,28 @@ Basic configuration is done through environment variables:
42
42
  * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of element requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
43
43
  * +PHANTOMJS_PATH+: path to PhantomJS executable if they aren't in the path
44
44
  * +GRAPHICS_MAGICK_PATH+ : path to GraphicsMagick executable if it isn't in the path
45
-
45
+ * +BACKGROUND_THREAD_POOL_SIZE+: EventMachine pool size for background tasks like taking the snapshots (default to 20)
46
46
  Configuration for snapshotting is done through the WebpageArchivist::Snapshoter class.
47
47
 
48
48
  To enable debugging use
49
49
 
50
50
  WebpageArchivist.log= true
51
51
 
52
+
53
+ = Connect to the database / run migrations
54
+
55
+ The database connection is available as <tt>WebpageArchivist::DATABASE</tt> and if you want to run your own migrations use
56
+
57
+ require 'webpage-archivist/migrations'
58
+ WebpageArchivist::Migrations.migration 'create table foo' do
59
+ WebpageArchivist::DATABASE.create_table :foos do
60
+ primary_key :id
61
+ # ...
62
+ end
63
+ end
64
+
65
+ WebpageArchivist::Migrations.new.run
66
+
67
+ this way your migrations will be run when the corresponding class is loaded
68
+
69
+ = Released under the MIT license
@@ -1,11 +1,14 @@
1
1
  require 'eventmachine'
2
- require_relative 'thread-pool'
3
2
 
4
3
  # Module in charge of fetching pages
5
4
  module WebpageArchivist::Fetcher
6
5
 
7
6
  SEMAPHORE = Mutex.new
8
7
 
8
+ if ENV['BACKGROUND_THREAD_POOL_SIZE']
9
+ EventMachine.threadpool_size= ENV['BACKGROUND_THREAD_POOL_SIZE'].to_i
10
+ end
11
+
9
12
  # Fetch several webpages, return a hash indexed by the webpages holding the corresponding Instances or http result codes
10
13
  # (may be existing instances if the pages haven't changed)
11
14
  def self.fetch_webpages webpages
@@ -39,33 +42,37 @@ module WebpageArchivist::Fetcher
39
42
  @waiting_requests = 0
40
43
  @status = :starting
41
44
  @requests = []
42
- @thread_pool = Pool.new 1
45
+ end
46
+
47
+ # Add a request to wait for
48
+ def add_request request
49
+ @waiting_requests += 1
50
+ @requests << request
43
51
  end
44
52
 
45
53
  # Start to wait
46
54
  def wait
47
55
  @status = :waiting
48
- if @waiting_requests == 0
56
+ if @waiting_requests <= 0
49
57
  end_watcher
50
58
  end
51
59
  end
52
60
 
53
- # Add a request to wait for
54
- def add_request request
55
- @waiting_requests += 1
56
- @requests << request
57
- end
58
-
59
61
  # A request is over
60
62
  # request:: the request
61
63
  # ok:: indicates if the request went ok, in this case ask for a snapshot
62
64
  def end_request request, ok
63
- @waiting_requests -= 1
64
65
  if ok && request.instance
65
- @thread_pool.schedule do
66
- ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
67
- end
66
+ operation = proc { ::WebpageArchivist::Snapshoter.snapshot_instance(request.instance) }
67
+ callback = proc { end_request_inner }
68
+ EM.defer(operation, callback)
69
+ else
70
+ end_request_inner
68
71
  end
72
+ end
73
+
74
+ def end_request_inner
75
+ @waiting_requests -= 1
69
76
  if (@status == :waiting) && (@waiting_requests <= 0)
70
77
  end_watcher
71
78
  end
@@ -74,7 +81,6 @@ module WebpageArchivist::Fetcher
74
81
  # End the watch
75
82
  def end_watcher
76
83
  EM.stop
77
- @thread_pool.shutdown
78
84
  end
79
85
 
80
86
 
@@ -21,6 +21,7 @@ module WebpageArchivist
21
21
  end
22
22
  end
23
23
  end
24
+ @@migrations.clear
24
25
  end
25
26
 
26
27
  def self.migration(name, &block)
@@ -95,6 +95,18 @@ module WebpageArchivist
95
95
  validates_presence [:webpage_id, :commit_timestamp]
96
96
  end
97
97
 
98
+ def small_snapshot_path
99
+ if snapshot
100
+ "#{webpage.id}/#{self.id}-small.#{Snapshoter.format}"
101
+ end
102
+ end
103
+
104
+ def snapshot_path
105
+ if snapshot
106
+ "#{webpage.id}/#{self.id}.#{Snapshoter.format}"
107
+ end
108
+ end
109
+
98
110
  end
99
111
 
100
112
  module ElementWithContent
@@ -1,3 +1,3 @@
1
1
  module WebpageArchivist
2
- VERSION = "0.0.2"
2
+ VERSION = '0.0.3'
3
3
  end
@@ -18,14 +18,14 @@ Gem::Specification.new do |s|
18
18
  s.rdoc_options = ['--main', 'README.rdoc']
19
19
 
20
20
  s.add_runtime_dependency 'andand', '~> 1.3.1'
21
- s.add_runtime_dependency 'sequel', '~> 3.25'
22
- s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
23
- s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
21
+ s.add_runtime_dependency 'sequel', '~> 3.28'
22
+ s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.4'
23
+ s.add_runtime_dependency 'em-http-request', '~> 1.0.0'
24
24
  s.add_runtime_dependency 'nokogiri', '~> 1.5'
25
25
  s.add_runtime_dependency 'addressable', '~> 2.2.6'
26
26
  s.add_runtime_dependency 'css_parser', '~> 1.2.3'
27
27
  s.add_runtime_dependency 'grit', '~> 2.4.1'
28
- s.add_runtime_dependency 'mime-types', '~> 1.16'
28
+ s.add_runtime_dependency 'mime-types', '~> 1.17.2'
29
29
 
30
30
  s.add_development_dependency 'sqlite3', '~> 1.3.3'
31
31
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage-archivist
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-08 00:00:00.000000000Z
12
+ date: 2011-11-01 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: andand
16
- requirement: &2160802540 !ruby/object:Gem::Requirement
16
+ requirement: &2153126980 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,43 +21,43 @@ dependencies:
21
21
  version: 1.3.1
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2160802540
24
+ version_requirements: *2153126980
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: sequel
27
- requirement: &2160802040 !ruby/object:Gem::Requirement
27
+ requirement: &2153124860 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
31
31
  - !ruby/object:Gem::Version
32
- version: '3.25'
32
+ version: '3.28'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2160802040
35
+ version_requirements: *2153124860
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: eventmachine
38
- requirement: &2160801580 !ruby/object:Gem::Requirement
38
+ requirement: &2153124200 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
42
42
  - !ruby/object:Gem::Version
43
- version: 1.0.0.beta.3
43
+ version: 1.0.0.beta.4
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2160801580
46
+ version_requirements: *2153124200
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: em-http-request
49
- requirement: &2160801120 !ruby/object:Gem::Requirement
49
+ requirement: &2153123320 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
53
53
  - !ruby/object:Gem::Version
54
- version: 1.0.0.beta.4
54
+ version: 1.0.0
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *2160801120
57
+ version_requirements: *2153123320
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: nokogiri
60
- requirement: &2160800660 !ruby/object:Gem::Requirement
60
+ requirement: &2153122200 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '1.5'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *2160800660
68
+ version_requirements: *2153122200
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: addressable
71
- requirement: &2160800200 !ruby/object:Gem::Requirement
71
+ requirement: &2153121340 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 2.2.6
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *2160800200
79
+ version_requirements: *2153121340
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: css_parser
82
- requirement: &2160799740 !ruby/object:Gem::Requirement
82
+ requirement: &2153120740 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ~>
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: 1.2.3
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *2160799740
90
+ version_requirements: *2153120740
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: grit
93
- requirement: &2160799280 !ruby/object:Gem::Requirement
93
+ requirement: &2153120160 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ~>
@@ -98,21 +98,21 @@ dependencies:
98
98
  version: 2.4.1
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *2160799280
101
+ version_requirements: *2153120160
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: mime-types
104
- requirement: &2160798820 !ruby/object:Gem::Requirement
104
+ requirement: &2153119520 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ~>
108
108
  - !ruby/object:Gem::Version
109
- version: '1.16'
109
+ version: 1.17.2
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *2160798820
112
+ version_requirements: *2153119520
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: sqlite3
115
- requirement: &2160798360 !ruby/object:Gem::Requirement
115
+ requirement: &2153118740 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ~>
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.3.3
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *2160798360
123
+ version_requirements: *2153118740
124
124
  description: An utility to archive webpages through time
125
125
  email:
126
126
  executables: []
@@ -139,7 +139,6 @@ files:
139
139
  - lib/webpage-archivist/fetcher/fetcher.rb
140
140
  - lib/webpage-archivist/fetcher/requests_plumber.rb
141
141
  - lib/webpage-archivist/fetcher/stylesheet_request.rb
142
- - lib/webpage-archivist/fetcher/thread-pool.rb
143
142
  - lib/webpage-archivist/fetcher/webpage_request.rb
144
143
  - lib/webpage-archivist/html_document.rb
145
144
  - lib/webpage-archivist/migrations.rb
@@ -177,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
177
176
  version: '0'
178
177
  requirements: []
179
178
  rubyforge_project: webpage-archivist
180
- rubygems_version: 1.8.8
179
+ rubygems_version: 1.8.11
181
180
  signing_key:
182
181
  specification_version: 3
183
182
  summary: An utility to archive webpages through time
@@ -1,101 +0,0 @@
1
- # Ruby Thread Pool
2
- # ================
3
- # A thread pool is useful when you wish to do some work in a thread, but do
4
- # not know how much work you will be doing in advance. Spawning one thread
5
- # for each task is potentially expensive, as threads are not free.
6
- #
7
- # In this case, it might be more beneficial to start a predefined set of
8
- # threads and then hand off work to them as it becomes available. This is
9
- # the pure essence of what a thread pool is: an array of threads, all just
10
- # waiting to do some work for you!
11
- #
12
- # Prerequisites
13
- # -------------
14
-
15
- # We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
16
- # thread pool is largely dependent on it. Thanks to this, the implementation
17
- # becomes very simple!
18
- require 'thread'
19
-
20
- # Public Interface
21
- # ----------------
22
-
23
- # `Pool` is our thread pool class. It will allow us to do three operations:
24
- #
25
- # - `.new(size)` creates a thread pool of a given size
26
- # - `#schedule(*args, &job)` schedules a new job to be executed
27
- # - `#shutdown` shuts down all threads (after letting them finish working, of course)
28
- class Pool
29
-
30
- # ### initialization, or `Pool.new(size)`
31
- # Creating a new `Pool` involves a certain amount of work. First, however,
32
- # we need to define its’ `size`. It defines how many threads we will have
33
- # working internally.
34
- #
35
- # Which size is best for you is hard to answer. You do not want it to be
36
- # too low, as then you won’t be able to do as many things concurrently.
37
- # However, if you make it too high Ruby will spend too much time switching
38
- # between threads, and that will also degrade performance!
39
- def initialize(size)
40
- # Before we do anything else, we need to store some information about
41
- # our pool. `@size` is useful later, when we want to shut our pool down,
42
- # and `@jobs` is the heart of our pool that allows us to schedule work.
43
- @size = size
44
- @jobs = Queue.new
45
-
46
- # #### Creating our pool of threads
47
- # Once preparation is done, it’s time to create our pool of threads.
48
- # Each thread store its’ index in a thread-local variable, in case we
49
- # need to know which thread a job is executing in later on.
50
- @pool = Array.new(@size) do |i|
51
- Thread.new do
52
- Thread.current[:id] = i
53
-
54
- # We start off by defining a `catch` around our worker loop. This
55
- # way we’ve provided a method for graceful shutdown of our threads.
56
- # Shutting down is merely a `#schedule { throw :exit }` away!
57
- catch(:exit) do
58
- # The worker thread life-cycle is very simple. We continuously wait
59
- # for tasks to be put into our job `Queue`. If the `Queue` is empty,
60
- # we will wait until it’s not.
61
- loop do
62
- # Once we have a piece of work to be done, we will pull out the
63
- # information we need and get to work.
64
- job, args = @jobs.pop
65
- job.call(*args)
66
- end
67
- end
68
- end
69
- end
70
- end
71
-
72
- # ### Work scheduling
73
-
74
- # To schedule a piece of work to be done is to say to the `Pool` that you
75
- # want something done.
76
- def schedule(*args, &block)
77
- # Your given task will not be run immediately; rather, it will be put
78
- # into the work `Queue` and executed once a thread is ready to work.
79
- @jobs << [block, args]
80
- end
81
-
82
- # ### Graceful shutdown
83
-
84
- # If you ever wish to close down your application, I took the liberty of
85
- # making it easy for you to wait for any currently executing jobs to finish
86
- # before you exit.
87
- def shutdown
88
- # A graceful shutdown involves threads exiting cleanly themselves, and
89
- # since we’ve defined a `catch`-handler around the threads’ worker loop
90
- # it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
91
- # for each thread in our pool, they will all exit eventually!
92
- @size.times do
93
- schedule { throw :exit }
94
- end
95
-
96
- # And now one final thing: wait for our `throw :exit` jobs to be run on
97
- # all our worker threads. This call will not return until all worker threads
98
- # have exited.
99
- @pool.map(&:join)
100
- end
101
- end