RubyGems - webpage-archivist - Versions diffs - 0.0.2 → 0.0.3 - Mend

webpage-archivist 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/CHANGELOG.md +6 -2
data/README.rdoc +20 -2
data/lib/webpage-archivist/fetcher/fetcher.rb +20 -14
data/lib/webpage-archivist/migrations.rb +1 -0
data/lib/webpage-archivist/models.rb +12 -0
data/lib/webpage-archivist/version.rb +1 -1
data/webpage-archivist.gemspec +4 -4
metadata +27 -28
data/lib/webpage-archivist/fetcher/thread-pool.rb +0 -101

data/CHANGELOG.md CHANGED

@@ -1,6 +1,10 @@
+# 0.0.3
+- clear migration list after playing them
+- use EventMachine.defer instead of a custom thread pool
 # 0.0.2
 - replace websnap + wkhtmltoimage by PhantomJS
 - replace mini_magick by custom code
-- WebpageArchivist#fetch_webpages no takes pages instead of ids
+- WebpageArchivist#fetch_webpages now takes pages instead of ids

data/README.rdoc CHANGED

@@ -5,7 +5,7 @@ Takes snapshots and make incremental backups of webpages assets so you can follo
 * Assets are stored in a git repository to simplify incremental storage and easy retrieval
 * Snapshots and thumbnails are stored in a plain repository so they can easily be served by a webserver
 * List of webpages and archives instances are stored in an SQL database
-* Some caching data are stored in the same databse
+* Some caching data are stored in the same database
 = Required tools:
@@ -42,10 +42,28 @@ Basic configuration is done through environment variables:
 * +ARCHIVIST_MAX_RUNNING_REQUESTS+ : number of elements requests running in parallel (not so important since requests are run using EventMachine[http://rubyeventmachine.com/]), defaults to 20
 * +PHANTOMJS_PATH+: path to PhantomJS executable if it isn't in the path
 * +GRAPHICS_MAGICK_PATH+ : path to GraphicsMagick executable if it isn't in the path
+* +BACKGROUND_THREAD_POOL_SIZE+: EventMachine pool size for background tasks like taking the snapshots (defaults to 20)
 Configuration for snapshoting is done through the WebpageArchivist::Snapshoter class.
 To enable debugging use
   WebpageArchivist.log= true
+= Connect to the database / run migrations
+The database connection is available as <tt>WebpageArchivist::DATABASE</tt> and if you want to run your own migrations use
+  require 'webpage-archivist/migrations'
+  WebpageArchivist::Migrations.migration 'create table foo' do
+    WebpageArchivist::DATABASE.create_table :foos do
+      primary_key :id
+      # ...
+    end
+  end
+  WebpageArchivist::Migrations.new.run
+this way your migrations will be run when the corresponding class is loaded
+= Released under the MIT license

data/lib/webpage-archivist/fetcher/fetcher.rb CHANGED

@@ -1,11 +1,14 @@
 require 'eventmachine'
-require_relative 'thread-pool'
 # Module in charge of fetching pages
 module WebpageArchivist::Fetcher
   SEMAPHORE = Mutex.new
+  if ENV['BACKGROUND_THREAD_POOL_SIZE']
+    EventMachine.threadpool_size= ENV['BACKGROUND_THREAD_POOL_SIZE'].to_i
+  end
   # Fetch several webpages, return an hash indexed by the webpages holding the corresponding Instances or http result codes
   # (may be existing instances if the pages haven't changed)
   def self.fetch_webpages webpages
@@ -39,33 +42,37 @@ module WebpageArchivist::Fetcher
       @waiting_requests = 0
       @status = :starting
       @requests = []
-      @thread_pool = Pool.new 1
+    end
+    # Add a request to wait for
+    def add_request request
+      @waiting_requests += 1
+      @requests << request
     end
     # Start to wait
     def wait
       @status = :waiting
-      if @waiting_requests == 0
+      if @waiting_requests <= 0
         end_watcher
       end
     end
-    # Add a request to wait for
-    def add_request request
-      @waiting_requests += 1
-      @requests << request
-    end
     # A request is over
     # request:: the request
     # ok:: indicates if the request went ok, in this case ask for a snapshot
     def end_request request, ok
-      @waiting_requests -= 1
       if ok && request.instance
-        @thread_pool.schedule do
-          ::WebpageArchivist::Snapshoter.snapshot_instance request.instance
-        end
+        operation = proc { ::WebpageArchivist::Snapshoter.snapshot_instance(request.instance) }
+        callback = proc { end_request_inner }
+        EM.defer(operation, callback)
+      else
+        end_request_inner
       end
+    end
+    def end_request_inner
+      @waiting_requests -= 1
       if (@status == :waiting) && (@waiting_requests <= 0)
         end_watcher
       end
@@ -74,7 +81,6 @@ module WebpageArchivist::Fetcher
     # End the watch
     def end_watcher
       EM.stop
-      @thread_pool.shutdown
     end

data/lib/webpage-archivist/migrations.rb CHANGED

@@ -21,6 +21,7 @@ module WebpageArchivist
           end
         end
       end
+      @@migrations.clear
     end
     def self.migration(name, &block)

data/lib/webpage-archivist/models.rb CHANGED

@@ -95,6 +95,18 @@ module WebpageArchivist
       validates_presence [:webpage_id, :commit_timestamp]
     end
+    def small_snapshot_path
+      if snapshot
+        "#{webpage.id}/#{self.id}-small.#{Snapshoter.format}"
+      end
+    end
+    def snapshot_path
+      if snapshot
+        "#{webpage.id}/#{self.id}.#{Snapshoter.format}"
+      end
+    end
   end
   module ElementWithContent

data/lib/webpage-archivist/version.rb CHANGED

@@ -1,3 +1,3 @@
 module WebpageArchivist
-    VERSION = "0.0.2"
+    VERSION = '0.0.3'
 end

data/webpage-archivist.gemspec CHANGED

@@ -18,14 +18,14 @@ Gem::Specification.new do |s|
   s.rdoc_options = ['--main', 'README.rdoc']
   s.add_runtime_dependency 'andand', '~> 1.3.1'
-  s.add_runtime_dependency 'sequel', '~> 3.25'
-  s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.3'
-  s.add_runtime_dependency 'em-http-request', '~> 1.0.0.beta.4'
+  s.add_runtime_dependency 'sequel', '~> 3.28'
+  s.add_runtime_dependency 'eventmachine', '~> 1.0.0.beta.4'
+  s.add_runtime_dependency 'em-http-request', '~> 1.0.0'
   s.add_runtime_dependency 'nokogiri', '~> 1.5'
   s.add_runtime_dependency 'addressable', '~> 2.2.6'
   s.add_runtime_dependency 'css_parser', '~> 1.2.3'
   s.add_runtime_dependency 'grit', '~> 2.4.1'
-  s.add_runtime_dependency 'mime-types', '~> 1.16'
+  s.add_runtime_dependency 'mime-types', '~> 1.17.2'
   s.add_development_dependency 'sqlite3', '~> 1.3.3'

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: webpage-archivist
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-09-08 00:00:00.000000000Z
+date: 2011-11-01 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: andand
-  requirement: &2160802540 !ruby/object:Gem::Requirement
+  requirement: &2153126980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,43 +21,43 @@ dependencies:
         version: 1.3.1
   type: :runtime
   prerelease: false
-  version_requirements: *2160802540
+  version_requirements: *2153126980
 - !ruby/object:Gem::Dependency
   name: sequel
-  requirement: &2160802040 !ruby/object:Gem::Requirement
+  requirement: &2153124860 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '3.25'
+        version: '3.28'
   type: :runtime
   prerelease: false
-  version_requirements: *2160802040
+  version_requirements: *2153124860
 - !ruby/object:Gem::Dependency
   name: eventmachine
-  requirement: &2160801580 !ruby/object:Gem::Requirement
+  requirement: &2153124200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 1.0.0.beta.3
+        version: 1.0.0.beta.4
   type: :runtime
   prerelease: false
-  version_requirements: *2160801580
+  version_requirements: *2153124200
 - !ruby/object:Gem::Dependency
   name: em-http-request
-  requirement: &2160801120 !ruby/object:Gem::Requirement
+  requirement: &2153123320 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 1.0.0.beta.4
+        version: 1.0.0
   type: :runtime
   prerelease: false
-  version_requirements: *2160801120
+  version_requirements: *2153123320
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &2160800660 !ruby/object:Gem::Requirement
+  requirement: &2153122200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -65,10 +65,10 @@ dependencies:
         version: '1.5'
   type: :runtime
   prerelease: false
-  version_requirements: *2160800660
+  version_requirements: *2153122200
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &2160800200 !ruby/object:Gem::Requirement
+  requirement: &2153121340 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -76,10 +76,10 @@ dependencies:
         version: 2.2.6
   type: :runtime
   prerelease: false
-  version_requirements: *2160800200
+  version_requirements: *2153121340
 - !ruby/object:Gem::Dependency
   name: css_parser
-  requirement: &2160799740 !ruby/object:Gem::Requirement
+  requirement: &2153120740 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -87,10 +87,10 @@ dependencies:
         version: 1.2.3
   type: :runtime
   prerelease: false
-  version_requirements: *2160799740
+  version_requirements: *2153120740
 - !ruby/object:Gem::Dependency
   name: grit
-  requirement: &2160799280 !ruby/object:Gem::Requirement
+  requirement: &2153120160 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -98,21 +98,21 @@ dependencies:
         version: 2.4.1
   type: :runtime
   prerelease: false
-  version_requirements: *2160799280
+  version_requirements: *2153120160
 - !ruby/object:Gem::Dependency
   name: mime-types
-  requirement: &2160798820 !ruby/object:Gem::Requirement
+  requirement: &2153119520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '1.16'
+        version: 1.17.2
   type: :runtime
   prerelease: false
-  version_requirements: *2160798820
+  version_requirements: *2153119520
 - !ruby/object:Gem::Dependency
   name: sqlite3
-  requirement: &2160798360 !ruby/object:Gem::Requirement
+  requirement: &2153118740 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -120,7 +120,7 @@ dependencies:
         version: 1.3.3
   type: :development
   prerelease: false
-  version_requirements: *2160798360
+  version_requirements: *2153118740
 description: An utility to archive webpages through time
 email:
 executables: []
@@ -139,7 +139,6 @@ files:
 - lib/webpage-archivist/fetcher/fetcher.rb
 - lib/webpage-archivist/fetcher/requests_plumber.rb
 - lib/webpage-archivist/fetcher/stylesheet_request.rb
-- lib/webpage-archivist/fetcher/thread-pool.rb
 - lib/webpage-archivist/fetcher/webpage_request.rb
 - lib/webpage-archivist/html_document.rb
 - lib/webpage-archivist/migrations.rb
@@ -177,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project: webpage-archivist
-rubygems_version: 1.8.8
+rubygems_version: 1.8.11
 signing_key:
 specification_version: 3
 summary: An utility to archive webpages through time

data/lib/webpage-archivist/fetcher/thread-pool.rb DELETED

@@ -1,101 +0,0 @@
-# Ruby Thread Pool
-# ================
-# A thread pool is useful when you wish to do some work in a thread, but do
-# not know how much work you will be doing in advance. Spawning one thread
-# for each task is potentially expensive, as threads are not free.
-#
-# In this case, it might be more beneficial to start a predefined set of
-# threads and then hand off work to them as it becomes available. This is
-# the pure essence of what a thread pool is: an array of threads, all just
-# waiting to do some work for you!
-#
-# Prerequisites
-# -------------
-# We need the [Queue](http://rdoc.info/stdlib/thread/1.9.2/Queue), as our
-# thread pool is largely dependent on it. Thanks to this, the implementation
-# becomes very simple!
-require 'thread'
-# Public Interface
-# ----------------
-# `Pool` is our thread pool class. It will allow us to do three operations:
-#
-# - `.new(size)` creates a thread pool of a given size
-# - `#schedule(*args, &job)` schedules a new job to be executed
-# - `#shutdown` shuts down all threads (after letting them finish working, of course)
-class Pool
-  # ### initialization, or `Pool.new(size)`
-  # Creating a new `Pool` involves a certain amount of work. First, however,
-  # we need to define its’ `size`. It defines how many threads we will have
-  # working internally.
-  #
-  # Which size is best for you is hard to answer. You do not want it to be
-  # too low, as then you won’t be able to do as many things concurrently.
-  # However, if you make it too high Ruby will spend too much time switching
-  # between threads, and that will also degrade performance!
-  def initialize(size)
-    # Before we do anything else, we need to store some information about
-    # our pool. `@size` is useful later, when we want to shut our pool down,
-    # and `@jobs` is the heart of our pool that allows us to schedule work.
-    @size = size
-    @jobs = Queue.new
-    # #### Creating our pool of threads
-    # Once preparation is done, it’s time to create our pool of threads.
-    # Each thread store its’ index in a thread-local variable, in case we
-    # need to know which thread a job is executing in later on.
-    @pool = Array.new(@size) do |i|
-      Thread.new do
-        Thread.current[:id] = i
-        # We start off by defining a `catch` around our worker loop. This
-        # way we’ve provided a method for graceful shutdown of our threads.
-        # Shutting down is merely a `#schedule { throw :exit }` away!
-        catch(:exit) do
-          # The worker thread life-cycle is very simple. We continuously wait
-          # for tasks to be put into our job `Queue`. If the `Queue` is empty,
-          # we will wait until it’s not.
-          loop do
-            # Once we have a piece of work to be done, we will pull out the
-            # information we need and get to work.
-            job, args = @jobs.pop
-            job.call(*args)
-          end
-        end
-      end
-    end
-  end
-  # ### Work scheduling
-  # To schedule a piece of work to be done is to say to the `Pool` that you
-  # want something done.
-  def schedule(*args, &block)
-    # Your given task will not be run immediately; rather, it will be put
-    # into the work `Queue` and executed once a thread is ready to work.
-    @jobs << [block, args]
-  end
-  # ### Graceful shutdown
-  # If you ever wish to close down your application, I took the liberty of
-  # making it easy for you to wait for any currently executing jobs to finish
-  # before you exit.
-  def shutdown
-    # A graceful shutdown involves threads exiting cleanly themselves, and
-    # since we’ve defined a `catch`-handler around the threads’ worker loop
-    # it is simply a matter of throwing `:exit`. Thus, if we throw one `:exit`
-    # for each thread in our pool, they will all exit eventually!
-    @size.times do
-      schedule { throw :exit }
-    end
-    # And now one final thing: wait for our `throw :exit` jobs to be run on
-    # all our worker threads. This call will not return until all worker threads
-    # have exited.
-    @pool.map(&:join)
-  end
-end