RubyGems - massive - Versions diffs - 0.1.0 - Mend

massive 0.1.0

Files changed (55) hide show

checksums.yaml +15 -0
data/.gitignore +22 -0
data/.rspec +3 -0
data/.rvmrc +1 -0
data/.travis.yml +7 -0
data/Gemfile +19 -0
data/Gemfile.lock +141 -0
data/Guardfile +9 -0
data/LICENSE.txt +22 -0
data/README.md +196 -0
data/Rakefile +8 -0
data/lib/massive.rb +63 -0
data/lib/massive/cancelling.rb +20 -0
data/lib/massive/file.rb +80 -0
data/lib/massive/file_job.rb +9 -0
data/lib/massive/file_process.rb +7 -0
data/lib/massive/file_step.rb +7 -0
data/lib/massive/job.rb +115 -0
data/lib/massive/locking.rb +27 -0
data/lib/massive/memory_consumption.rb +15 -0
data/lib/massive/notifications.rb +40 -0
data/lib/massive/notifiers.rb +6 -0
data/lib/massive/notifiers/base.rb +32 -0
data/lib/massive/notifiers/pusher.rb +17 -0
data/lib/massive/process.rb +69 -0
data/lib/massive/process_serializer.rb +12 -0
data/lib/massive/retry.rb +49 -0
data/lib/massive/status.rb +59 -0
data/lib/massive/step.rb +143 -0
data/lib/massive/step_serializer.rb +12 -0
data/lib/massive/timing_support.rb +10 -0
data/lib/massive/version.rb +3 -0
data/massive.gemspec +23 -0
data/spec/fixtures/custom_job.rb +4 -0
data/spec/fixtures/custom_step.rb +19 -0
data/spec/models/massive/cancelling_spec.rb +83 -0
data/spec/models/massive/file_job_spec.rb +24 -0
data/spec/models/massive/file_spec.rb +209 -0
data/spec/models/massive/file_step_spec.rb +22 -0
data/spec/models/massive/job_spec.rb +319 -0
data/spec/models/massive/locking_spec.rb +52 -0
data/spec/models/massive/memory_consumption_spec.rb +24 -0
data/spec/models/massive/notifications_spec.rb +107 -0
data/spec/models/massive/notifiers/base_spec.rb +48 -0
data/spec/models/massive/notifiers/pusher_spec.rb +49 -0
data/spec/models/massive/process_serializer_spec.rb +38 -0
data/spec/models/massive/process_spec.rb +235 -0
data/spec/models/massive/status_spec.rb +104 -0
data/spec/models/massive/step_serializer_spec.rb +40 -0
data/spec/models/massive/step_spec.rb +490 -0
data/spec/models/massive/timing_support_spec.rb +55 -0
data/spec/shared/step_context.rb +25 -0
data/spec/spec_helper.rb +42 -0
data/spec/support/mongoid.yml +78 -0
metadata +175 -0

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    NDgzMGYwNDUzYmUyZTIzYTYwNmJiMzU1MzdkYTY1MGY5OWUzYmYwOQ==
+  data.tar.gz: !binary |-
+    ODUxZDVlMjA3YWVmZjE0MjEzNTdhODU5ODRiNWU2ODg0YTBhZTE3YQ==
+SHA512:
+  metadata.gz: !binary |-
+    OTE3ODEyZjc5NDVjYWI4MWVmMDk1YzkwYjFkYTJjZGM1YWU1ZmY0YjJiYzY4
+    MTZhZjU0ZGU0MGUwYTUzZWNjOWE4YmMyMDE5MzEzNTE4MGM4YjM1MzFhODEy
+    YzIyNTA1M2RmOWI4MDk1NTAyZjRkOTMyNjQ4ZjM0Mzk5NmFlNGU=
+  data.tar.gz: !binary |-
+    NTM0ZGQ5MDc0NGYwYjdmYTYwYjdlNWZiOWJhODFjZThlNjY3ZWJjZWEzMmI3
+    YWYzN2RkMDlkODllNzczNzMzZDdjMjk0ZGUwOGE5NTJhMmI4MTRhY2E0ZTNk
+    YWU0YWQ5YjQ3NjE2MDQxNDk3MTcyMTkwOWM0NjU0OTUwMThkNWI=

data/.gitignore ADDED

@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+.DS_Store
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+spec/dummy/db/*.sqlite3
+spec/dummy/log/*.log
+spec/dummy/tmp/
+spec/dummy/.sass-cache
+vendor/bundle

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--color
+--format documentation
+--drb

data/.rvmrc ADDED

@@ -0,0 +1 @@
+rvm use 1.9.3@massive --create

data/.travis.yml ADDED

@@ -0,0 +1,7 @@
+language: ruby
+rvm:
+  - 1.9.3
+services:
+  - mongodb
+  - redis-server
+bundler_args: --without development

data/Gemfile ADDED

@@ -0,0 +1,19 @@
+source 'https://rubygems.org'
+gemspec
+gem 'rake'
+gem 'fog'
+group(:development) do
+  gem 'debugger'
+  gem 'guard-rspec', require: false
+  gem 'terminal-notifier-guard'
+end
+group(:test) do
+  gem 'simplecov', require: false
+  gem 'rspec'
+  gem 'database_cleaner'
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,141 @@
+PATH
+  remote: .
+  specs:
+    massive (0.1.0)
+      active_model_serializers
+      file_processor (= 0.2.0)
+      mongoid (~> 3.1.x)
+      resque
+GEM
+  remote: https://rubygems.org/
+  specs:
+    active_model_serializers (0.8.1)
+      activemodel (>= 3.0)
+    activemodel (3.2.17)
+      activesupport (= 3.2.17)
+      builder (~> 3.0.0)
+    activesupport (3.2.17)
+      i18n (~> 0.6, >= 0.6.4)
+      multi_json (~> 1.0)
+    builder (3.0.4)
+    celluloid (0.15.2)
+      timers (~> 1.1.0)
+    celluloid-io (0.15.0)
+      celluloid (>= 0.15.0)
+      nio4r (>= 0.5.0)
+    coderay (1.1.0)
+    columnize (0.3.6)
+    database_cleaner (1.2.0)
+    debugger (1.6.2)
+      columnize (>= 0.3.1)
+      debugger-linecache (~> 1.2.0)
+      debugger-ruby_core_source (~> 1.2.3)
+    debugger-linecache (1.2.0)
+    debugger-ruby_core_source (1.2.3)
+    diff-lcs (1.2.4)
+    excon (0.31.0)
+    ffi (1.9.3)
+    file_processor (0.2.0)
+    fog (1.20.0)
+      builder
+      excon (~> 0.31.0)
+      formatador (~> 0.2.0)
+      mime-types
+      multi_json (~> 1.0)
+      net-scp (~> 1.1)
+      net-ssh (>= 2.1.3)
+      nokogiri (>= 1.5.11)
+    formatador (0.2.4)
+    guard (2.5.1)
+      formatador (>= 0.2.4)
+      listen (~> 2.6)
+      lumberjack (~> 1.0)
+      pry (>= 0.9.12)
+      thor (>= 0.18.1)
+    guard-rspec (4.2.8)
+      guard (~> 2.1)
+      rspec (>= 2.14, < 4.0)
+    i18n (0.6.9)
+    listen (2.7.0)
+      celluloid (>= 0.15.2)
+      celluloid-io (>= 0.15.0)
+      rb-fsevent (>= 0.9.3)
+      rb-inotify (>= 0.9)
+    lumberjack (1.0.4)
+    method_source (0.8.2)
+    mime-types (2.1)
+    mini_portile (0.5.2)
+    mongoid (3.1.6)
+      activemodel (~> 3.2)
+      moped (~> 1.4)
+      origin (~> 1.0)
+      tzinfo (~> 0.3.29)
+    mono_logger (1.1.0)
+    moped (1.5.2)
+    multi_json (1.8.2)
+    net-scp (1.1.2)
+      net-ssh (>= 2.6.5)
+    net-ssh (2.8.0)
+    nio4r (1.0.0)
+    nokogiri (1.6.1)
+      mini_portile (~> 0.5.0)
+    origin (1.1.0)
+    pry (0.9.12.6)
+      coderay (~> 1.0)
+      method_source (~> 0.8)
+      slop (~> 3.4)
+    rack (1.5.2)
+    rack-protection (1.5.2)
+      rack
+    rake (10.1.0)
+    rb-fsevent (0.9.4)
+    rb-inotify (0.9.3)
+      ffi (>= 0.5.0)
+    redis (3.0.7)
+    redis-namespace (1.4.1)
+      redis (~> 3.0.4)
+    resque (1.25.2)
+      mono_logger (~> 1.0)
+      multi_json (~> 1.0)
+      redis-namespace (~> 1.3)
+      sinatra (>= 0.9.2)
+      vegas (~> 0.1.2)
+    rspec (2.14.1)
+      rspec-core (~> 2.14.0)
+      rspec-expectations (~> 2.14.0)
+      rspec-mocks (~> 2.14.0)
+    rspec-core (2.14.7)
+    rspec-expectations (2.14.3)
+      diff-lcs (>= 1.1.3, < 2.0)
+    rspec-mocks (2.14.4)
+    simplecov (0.7.1)
+      multi_json (~> 1.0)
+      simplecov-html (~> 0.7.1)
+    simplecov-html (0.7.1)
+    sinatra (1.4.4)
+      rack (~> 1.4)
+      rack-protection (~> 1.4)
+      tilt (~> 1.3, >= 1.3.4)
+    slop (3.4.7)
+    terminal-notifier-guard (1.5.3)
+    thor (0.18.1)
+    tilt (1.4.1)
+    timers (1.1.0)
+    tzinfo (0.3.39)
+    vegas (0.1.11)
+      rack (>= 1.0.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  database_cleaner
+  debugger
+  fog
+  guard-rspec
+  massive!
+  rake
+  rspec
+  simplecov
+  terminal-notifier-guard

data/Guardfile ADDED

@@ -0,0 +1,9 @@
+# A sample Guardfile
+# More info at https://github.com/guard/guard#readme
+guard :rspec do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$})     { |m| "spec/models/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb')  { "spec" }
+end

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Vicente Mundim
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,196 @@
+# Massive
+[![build status][1]][2]
+[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/dtmtec/massive/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
+[1]: https://travis-ci.org/dtmtec/massive.png
+[2]: http://travis-ci.org/dtmtec/massive
+Massive gives you a basic infrastructure to parallelize processing of large files and/or data using Resque, Redis and MongoDB.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'massive'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install massive
+## Requirements
+If you want notifications using [Pusher](http://pusher.com), you'll need to add `pusher-gem` to your Gemfile. Also, if you'd like these notifications to be sent at intervals of less than one second, you'll need to use version 2.6 of Redis.
+## Usage
+Massive gives you a basic model structure to process data, either coming from a file or from some other source, like a database. It has three basic concepts:
+* __Process__: defines the set of steps and controls their execution.
+* __Step__: a step of the process, for example, when processing a CSV file you may want to gather some info about the file, then read the data from the file and import it to the database, and later perform some processing on that data. In this scenario you would create 3 steps, each step would split the processing into smaller jobs.
+* __Job__: here lies the basic processing logic, iterating through each item from the data set reserved for it, and then processing the item. It also updates the number of processed items, so you can poll the jobs about their progress.
+The main usage would consist in subclassing `Massive::Step` and `Massive::Job` to add the required logic for your processing.
+For example, suppose you want to perform an operation on a model, such as caching the number of friends a `User` has on a social network. Let's suppose that we have 100 thousand users in the database; this would probably take some time, so we want to do it in the background.
+We just need one step for it, and we will call it `CacheFriendsStep`:
+```ruby
+  class CacheFriendsStep < Massive::Step
+    # here we tell it how to calculate the total number of items we want it to process
+    calculates_total_count_with { User.count }
+    # we define the job class, otherwise it would use the default, which is Massive::Job
+    job_class 'CacheFriendsJob'
+  end
+```
+Then we define the job class `CacheFriendsJob`, redefining two methods `each_item` and `process_each`. The first one is used to iterate through our data set, yielding the given block on each pass. Note that it uses the job offset and limit, so that the job can be parallelized. The last one is used to actually process an item, receiving its index within the job data set.
+```ruby
+  class CacheFriendsJob < Massive::Job
+    def each_item(&block)
+      User.offset(offset).limit(limit).each(&block)
+    end
+    def process_each(user, index)
+      user.friends_count = user.friends.count
+    end
+  end
+```
+Now we just create a process, and add the `CacheFriendsStep` to it, then enqueue the step:
+```ruby
+  process = Massive::Process.new
+  process.steps << CacheFriendsStep.new
+  process.save
+  process.enqueue_next
+```
+Now the `CacheFriendsStep` is enqueued in the Resque queue. When it is run by a Resque worker it will split the processing into a number of jobs based on the step `limit_ratio`. This  `limit_ratio` could be defined like this:
+```ruby
+  class CacheFriendsStep < Massive::Step
+    # here we tell it how to calculate the total number of items we want it to process
+    calculates_total_count_with { User.count }
+    # we define the job class, otherwise it would use the default, which is Massive::Job
+    job_class 'CacheFriendsJob'
+    # defining a different limit ratio
+    limit_ratio 2000 => 1000, 1000 => 200, 0 => 100
+  end
+```
+What this means is that when the number of items to process is greater than or equal to 2000, it will split jobs making each one process 1000 items. If the number of items is less than 2000 but greater than or equal to 1000, it will process 200 items each. If the number of items is less than 1000, it will process 100 items each.
+The default limit ratio is defined like this: `3000 => 1000, 0 => 100`. When the number of items is greater than or equal to 3000, each job processes 1000 items; otherwise, each job processes 100.
+For the above example, it would create `100000 / 1000 == 100` jobs, where the first one would have an offset of 0, and a limit of 1000, the next one an offset of 1000 and a limit of 1000, and so on.
+With 100 jobs in a Resque queue you may want to start more than one worker so that it can process this queue more quickly.
+Now you just need to wait until all jobs have been completed, by polling the step once in a while:
+```ruby
+  process.reload
+  step = process.steps.first
+  step.processed            # gives you the sum of the number of items processed by all jobs
+  step.processed_percentage # the percentage of processed items based on the total count
+  step.elapsed_time         # the elapsed time from when the step started processing until now, or its duration once it is finished
+  step.processing_time      # the sum of the elapsed time for each job, which basically gives you the total time spent processing your data set.
+```
+You can check whether the step is completed, or started:
+```ruby
+  step.started?     # true   when it has been started
+  step.completed?   # false  when it has not been completed yet, i.e., there is at least one job that has not been completed
+```
+### Retry
+When an error occurs while processing an item, it will automatically retry it a number of times, waiting for an interval between attempts. By default it will retry 10 times with a 2 second interval. This is very useful when you'd expect some external service to fail for a small period of time, but you want to make sure that you recover from it, without the need to retry the entire job processing.
+If the processing of a single item fails for the maximum number of retries the exception will be raised again, making the job fail. The error message will be stored and can be accessed through `Massive::Job#last_error`. It will also record the time when the error occurred.
+You can change retry interval and the maximum number of retries if you want:
+```ruby
+  class CacheFriendsJob < Massive::Job
+    retry_interval 5
+    maximum_retries 3
+    def each_item(&block)
+      User.offset(offset).limit(limit).each(&block)
+    end
+    def process_each(user, index)
+      user.friends_count = user.friends.count
+    end
+  end
+```
+### File
+One common use for __Massive__ is to process large CSV files, so it comes with `Massive::FileProcess`, `Massive::FileStep` and `Massive::FileJob`. A `Massive::FileProcess` embeds one `Massive::File`, which has a URL to a local or external file, and a [file processor](https://github.com/dtmtec/file_processor).
+With this structure you can easily import users from a CSV file:
+```ruby
+  class ImportUsersStep < Massive::FileStep
+    job_class 'ImportUsersJob'
+  end
+  class ImportUsersJob < Massive::FileJob
+    def process_each(row, index)
+      User.create(row)
+    end
+  end
+  process = Massive::FileProcess.new(file_attributes: { url: 'http://myserver.com/path/to/my/file.csv' })
+  process.steps << ImportUsersStep.new
+```
+Notice that we didn't have to specify how the step would calculate the total count for the `ImportUsersStep`. It is already set to the number of lines in the CSV file of the `Massive::FileProcess`. For this we want to make sure that we have gathered information about the file:
+```ruby
+  process.file.gather_info!
+```
+We also didn't have to specify how the job would iterate through each row, it is already defined. We just get a CSV::Row, which will be a Hash-like structure where the header of the CSV is the key, so we can just pass it to `User.create`. Of course this is a simple example, you should protect the attributes, or even pass only the ones you want from the CSV.
+The `Massive::File` has support for [Fog::Storage](http://fog.io/storage/). To use it you must define the `fog_credentials` and optionally the `fog_directory` and `fog_authenticated_url_expiration`:
+```ruby
+  Massive.fog_credentials = {
+    provider: 'AWS',
+    aws_access_key_id: 'INSERT-YOUR-AWS-KEY-HERE',
+    aws_secret_access_key: 'INSERT-YOUR-AWS-SECRET-HERE'
+  }
+  Massive.fog_directory = 'your-bucket-here' # defaults to 'massive'
+  Massive.fog_authenticated_url_expiration = 30.minutes # defaults to 1.hour
+```
+Then set the `filename` field when creating the `Massive::File` instead of setting its `url`. Notice that the filename should point to the full path within the bucket, not just the actual filename.
+```ruby
+  process = Massive::FileProcess.new(file_attributes: { filename: '/path/to/my/file.csv' })
+```
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request