RubyGems - parallel588_polipus - Versions diffs - 0.4.0 - Mend

parallel588_polipus 0.4.0

Files changed (72) hide show

checksums.yaml +7 -0
data/.document +5 -0
data/.gitignore +53 -0
data/.rspec +2 -0
data/.rubocop.yml +17 -0
data/.rubocop_todo.yml +33 -0
data/.travis.yml +22 -0
data/AUTHORS.md +5 -0
data/CHANGELOG.md +61 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +70 -0
data/Rakefile +8 -0
data/examples/basic.rb +63 -0
data/examples/error_handling.rb +23 -0
data/examples/incremental.rb +63 -0
data/examples/robots_txt_handling.rb +14 -0
data/examples/survival.rb +10 -0
data/lib/polipus.rb +488 -0
data/lib/polipus/http.rb +282 -0
data/lib/polipus/page.rb +256 -0
data/lib/polipus/plugin.rb +14 -0
data/lib/polipus/plugins/cleaner.rb +25 -0
data/lib/polipus/plugins/sample.rb +15 -0
data/lib/polipus/plugins/sleeper.rb +22 -0
data/lib/polipus/queue_overflow.rb +26 -0
data/lib/polipus/queue_overflow/base.rb +7 -0
data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
data/lib/polipus/queue_overflow/manager.rb +57 -0
data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
data/lib/polipus/queue_overflow/worker.rb +24 -0
data/lib/polipus/robotex.rb +145 -0
data/lib/polipus/signal_handler.rb +42 -0
data/lib/polipus/storage.rb +31 -0
data/lib/polipus/storage/base.rb +20 -0
data/lib/polipus/storage/dev_null.rb +35 -0
data/lib/polipus/storage/memory_store.rb +56 -0
data/lib/polipus/storage/mongo_store.rb +90 -0
data/lib/polipus/storage/rethink_store.rb +90 -0
data/lib/polipus/url_tracker.rb +21 -0
data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
data/lib/polipus/url_tracker/redis_set.rb +27 -0
data/lib/polipus/version.rb +5 -0
data/polipus.gemspec +44 -0
data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
data/spec/cassettes/gzipped_on.yml +147 -0
data/spec/cassettes/http_cookies.yml +133 -0
data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
data/spec/cassettes/http_test.yml +1418 -0
data/spec/cassettes/http_test_redirect.yml +71 -0
data/spec/clear.rb +12 -0
data/spec/polipus/http_spec.rb +139 -0
data/spec/polipus/page_spec.rb +68 -0
data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
data/spec/polipus/queue_overflow_spec.rb +66 -0
data/spec/polipus/robotex_spec.rb +85 -0
data/spec/polipus/signal_handler_spec.rb +15 -0
data/spec/polipus/storage/memory_store_spec.rb +87 -0
data/spec/polipus/storage/mongo_store_spec.rb +119 -0
data/spec/polipus/storage/rethink_store_spec.rb +117 -0
data/spec/polipus/url_tracker_spec.rb +29 -0
data/spec/polipus_spec.rb +107 -0
data/spec/spec_helper.rb +42 -0
metadata +348 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: bb50db8ac7efd7f04d3e044b2e1d708ab2434eb3
+  data.tar.gz: fa4158852a74a75603d7a6f0e6af1de9b7940dfc
+SHA512:
+  metadata.gz: 8e43cec9505d08bceddfe6f64f5a335a8ee12a37fd5f11d1d1c4f20351df651e656b282e5f884259b0cd1df345975be105fa12dd77de52529d7d62c3387bcf38
+  data.tar.gz: fe3941eaee3fc53104e222cc9761e51bbd2fa8d1eaea02e5c66015fe5a520bc68b43b6537fd0059056988f6674058307bf93e200009ffba045b449a678cff1f7

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.gitignore ADDED Viewed

@@ -0,0 +1,53 @@
+# rcov generated
+coverage
+coverage.data
+# rdoc generated
+rdoc
+# yard generated
+doc
+.yardoc
+# bundler
+.bundle
+# jeweler generated
+pkg
+# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+#
+# * Create a file at ~/.gitignore
+# * Include files you want ignored
+# * Run: git config --global core.excludesfile ~/.gitignore
+#
+# After doing this, these files will be ignored in all your git projects,
+# saving you from having to 'pollute' every project you touch with them
+#
+# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
+#
+# For MacOS:
+#
+.DS_Store
+# For TextMate
+#*.tmproj
+#tmtags
+# For emacs:
+#*~
+#\#*
+#.\#*
+# For vim:
+#*.swp
+# For redcar:
+#.redcar
+# For rubinius:
+#*.rbc
+Gemfile.lock
+my_test/

data/.rspec ADDED Viewed

@@ -0,0 +1,2 @@
+--color
+--format documentation

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,17 @@
+inherit_from: .rubocop_todo.yml
+AllCops:
+  Exclude:
+   - my_test/**/*
+   - examples/**/*
+Metrics/LineLength:
+  Enabled: false
+Style/TrivialAccessors:
+  Enabled: false
+Metrics/ClassLength:
+  Enabled: false
+Metrics/MethodLength:
+  Enabled: false

data/.rubocop_todo.yml ADDED Viewed

@@ -0,0 +1,33 @@
+# This configuration was generated by `rubocop --auto-gen-config`
+# on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+# Offense count: 1
+Style/ClassVars:
+  Enabled: false
+# Offense count: 10
+Metrics/CyclomaticComplexity:
+  Max: 16
+# Offense count: 26
+Style/Documentation:
+  Enabled: false
+# Offense count: 2
+# Configuration parameters: EnforcedStyle, SupportedStyles.
+Style/Next:
+  Enabled: false
+# Offense count: 5
+# Configuration parameters: MaxSlashes.
+Style/RegexpLiteral:
+  Enabled: false
+# Offense count: 4
+Style/RescueModifier:
+  Enabled: false

data/.travis.yml ADDED Viewed

@@ -0,0 +1,22 @@
+language: ruby
+rvm:
+  - jruby
+  - 1.9.3
+  - 2.0.0
+  - 2.1.5
+  - 2.2.0
+  - rbx-2
+# Until travis supports rethinkdb as service...
+before_install:
+  - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
+  - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
+  - sudo apt-get update -q
+  - sudo apt-get install rethinkdb
+  - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
+  - sudo service rethinkdb restart
+services:
+  - redis
+  - mongodb
+#  - rethinkdb

data/AUTHORS.md ADDED Viewed

@@ -0,0 +1,5 @@
+# Authors
+* [Francesco Laurita](mailto:francesco.laurita@gmail.com)
+* [Tobias L. Maier](http://tobiasmaier.info/)
+* [Marcos Piccinini](https://github.com/nofxx)

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,61 @@
+# Changelog
+## 0.4.0 (2015-01-12)
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
+* Adds RethinkDB Storage
+* BugFix: Update and fix mongo driver v1.11.1 'upsert: 1' -> 'upsert: true'
+* Organize and update specs to rspec 3
+## 0.3.3 (2014-06-26)
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
+* BugFix: Better compatibility for mongo 2.6.x on index creation
+## 0.3.2 (2014-06-17)
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.1...0.3.2)
+* BugFix: When a page contains an error, Mongo throws `BSON::InvalidDocument`. `Exception` is not serializable
+  [31647cc](https://github.com/taganaka/polipus/commit/31647ccd8fe64247e4e6d75ced097607f1fb4b2d)
+## 0.3.1 (2014-06-17)
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.0...0.3.1)
+* Major Code-Style changes and cleanup
+  [#35](https://github.com/taganaka/polipus/pull/35)
+* BugFix: proper initialization of internal_queue
+  [#38](https://github.com/taganaka/polipus/pull/38)
+* Better INT / TERM Signal handling [#34](https://github.com/taganaka/polipus/pull/34)
+  New option added:
+    ```ruby
+    enable_signal_handler: true / false
+    ```
+* Zlib::GzipFile::Error handling
+  [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
+* Faster and easier overflow management
+  [#39](https://github.com/taganaka/polipus/pull/39)
+## 0.3.0 (2014-06-02)
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+* Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+  [#24](https://github.com/taganaka/polipus/pull/24)
+* Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+  For example a connectivity error.
+  See `/examples/error_handling.rb`
+  [#15](https://github.com/taganaka/polipus/issues/15)
+* Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+* Polipus now supports `robots.txt` directives.
+  Set the option `:obey_robots_txt` to `true`.
+  See `/examples/robots_txt_handling.rb`
+  [#30](https://github.com/taganaka/polipus/pull/30)
+* Add support for GZIP and deflate compressed HTTP requests
+  [#26](https://github.com/taganaka/polipus/pull/26)
+* Minor improvements to code style

data/Gemfile ADDED Viewed

@@ -0,0 +1,12 @@
+source 'https://rubygems.org'
+gemspec
+platform :ruby do
+  gem 'bson_ext'
+end
+platform :jruby do
+  gem 'json'
+  gem 'bson'
+end

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2013 Francesco Laurita
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,70 @@
+[![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
+[![Coverage Status](https://img.shields.io/coveralls/taganaka/polipus/master.svg)](https://coveralls.io/r/taganaka/polipus?branch=master)
+[![Code Climate](https://codeclimate.com/github/taganaka/polipus.svg)](https://codeclimate.com/github/taganaka/polipus)
+[![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
+# Polipus #
+A distributed web crawler written in ruby, backed by Redis
+This project was presented at RubyDay 2013
+http://www.slideshare.net/francescolaurita/roll-your-own-web-crawler-rubyday
+## Features ##
+* Easy to use
+* Distributed and scalable
+* It uses a smart/fast and space-efficient probabilistic data structure to determine if a URL should be visited or not
+* It doesn't exhaust your Redis server
+* Plays nicely with MongoDB even if it is not strictly required
+* Easy to write your own page storage strategy
+* Focus crawling made easy
+* Heavily inspired by Anemone https://github.com/chriskite/anemone/
+## Supported Ruby Interpreters
+* MRI 1.9.x >= 1.9.1
+* MRI 2.0.0
+* MRI 2.1.2
+* JRuby 1.9 mode
+* Rubinius
+## Survival code example
+```ruby
+require "polipus"
+Polipus.crawler("rubygems","http://rubygems.org/") do |crawler|
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # A nokogiri object
+    puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+  end
+end
+```
+## Installation
+    $ gem install polipus
+## Testing
+    $ bundle install
+    $ rake
+## Contributing to polipus ##
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
+* Fork the project.
+* Start a feature/bugfix branch.
+* Commit and push until you are happy with your contribution.
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Install [Rubocop](https://github.com/bbatsov/rubocop) and make sure it is happy
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
+## Copyright ##
+Copyright (c) 2013 Francesco Laurita. See LICENSE.txt for
+further details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+# encoding: UTF-8
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task default: :spec
+task test: :spec

data/examples/basic.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# encoding: UTF-8
+require 'polipus'
+require 'mongo'
+require 'polipus/plugins/cleaner'
+# Define a Mongo connection (database 'crawler')
+mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
+# Override some default options
+options = {
+  # Redis connection
+  redis_options: {
+    host: 'localhost',
+    db: 5,
+    driver: 'hiredis'
+  },
+  # Page storage: 'pages' is the name of the Mongo collection where
+  # pages will be stored
+  storage: Polipus::Storage.mongo_store(mongo, 'pages'),
+  # Use your custom user agent
+  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
+  # Use 5 threads
+  workers: 5,
+  # Queue overflow settings:
+  #  * No more than 5000 elements on the Redis queue
+  #  * Items exceeding the limit will be stored in Mongo, in the 'rubygems_queue_overflow' collection
+  #  * The check cycle runs every 60 sec
+  queue_items_limit: 5_000,
+  queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
+  queue_overflow_manager_check_time: 60,
+  # Logs go to stdout
+  logger: Logger.new(STDOUT)
+}
+Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
+starting_urls = ['http://rubygems.org/gems']
+# Crawl the entire rubygems site (block-less form shown for reference):
+# Polipus.crawler('polipus-rubygems', starting_urls, options)
+Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+  # Ignore urls pointing to a gem file
+  crawler.skip_links_like(/\.gem$/)
+  # Ignore urls pointing to an atom feed
+  crawler.skip_links_like(/\.atom$/)
+  # Ignore urls containing the /versions/ path
+  crawler.skip_links_like(/\/versions\//)
+  # Add some metadata to a page
+  # The metadata will be stored in Mongo
+  crawler.on_before_save do |page|
+    page.user_data.processed = false
+  end
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # page.doc is a Nokogiri object
+    puts "Page title: #{page.doc.css('title').text}"
+  end
+  # Do some nifty stuff at the end of the crawling session
+  crawler.on_crawl_end do
+    # Gong.bang(:loudly)
+  end
+end

data/examples/error_handling.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# encoding: UTF-8
+require 'polipus'
+Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+  # Handle connectivity errors
+  # Only runs when there is an error
+  crawler.on_page_error do |page|
+    # Don't store the page
+    page.storable = false
+    # Add the URL again to the queue
+    crawler.add_to_queue(page)
+  end
+  # In-place page processing
+  # Runs also when there was an error in the page
+  crawler.on_page_downloaded do |page|
+    # Skip block if there is an error
+    return if page.error
+    # A nokogiri object
+    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+  end
+end

data/examples/incremental.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# encoding: UTF-8
+require 'polipus'
+require 'mongo'
+# Define a Mongo connection (database 'crawler')
+mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
+# Override some default options
+options = {
+  # Redis connection
+  redis_options: {
+    host: 'localhost',
+    db: 5,
+    driver: 'hiredis'
+  },
+  # Page storage: 'pages' is the name of the Mongo collection where
+  # pages will be stored
+  storage: Polipus::Storage.mongo_store(mongo, 'pages'),
+  # Use your custom user agent
+  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
+  # Use 20 threads
+  workers: 20,
+  # Logs go to stdout
+  logger: Logger.new(STDOUT),
+  # Do not go deeper than 5 levels
+  depth_limit: 5,
+  # Incremental download:
+  # Set a ttl for each stored page
+  # If a previously stored page has expired, it will be re-downloaded
+  # Mark a page expired after 60s
+  ttl_page: 60
+}
+starting_urls = ['http://rubygems.org/gems']
+# Crawl the entire rubygems site (block-less form shown for reference):
+# Polipus.crawler('polipus-rubygems', starting_urls, options)
+Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+  # Ignore urls pointing to a gem file
+  crawler.skip_links_like(/\.gem$/)
+  # Ignore urls pointing to an atom feed
+  crawler.skip_links_like(/\.atom$/)
+  # Ignore urls containing the /versions/ path
+  crawler.skip_links_like(/\/versions\//)
+  # Add some metadata to a page
+  # The metadata will be stored in Mongo
+  crawler.on_before_save do |page|
+    page.user_data.processed = false
+  end
+  # In-place page processing
+  crawler.on_page_downloaded do |page|
+    # page.doc is a Nokogiri object. NOTE(review): the `rescue` modifier swallows any error raised here and its 'ERROR' value is discarded, not printed
+    puts "Page title: #{page.doc.css('title').text}" rescue 'ERROR'
+  end
+  # Do some nifty stuff at the end of the crawling session
+  crawler.on_crawl_end do
+    # Gong.bang(:loudly)
+  end
+end