parallel588_polipus 0.4.0

Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: bb50db8ac7efd7f04d3e044b2e1d708ab2434eb3
+   data.tar.gz: fa4158852a74a75603d7a6f0e6af1de9b7940dfc
+ SHA512:
+   metadata.gz: 8e43cec9505d08bceddfe6f64f5a335a8ee12a37fd5f11d1d1c4f20351df651e656b282e5f884259b0cd1df345975be105fa12dd77de52529d7d62c3387bcf38
+   data.tar.gz: fe3941eaee3fc53104e222cc9761e51bbd2fa8d1eaea02e5c66015fe5a520bc68b43b6537fd0059056988f6674058307bf93e200009ffba045b449a678cff1f7
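For context (this note is editorial, not part of the gem): the two entries cover the `metadata.gz` and `data.tar.gz` archives that make up a packaged `.gem` file. A minimal Ruby sketch of how such SHA512 values can be recomputed after unpacking the archive with `tar`:

```ruby
# Minimal sketch: recompute the SHA512 digests listed above.
# Assumes the .gem archive has already been unpacked so that
# metadata.gz and data.tar.gz sit in the current directory.
require 'digest'

%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}: #{Digest::SHA512.file(name).hexdigest}"
end
```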
data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,53 @@
+ # rcov generated
+ coverage
+ coverage.data
+
+ # rdoc generated
+ rdoc
+
+ # yard generated
+ doc
+ .yardoc
+
+ # bundler
+ .bundle
+
+ # jeweler generated
+ pkg
+
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+ #
+ # * Create a file at ~/.gitignore
+ # * Include files you want ignored
+ # * Run: git config --global core.excludesfile ~/.gitignore
+ #
+ # After doing this, these files will be ignored in all your git projects,
+ # saving you from having to 'pollute' every project you touch with them
+ #
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line)
+ #
+ # For MacOS:
+ #
+ .DS_Store
+
+ # For TextMate
+ #*.tmproj
+ #tmtags
+
+ # For emacs:
+ #*~
+ #\#*
+ #.\#*
+
+ # For vim:
+ #*.swp
+
+ # For redcar:
+ #.redcar
+
+ # For rubinius:
+ #*.rbc
+
+ Gemfile.lock
+
+ my_test/
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format documentation
data/.rubocop.yml ADDED
@@ -0,0 +1,17 @@
+ inherit_from: .rubocop_todo.yml
+ AllCops:
+   Exclude:
+     - my_test/**/*
+     - examples/**/*
+
+ Metrics/LineLength:
+   Enabled: false
+
+ Style/TrivialAccessors:
+   Enabled: false
+
+ Metrics/ClassLength:
+   Enabled: false
+
+ Metrics/MethodLength:
+   Enabled: false
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,33 @@
+ # This configuration was generated by `rubocop --auto-gen-config`
+ # on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 1
+ Style/ClassVars:
+   Enabled: false
+
+ # Offense count: 10
+ Metrics/CyclomaticComplexity:
+   Max: 16
+
+ # Offense count: 26
+ Style/Documentation:
+   Enabled: false
+
+
+ # Offense count: 2
+ # Configuration parameters: EnforcedStyle, SupportedStyles.
+ Style/Next:
+   Enabled: false
+
+ # Offense count: 5
+ # Configuration parameters: MaxSlashes.
+ Style/RegexpLiteral:
+   Enabled: false
+
+ # Offense count: 4
+ Style/RescueModifier:
+   Enabled: false
data/.travis.yml ADDED
@@ -0,0 +1,22 @@
+ language: ruby
+ rvm:
+   - jruby
+   - 1.9.3
+   - 2.0.0
+   - 2.1.5
+   - 2.2.0
+   - rbx-2
+
+ # Until travis supports rethinkdb as a service...
+ before_install:
+   - source /etc/lsb-release && echo "deb http://download.rethinkdb.com/apt $DISTRIB_CODENAME main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list
+   - wget -qO- http://download.rethinkdb.com/apt/pubkey.gpg | sudo apt-key add -
+   - sudo apt-get update -q
+   - sudo apt-get install rethinkdb
+   - sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf
+   - sudo service rethinkdb restart
+
+ services:
+   - redis
+   - mongodb
+   # - rethinkdb
data/AUTHORS.md ADDED
@@ -0,0 +1,5 @@
+ # Authors
+
+ * [Francesco Laurita](francesco.laurita@gmail.com)
+ * [Tobias L. Maier](http://tobiasmaier.info/)
+ * [Marcos Piccinini](https://github.com/nofxx)
data/CHANGELOG.md ADDED
@@ -0,0 +1,61 @@
+ # Changelog
+
+ ## 0.4.0 (2015-01-12)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.3...0.4.0)
+
+ * Adds RethinkDB storage
+ * BugFix: Update and fix for mongo driver v1.11.1: 'upsert: 1' -> 'upsert: true'
+ * Organize and update specs for RSpec 3
+
+ ## 0.3.3 (2014-06-26)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.2...0.3.3)
+
+ * BugFix: Better compatibility for mongo 2.6.x on index creation
+
+ ## 0.3.2 (2014-06-17)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.1...0.3.2)
+
+ * BugFix: When a page contains an error, Mongo throws `BSON::InvalidDocument`: an `Exception` is not serializable
+ [31647cc](https://github.com/taganaka/polipus/commit/31647ccd8fe64247e4e6d75ced097607f1fb4b2d)
+
+ ## 0.3.1 (2014-06-17)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.0...0.3.1)
+
+ * Major code-style changes and cleanup
+ [#35](https://github.com/taganaka/polipus/pull/35)
+ * BugFix: proper initialization of internal_queue
+ [#38](https://github.com/taganaka/polipus/pull/38)
+ * Better INT / TERM signal handling [#34](https://github.com/taganaka/polipus/pull/34)
+
+ New option added:
+ ```ruby
+ enable_signal_handler: true / false
+ ```
+
+ * Zlib::GzipFile::Error handling
+ [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
+ * Faster and easier overflow management
+ [#39](https://github.com/taganaka/polipus/pull/39)
+
+ ## 0.3.0 (2014-06-02)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+ * Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+ [#24](https://github.com/taganaka/polipus/pull/24)
+ * Introduce the new block `PolipusCrawler#on_page_error`, which runs when a page has an error (`Page#error`),
+ for example a connectivity error.
+ See `/examples/error_handling.rb`
+ [#15](https://github.com/taganaka/polipus/issues/15)
+ * Add `Page#success?`, which returns true if the HTTP code is between 200 and 206.
+ * Polipus now obeys `robots.txt` directives.
+ Set the option `:obey_robots_txt` to `true`.
+ See `/examples/robots_txt_handling.rb`
+ [#30](https://github.com/taganaka/polipus/pull/30)
+ * Add support for gzip and deflate compressed HTTP responses
+ [#26](https://github.com/taganaka/polipus/pull/26)
+ * Minor improvements to code style
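The 0.3.x and 0.4.0 entries above mention several hooks and options by name. The snippet below is a minimal sketch, not one of the gem's bundled examples, showing how those names could fit together; only identifiers quoted in the changelog or in the examples later in this diff are used, and exact behavior should be checked against `/examples/error_handling.rb` and `/examples/robots_txt_handling.rb`.

```ruby
require 'polipus'

options = {
  obey_robots_txt: true,        # 0.3.0: honor robots.txt directives
  enable_signal_handler: true   # 0.3.1: graceful INT / TERM handling
}

Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
  # 0.3.0: runs when a page could not be fetched correctly (Page#error is set)
  crawler.on_page_error do |page|
    page.storable = false       # don't persist the failed page
    crawler.add_to_queue(page)  # put it back on the queue for a retry
  end

  crawler.on_page_downloaded do |page|
    # 0.3.0: Page#success? is true for HTTP status codes 200..206
    puts "Fetched: #{page.url}" if page.success?
  end
end
```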
data/Gemfile ADDED
@@ -0,0 +1,12 @@
+ source 'https://rubygems.org'
+
+ gemspec
+
+ platform :ruby do
+   gem 'bson_ext'
+ end
+
+ platform :jruby do
+   gem 'json'
+   gem 'bson'
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 Francesco Laurita
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,70 @@
+ [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
+ [![Coverage Status](https://img.shields.io/coveralls/taganaka/polipus/master.svg)](https://coveralls.io/r/taganaka/polipus?branch=master)
+ [![Code Climate](https://codeclimate.com/github/taganaka/polipus.svg)](https://codeclimate.com/github/taganaka/polipus)
+ [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
+
+ # Polipus #
+
+ A distributed web crawler written in Ruby and backed by Redis.
+ This project was presented at RubyDay 2013:
+ http://www.slideshare.net/francescolaurita/roll-your-own-web-crawler-rubyday
+
+ ## Features ##
+
+ * Easy to use
+ * Distributed and scalable
+ * Uses a fast, space-efficient probabilistic data structure to decide whether a URL should be visited
+ * Doesn't exhaust your Redis server
+ * Plays nicely with MongoDB, although it is not strictly required
+ * Easy to write your own page storage strategy
+ * Focused crawling made easy
+ * Heavily inspired by Anemone https://github.com/chriskite/anemone/
+
+ ## Supported Ruby Interpreters
+
+ * MRI 1.9.x >= 1.9.1
+ * MRI 2.0.0
+ * MRI 2.1.2
+ * JRuby 1.9 mode
+ * Rubinius
+
+
+ ## Survival code example
+
+ ```ruby
+ require "polipus"
+
+ Polipus.crawler("rubygems", "http://rubygems.org/") do |crawler|
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # page.doc is a Nokogiri object
+     puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+   end
+ end
+ ```
+
+ ## Installation
+
+     $ gem install polipus
+
+ ## Testing
+
+     $ bundle install
+     $ rake
+
+ ## Contributing to polipus ##
+
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
+ * Fork the project.
+ * Start a feature/bugfix branch.
+ * Commit and push until you are happy with your contribution.
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Install [Rubocop](https://github.com/bbatsov/rubocop) and make sure it is happy.
+ * Please try not to mess with the Rakefile, version, or history. If you want your own version, or it is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
+
+ ## Copyright ##
+
+ Copyright (c) 2013 Francesco Laurita. See LICENSE.txt for
+ further details.
+
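The feature list above claims it is "easy to write your own page storage strategy". Below is a minimal sketch of what such an adapter could look like, modeled on the bundled `storage/memory_store.rb` listed in this release; the method names (`add`, `exists?`, `get`, `remove`, `count`, `clear`, `each`, plus the `uuid` helper) are assumptions drawn from that adapter and should be verified against `lib/polipus/storage/base.rb`.

```ruby
# Hypothetical storage adapter (illustration only): keeps pages in a plain
# Hash, mirroring the interface used by the bundled memory_store.
require 'polipus'

class HashStore < Polipus::Storage::Base
  def initialize
    @hash = {}
  end

  # Persist a page, keyed by the UUID helper assumed to come from Storage::Base
  def add(page)
    @hash[uuid(page)] = page.to_hash
  end

  def exists?(page)
    @hash.key?(uuid(page))
  end

  def get(page)
    data = @hash[uuid(page)]
    Polipus::Page.from_hash(data) if data
  end

  def remove(page)
    @hash.delete(uuid(page))
  end

  def count
    @hash.size
  end

  def clear
    @hash.clear
  end

  def each
    @hash.each { |_uuid, h| yield Polipus::Page.from_hash(h) }
  end
end

# Plugged in through the :storage option:
# Polipus.crawler('rubygems', 'http://rubygems.org/', storage: HashStore.new)
```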
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ # encoding: UTF-8
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
+ task test: :spec
data/examples/basic.rb ADDED
@@ -0,0 +1,63 @@
+ # encoding: UTF-8
+ require 'polipus'
+ require 'mongo'
+ require 'polipus/plugins/cleaner'
+ # Define a Mongo connection
+ mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
+
+ # Override some default options
+ options = {
+   # Redis connection
+   redis_options: {
+     host: 'localhost',
+     db: 5,
+     driver: 'hiredis'
+   },
+   # Page storage: 'pages' is the name of the collection where
+   # pages will be stored
+   storage: Polipus::Storage.mongo_store(mongo, 'pages'),
+   # Use your custom user agent
+   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
+   # Use 5 threads
+   workers: 5,
+   # Queue overflow settings:
+   # * No more than 5000 elements on the Redis queue
+   # * Exceeded items will be stored in Mongo in the 'rubygems_queue_overflow' collection
+   # * The check cycle runs every 60 s
+   queue_items_limit: 5_000,
+   queue_overflow_adapter: Polipus::QueueOverflow.mongo_queue(mongo, 'rubygems_queue_overflow'),
+   queue_overflow_manager_check_time: 60,
+   # Logs go to stdout
+   logger: Logger.new(STDOUT)
+ }
+ Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
+ starting_urls = ['http://rubygems.org/gems']
+
+ # Crawl the entire rubygems site
+ # Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+   # Ignore urls pointing to a gem file
+   crawler.skip_links_like(/\.gem$/)
+   # Ignore urls pointing to an atom feed
+   crawler.skip_links_like(/\.atom$/)
+   # Ignore urls containing the /versions/ path
+   crawler.skip_links_like(/\/versions\//)
+
+   # Adding some metadata to a page
+   # The metadata will be stored on mongo
+   crawler.on_before_save do |page|
+     page.user_data.processed = false
+   end
+
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # page.doc is a Nokogiri object
+     puts "Page title: #{page.doc.css('title').text}"
+   end
+
+   # Do nifty stuff at the end of the crawling session
+   crawler.on_crawl_end do
+     # Gong.bang(:loudly)
+   end
+ end
data/examples/error_handling.rb ADDED
@@ -0,0 +1,23 @@
+ # encoding: UTF-8
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # Handle connectivity errors
+   # Only runs when there is an error
+   crawler.on_page_error do |page|
+     # Don't store the page
+     page.storable = false
+     # Add the URL back to the queue
+     crawler.add_to_queue(page)
+   end
+
+   # In-place page processing
+   # Also runs when there was an error on the page
+   crawler.on_page_downloaded do |page|
+     # Skip the rest of the block on error (`next`, not `return`, avoids a LocalJumpError)
+     next if page.error
+
+     # page.doc is a Nokogiri object
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/examples/incremental.rb ADDED
@@ -0,0 +1,63 @@
+ # encoding: UTF-8
+ require 'polipus'
+ require 'mongo'
+
+ # Define a Mongo connection
+ mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
+ # Override some default options
+ options = {
+   # Redis connection
+   redis_options: {
+     host: 'localhost',
+     db: 5,
+     driver: 'hiredis'
+   },
+   # Page storage: 'pages' is the name of the collection where
+   # pages will be stored
+   storage: Polipus::Storage.mongo_store(mongo, 'pages'),
+   # Use your custom user agent
+   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
+   # Use 20 threads
+   workers: 20,
+   # Logs go to stdout
+   logger: Logger.new(STDOUT),
+   # Do not go deeper than 5 levels
+   depth_limit: 5,
+
+   # Incremental download:
+   # Set a TTL for each stored page.
+   # If a previously stored page has expired, it will be re-downloaded.
+   # Mark a page as expired after 60 s
+   ttl_page: 60
+ }
+
+ starting_urls = ['http://rubygems.org/gems']
+
+ # Crawl the entire rubygems site
+ # Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+   # Ignore urls pointing to a gem file
+   crawler.skip_links_like(/\.gem$/)
+   # Ignore urls pointing to an atom feed
+   crawler.skip_links_like(/\.atom$/)
+   # Ignore urls containing the /versions/ path
+   crawler.skip_links_like(/\/versions\//)
+
+   # Adding some metadata to a page
+   # The metadata will be stored on mongo
+   crawler.on_before_save do |page|
+     page.user_data.processed = false
+   end
+
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # page.doc is a Nokogiri object
+     puts "Page title: #{page.doc.css('title').text}" rescue 'ERROR'
+   end
+
+   # Do nifty stuff at the end of the crawling session
+   crawler.on_crawl_end do
+     # Gong.bang(:loudly)
+   end
+ end