harvestman 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in harvestman.gemspec
4
+ gemspec
@@ -0,0 +1,20 @@
1
+ MIT License
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,107 @@
1
+ # Harvestman
2
+
3
+ Harvestman is a very simple, lightweight web crawler for Quick'n'Dirty™ web scraping.<br>
4
+ It's quite useful for scraping search result pages:
5
+
6
+ ```ruby
7
+ require 'harvestman'
8
+
9
+ Harvestman.crawl 'http://www.foo.com/bars?page=*', (1..5) do
10
+ price = css 'div.item-price a'
11
+ ...
12
+ end
13
+ ```
14
+
15
+ **[!] Warning**: this gem is in alpha stage (no tests), don't use it for anything serious.
16
+
17
+ ## Installation
18
+
19
+ Via command line:
20
+
21
+ $ gem install harvestman
22
+
23
+ ## Basic usage
24
+
25
+ Harvestman is fairly simple to use: you specify the URL to crawl and pass in a block.
26
+ Inside the block you can call the ``css`` (or ``xpath``) method to search the HTML document and get the inner text inside each node.
27
+ See [Nokogiri](http://nokogiri.org/tutorials/searching_a_xml_html_document.html) for more information.
28
+
29
+ ###### Perhaps this is best understood with an example:
30
+
31
+ ```ruby
32
+ Harvestman.crawl "http://www.24pullrequests.com" do
33
+ headline = xpath "//h3"
34
+ catchy_phrase = css "div.visible-phone h3"
35
+
36
+ puts "Headline: #{headline}"
37
+ puts "Catchy phrase: #{catchy_phrase}"
38
+ end
39
+ ```
40
+
41
+ ## One node to rule them all
42
+
43
+ Harvestman assumes there's only one node at the path you passed to the ``css`` method.
44
+ If there is **more than one node** at that path, you can pass in an additional block.
45
+
46
+ ###### Another example:
47
+
48
+ ```ruby
49
+ Harvestman.crawl 'http://en.wikipedia.org/wiki/Main_Page' do
50
+ # Print today's featured article
51
+ tfa = css "div#mp-tfa"
52
+
53
+ puts "Today's featured article: #{tfa}"
54
+
55
+ # Print all the sister projects
56
+ sister_projects = []
57
+
58
+ css "div#mp-sister b" do
59
+ sister_projects << css("a")
60
+ end
61
+
62
+ puts "Sister projects:"
63
+ sister_projects.each { |sp| puts "- #{sp}" }
64
+ end
65
+ ```
66
+
67
+ Note that inside the block we use ``css("a")`` and *not* ``css("div#mp-sister b a")``. Calls to ``css`` or ``xpath`` here assume ``div#mp-sister b`` is the parent node.
68
+
69
+ ## Pages / Search results
70
+
71
+ If you want to crawl a group of similar pages (eg: search results, as shown above), you can insert a ``*`` somewhere in the URL string and it will be replaced by each element in the second argument.
72
+
73
+ ###### Final example:
74
+
75
+ ```ruby
76
+ require 'harvestman'
77
+
78
+ Harvestman.crawl 'http://www.etsy.com/browse/vintage-category/electronics/*', (1..3) do
79
+ css "div.listing-hover" do
80
+ title = css "div.title a"
81
+ price = css "span.listing-price"
82
+
83
+ puts "* #{title} (#{price})"
84
+ end
85
+ end
86
+ ```
87
+
88
+ The above code is going to crawl Etsy's electronics category pages (from 1 to 3) and output every item's title and price. Here we're using a range ``(1..3)``, but you could have passed an array of search queries instead:
89
+
90
+ "http://www.site.com?query=*", ["dogs", "cats", "birds"]
91
+
92
+ ### Performance
93
+ When using the * feature described above, each page is run inside a separate thread. You can disable multithreading by passing an additional argument `:plain` to the `crawl` method, like this:
94
+
95
+ ```ruby
96
+ require 'harvestman'
97
+
98
+ Harvestman.crawl 'http://www.store.com/products?page=*', (1..99), :plain do
99
+ ...
100
+ end
101
+ ```
102
+
103
+ Needless to say, this will greatly decrease performance.
104
+
105
+ ## License
106
+
107
+ See LICENSE.txt
@@ -0,0 +1,8 @@
# Rake tasks: gem packaging (via Bundler) plus the Test::Unit suite.
require "bundler/gem_tasks"
require "rake/testtask"

# Run the tests under test/ with `rake test`.
Rake::TestTask.new do |test_task|
  test_task.libs << "test"
end

task :default => :test
@@ -0,0 +1,15 @@
require 'harvestman'

# Crawl pages 1 through 3 of Etsy's vintage electronics category and print
# each item's title and price.

base_url = 'http://www.etsy.com/browse/vintage-category/electronics/*'

Harvestman.crawl base_url, (1..3) do
  # Each listing is wrapped in a div.listing-hover node.
  css "div.listing-hover" do
    item_title = css "div.title a"
    item_price = css "span.listing-price"

    puts "* #{item_title} (#{item_price})"
  end
end
@@ -0,0 +1,18 @@
require 'harvestman'

Harvestman.crawl 'http://en.wikipedia.org/wiki/Main_Page' do
  # Today's featured article lives in the div#mp-tfa node.
  tfa = css "div#mp-tfa"

  puts "Today's featured article: #{tfa}"

  # Collect the sister-project links, one per bold entry, then print them.
  sister_projects = []

  css "div#mp-sister b" do
    sister_projects << css("a")
  end

  puts "Sister projects:"
  sister_projects.each { |sp| puts "- #{sp}" }
end
@@ -0,0 +1,25 @@
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'harvestman/version'

Gem::Specification.new do |gem|
  gem.name          = "harvestman"
  gem.version       = Harvestman::VERSION
  gem.authors       = ["Gabriel Vieira"]
  gem.email         = ["gluisvieira@gmail.com"]
  gem.summary       = %q{Lightweight web crawler}
  gem.description   = %q{Harvestman is a simple, lightweight web crawler for quick web scraping.}
  gem.homepage      = ""
  # The bundled LICENSE.txt is MIT; declare it so RubyGems/tools can see it.
  gem.license       = "MIT"

  # Runtime dependencies
  gem.add_dependency "nokogiri", "~> 1.5.6"

  # Development dependencies
  gem.add_development_dependency "rake"
  gem.add_development_dependency "rspec", "~> 2.0"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,36 @@
require "nokogiri"
require "open-uri"

require 'harvestman/version'
require 'harvestman/crawler'

module Harvestman
  # Public: Crawl a website. You can visit similar URLs (eg: pages in a search
  # result) by passing an optional argument.
  #
  # url   - A String containing the url to be crawled.
  # pages - Zero or more Strings that will replace a * in the
  #         base url. Note: this does not need to be an Array.
  # type  - Optional crawler type. :fast (the default) crawls each page in
  #         its own thread; :plain crawls pages sequentially.
  #         (The previous comment called :plain the default, contradicting
  #         the actual `type = :fast` signature below.)
  #
  # Example: Crawl Etsy.com, printing the title and price of each item in
  # pages 1, 2 and 3 of the Electronics category.
  #
  #   Harvestman.crawl 'http://www.etsy.com/browse/vintage-category/electronics/*', (1..3) do
  #     css "div.listing-hover" do
  #       title = css "div.title a"
  #       price = css "span.listing-price"
  #
  #       puts "* #{title} (#{price})"
  #     end
  #   end
  #
  # Returns nothing.
  def self.crawl(url, pages = nil, type = :fast, &block)
    crawler = Harvestman::Crawler.new(url, pages, type)
    # Without a block there is nothing to evaluate against each page.
    crawler.crawl(&block) if block_given?
  end
end
@@ -0,0 +1,24 @@
module Harvestman
  module Crawler
    # Raised when the requested crawler type has not been registered.
    class UnknownCrawler < ArgumentError; end

    # Register a crawler implementation under a symbolic +type+ key.
    def self.register(type, klass)
      @crawlers ||= {}
      @crawlers[type] = klass
    end

    # Factory: instantiate the crawler class registered for +type+.
    # Raises UnknownCrawler for unregistered types.
    def self.new(base_url, pages, type)
      klass = @crawlers[type]
      raise UnknownCrawler, "No such type: #{type}" unless klass
      klass.new(base_url, pages)
    end

    # Load the concrete crawlers, which self-register on load.
    require 'harvestman/crawler/parser'
    require 'harvestman/crawler/base'
    require 'harvestman/crawler/plain'
    require 'harvestman/crawler/fast'
  end
end
@@ -0,0 +1,17 @@
module Harvestman
  module Crawler
    # Shared plumbing for concrete crawlers: stores the target URL (and the
    # optional page substitutions) and knows how to run a crawl block
    # against a single fetched page.
    class Base
      def initialize(base_url, pages)
        @base_url = base_url
        @pages    = pages
      end

      protected

      # Fetch +url+, then evaluate the crawl block in the context of a
      # Parser so the block can call css/xpath directly.
      def crawl_url(url, &block)
        Parser.new(url).instance_eval(&block)
      end
    end
  end
end
@@ -0,0 +1,19 @@
module Harvestman
  module Crawler
    # Multithreaded crawler: each page substitution is fetched in its own
    # thread. This is the default crawler type.
    class Fast < Base
      # Crawl the base URL directly when no pages were given; otherwise
      # spawn one thread per page and wait for all of them.
      #
      # Returns nothing meaningful.
      def crawl(&block)
        if @pages.nil?
          crawl_url(@base_url, &block)
        else
          threads = []
          @pages.each do |p|
            # Pass the page via Thread.new and use the thread-local +page+
            # argument. The original passed +p+ in but then captured the
            # outer variable anyway, leaving the argument unused and
            # defeating the capture-avoidance idiom.
            threads << Thread.new(p) { |page| crawl_url(@base_url.gsub('*', page.to_s), &block) }
          end
          threads.each { |t| t.join }
        end
      end
    end

    register :fast, Fast
  end
end
@@ -0,0 +1,32 @@
module Harvestman
  module Crawler
    # Wraps a Nokogiri HTML document and evaluates crawl blocks against it.
    # Crawl blocks are instance_eval'd here, which is what makes bare
    # css/xpath calls work inside Harvestman.crawl blocks.
    class Parser
      def initialize(url)
        # NOTE(review): Kernel#open (patched by open-uri) accepts both URLs
        # and local file paths; the test suite relies on the local-path
        # behaviour, so don't switch this to URI.open without updating it.
        @document = Nokogiri::HTML(open(url))
      end

      # Search the current node with a CSS selector. Without a block,
      # returns the inner text of the first match; with a block, evaluates
      # the block once per matching node.
      def css(path, &block)
        parse(:css, path, &block)
      end

      # Same as #css but with an XPath expression.
      def xpath(path, &block)
        parse(:xpath, path, &block)
      end

      private

      def parse(path_type, path, &block)
        unless block_given?
          # Single-node lookup. NOTE(review): raises NoMethodError when the
          # path matches nothing (at_css/at_xpath return nil).
          return @document.send("at_#{path_type}", path).inner_text
        end

        @document.send(path_type, path).each do |node|
          # Temporarily rebind @document so nested css/xpath calls inside
          # the block are scoped to this node, then restore it.
          previous = @document
          @document = node
          instance_eval(&block)
          @document = previous
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
module Harvestman
  module Crawler
    # Sequential crawler: pages are fetched one after another, with no
    # threading. Selected by passing :plain to Harvestman.crawl.
    class Plain < Base
      # Crawl the base URL directly when no pages were given; otherwise
      # crawl each page substitution in order.
      def crawl(&block)
        return crawl_url(@base_url, &block) if @pages.nil?

        @pages.each do |page|
          crawl_url(@base_url.gsub('*', page.to_s), &block)
        end
      end
    end

    register :plain, Plain
  end
end
@@ -0,0 +1,3 @@
module Harvestman
  # Gem version, referenced by the gemspec.
  VERSION = "0.1.1"
end
@@ -0,0 +1,5 @@
require 'spec_helper'

describe Harvestman do
  # Smoke test: the top-level namespace is defined.
  # (`it` is an RSpec alias of `specify`.)
  it { Harvestman.should be_a(Module) }
end
@@ -0,0 +1,4 @@
# Make lib/ loadable without installing the gem first.
# Uses the readable $LOAD_PATH name instead of the cryptic $: alias.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))

require 'rspec'
require 'harvestman'
@@ -0,0 +1,23 @@
1
+ <html lang="en">
2
+ <head>
3
+ <meta charset="UTF-8" />
4
+ <title>ex1</title>
5
+ </head>
6
+ <body>
7
+ <header>
8
+ <div class="title">
9
+ <h1>ex1_header_h1</h1>
10
+ </div>
11
+ </header>
12
+ <div class="main">
13
+ <ul>
14
+ <li>ex1_main_li_1</li>
15
+ <li>ex1_main_li_2</li>
16
+ <li>ex1_main_li_3</li>
17
+ </ul>
18
+ </div>
19
+ <footer>
20
+ <span><a href="#">ex1_footer_span_a</a></span>
21
+ </footer>
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ <html lang="en">
2
+ <head>
3
+ <meta charset="UTF-8" />
4
+ <title>ex2</title>
5
+ </head>
6
+ <body>
7
+ <header>
8
+ <div class="title">
9
+ <h1>ex2_header_h1</h1>
10
+ </div>
11
+ </header>
12
+ <div class="main">
13
+ <ul>
14
+ <li>ex2_main_li_1</li>
15
+ <li>ex2_main_li_2</li>
16
+ <li>ex2_main_li_3</li>
17
+ </ul>
18
+ </div>
19
+ <footer>
20
+ <span><a href="#">ex2_footer_span_a</a></span>
21
+ </footer>
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ <html lang="en">
2
+ <head>
3
+ <meta charset="UTF-8" />
4
+ <title>ex3</title>
5
+ </head>
6
+ <body>
7
+ <header>
8
+ <div class="title">
9
+ <h1>ex3_header_h1</h1>
10
+ </div>
11
+ </header>
12
+ <div class="main">
13
+ <ul>
14
+ <li>ex3_main_li_1</li>
15
+ <li>ex3_main_li_2</li>
16
+ <li>ex3_main_li_3</li>
17
+ </ul>
18
+ </div>
19
+ <footer>
20
+ <span><a href="#">ex3_footer_span_a</a></span>
21
+ </footer>
22
+ </body>
23
+ </html>
@@ -0,0 +1,2 @@
1
+ require 'test/unit'
2
+ require 'harvestman'
@@ -0,0 +1,33 @@
require 'helper'

class TestHarvestman < Test::Unit::TestCase
  # The library defines the top-level Harvestman namespace.
  def test_namespace
    assert Harvestman.is_a?(Module)
  end

  # Crawl the three local fixture pages (example1..3.html) sequentially and
  # check every scraped field against the known fixture content.
  def test_scraping
    results = []

    Harvestman.crawl "test/example*.html", (1..3), :plain do
      r = {
        :title  => css("head title"),
        :header => css("header div.title h1"),
        :footer => css("footer span a"),
        :list   => []
      }

      css "div.main ul" do
        r[:list] << css("li")
      end

      results << r
    end

    results.each_with_index do |r, i|
      # assert_equal takes (expected, actual); the original had the
      # arguments reversed, which yields misleading failure messages.
      assert_equal("ex#{i + 1}", r[:title])
      assert_equal("#{r[:title]}_header_h1", r[:header])
      assert_equal("#{r[:title]}_footer_span_a", r[:footer])
      assert_equal(3, r[:list].count)
    end
  end
end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: harvestman
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gabriel Vieira
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.6
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 1.5.6
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '2.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ description:
63
+ email:
64
+ - gluisvieira@gmail.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - LICENSE.txt
72
+ - README.md
73
+ - Rakefile
74
+ - examples/etsy.rb
75
+ - examples/wikipedia.rb
76
+ - harvestman.gemspec
77
+ - lib/harvestman.rb
78
+ - lib/harvestman/crawler.rb
79
+ - lib/harvestman/crawler/base.rb
80
+ - lib/harvestman/crawler/fast.rb
81
+ - lib/harvestman/crawler/parser.rb
82
+ - lib/harvestman/crawler/plain.rb
83
+ - lib/harvestman/version.rb
84
+ - spec/harvestman_spec.rb
85
+ - spec/spec_helper.rb
86
+ - test/example1.html
87
+ - test/example2.html
88
+ - test/example3.html
89
+ - test/helper.rb
90
+ - test/test_harvestman.rb
91
+ homepage: ''
92
+ licenses: []
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ segments:
104
+ - 0
105
+ hash: 129965805
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ segments:
113
+ - 0
114
+ hash: 129965805
115
+ requirements: []
116
+ rubyforge_project:
117
+ rubygems_version: 1.8.25
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: Lightweight web crawler
121
+ test_files:
122
+ - spec/harvestman_spec.rb
123
+ - spec/spec_helper.rb
124
+ - test/example1.html
125
+ - test/example2.html
126
+ - test/example3.html
127
+ - test/helper.rb
128
+ - test/test_harvestman.rb