rumba-crawler 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d37db998d18c3cfbe4e350026c474f37a05e2469
4
+ data.tar.gz: 1df7bf2ad9d67e46440bd5a0fa57e6b73893a52d
5
+ SHA512:
6
+ metadata.gz: 92455f66a6efcfc923b4d0d9f760f405418f9673e65dad7908e00ea0b103c5b2f0f103eabc9c6bb1fbcbbb90dfddc0ecf7189e87b0b5df24b23658515f6aa097
7
+ data.tar.gz: a8dc4d46b1a2db70b222ad383cebcd756a5e2e2dcc866a35da78cab04dea11a113722f69b79022b69a70829f8271e80ea4f583a1ca5715f1c7c42d7cc857f97c
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ script: bundle exec rspec spec/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rumba-crawler.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Volodymyr Ladnik
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,86 @@
1
+ # Rumba::Crawler
2
+ [![Dependency Status](https://gemnasium.com/vladnik/rumba-crawler.png)](https://gemnasium.com/vladnik/rumba-crawler)
3
+ [![Build Status](https://travis-ci.org/vladnik/rumba-crawler.png?branch=master)](https://travis-ci.org/vladnik/rumba-crawler)
4
+ [![Code Climate](https://codeclimate.com/github/vladnik/rumba-crawler.png)](https://codeclimate.com/github/vladnik/rumba-crawler)
5
+
6
+ Web crawler with JSON-based DSL and EventMachine-powered page fetching
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ gem 'rumba-crawler'
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install rumba-crawler
21
+
22
+ ## Usage
23
+ Gem supports ```"css", "root" and "regexp"``` service keys:
24
+ * css: CSS locator for node
25
+ * root: custom root for current node (parent node is used by default)
26
+ * regexp: regular expression to extract data
27
+
28
+ If you have multiple objects of the same type, put object description into ```[ ]```.
29
+
30
+ Besides, it supports a shortcut for the ```"css"``` attribute, so if you don't have any nested nodes (leaf node)
31
+ and you don't need ```"root" and "regexp"``` keys, you can omit "css" key and provide locator as string value
32
+ (```"name": "span.name"``` in example).
33
+
34
+ Map your models to page structure like this:
35
+ ```json
36
+ [
37
+ {
38
+ "game": {
39
+ "css":".games",
40
+ "teams":[
41
+ {
42
+ "css": ".teams",
43
+ "name": "span.name",
44
+ "scores": {
45
+ "css":".score",
46
+ "regexp":"[0-9]+"
47
+ },
48
+ "sport": {
49
+ "css":"#sport",
50
+ "root":"body"
51
+ }
52
+ }
53
+ ]
54
+ }
55
+ }
56
+ ]
57
+ ```
58
+ This will allow you to extract data from page for current model structure:
59
+ ```ruby
60
+ class Game
61
+ # has_many :teams
62
+ def teams=(array_value)
63
+ array_value.each do |value|
64
+ Team.find_or_create(value)
65
+ end
66
+ end
67
+ end
68
+ class Team
69
+ # belongs_to :game
70
+ # belongs_to :sport
71
+ attr_writer :name, :scores
72
+ def sport=(value)
73
+ Sport.find_or_create(value)
74
+ end
75
+ end
76
+ class Sport
77
+ # has_many :teams
78
+ end
79
+ ```
80
+ ## Contributing
81
+
82
+ 1. Fork it
83
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
84
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
85
+ 4. Push to the branch (`git push origin my-new-feature`)
86
+ 5. Create new Pull Request
@@ -0,0 +1,4 @@
1
# Rake entry point: pulls in the standard gem tasks (build/install/release)
# and wires up the RSpec suite as the default task.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# `rake spec` runs the test suite; bare `rake` does the same.
RSpec::Core::RakeTask.new('spec')
task default: :spec
@@ -0,0 +1,28 @@
1
+ require "date"
2
+ require "json"
3
+ require "em-http-request"
4
+ require "nokogiri"
5
+ require "rumba/crawler/version"
6
+ require "rumba/crawler/exceptions"
7
+ require "rumba/crawler/models"
8
+ require "rumba/crawler/parser"
9
+
10
module Rumba
  module Crawler
    # Fetches +url+ (GET, with +query+ parameters) inside an EventMachine
    # reactor and yields the response body string to the caller's block.
    #
    # Raises Exceptions::BadResponse on a connection failure or on an HTTP
    # status >= 400. The error is recorded inside the reactor and raised only
    # after EventMachine.run returns: the original code raised from inside
    # the errback/callback, which made the `EventMachine.stop` after the
    # raise dead code and unwound the reactor via the exception.
    def self.get_data(url, query)
      error = nil
      EventMachine.run {
        http = EventMachine::HttpRequest.new(url).get query: query
        http.errback {
          error = Exceptions::BadResponse
          EventMachine.stop
        }
        http.callback {
          if http.response_header.status >= 400
            error = Exceptions::BadResponse
          else
            yield(http.response)
          end
          EventMachine.stop
        }
      }
      raise error if error
    end
  end
end
@@ -0,0 +1,8 @@
1
module Rumba
  module Crawler
    # Namespace for crawler-specific error classes.
    module Exceptions
      # Raised when an HTTP request fails or returns an error status
      # (see Rumba::Crawler.get_data).
      BadResponse = Class.new(StandardError)
    end
  end
end
@@ -0,0 +1,6 @@
1
module Rumba
  module Crawler
    # Empty placeholder mixin included by Parser. Parser#create_object calls
    # `send(name)` for each template key, so this module is presumably meant
    # to be filled in by the host application with factory methods returning
    # model instances (see the Game/Team example in the README) — TODO confirm.
    module Models
    end
  end
end
@@ -0,0 +1,72 @@
1
module Rumba
  module Crawler
    # Walks a JSON template (see README) against an HTML document and builds
    # model objects from it. Factory methods named after template keys must
    # be available on the parser (via the Models mixin / host application):
    # `send(name)` must return an object with `key=` writers for every
    # non-service key in the template.
    class Parser
      include Rumba::Crawler::Models

      # Service Keys: template entries that control node lookup/extraction
      # ('css' locator, 'root' override, 'regexp' filter) rather than naming
      # a model attribute. Frozen so the shared constant cannot be mutated.
      SK = ['css', 'root', 'regexp'].freeze

      # Entry point. +response+ is an HTML string, +template+ a JSON string.
      # An Array template produces a list of objects (only the first element
      # of the array is used as the per-object template); an Object template
      # produces a single object — or extracted text, for a leaf template.
      def process(response, template)
        template = JSON.parse(template)
        # Kept in an ivar: get_node's 'root' lookup searches from the whole
        # document rather than the current parent node.
        @doc = Nokogiri::HTML(response)
        if template.is_a? Array
          parse_multi(@doc, template.first)
        else
          parse_node(@doc, template, template.keys.first)
        end
      end

      # Builds one object per node matched under +doc+ for each key/value
      # pair of +template+, preserving document order.
      def parse_multi(doc, template)
        template.flat_map do |key, value|
          get_node(doc, value).map { |node| create_object(key, node, value) }
        end
      end

      # Resolves a single template entry against the first matching node:
      # extracts text for a leaf, otherwise builds a nested object.
      def parse_node(doc, template, name)
        node = get_node(doc, template).first
        if leaf_node?(template)
          get_content(node, template)
        else
          create_object(name, node, template)
        end
      end

      # Instantiates the model via `send(name)` and assigns every
      # non-service template key through the matching `key=` writer.
      # Array values recurse via parse_multi, everything else via parse_node.
      def create_object(name, node, template)
        object = send(name)
        template.reject { |key, _| SK.include?(key) }.each do |key, value|
          if value.is_a? Array
            object.send("#{key}=", parse_multi(node, value.first))
          else
            object.send("#{key}=", parse_node(node, value, key))
          end
        end
        object
      end

      # Returns the node's text content, optionally narrowed by the
      # template's (case-insensitive) 'regexp'. An unmatched regexp yields
      # "" because `nil.to_s` is "".
      def get_content(node, template)
        if template['regexp']
          /#{template['regexp']}/i.match(node.content).to_s
        else
          node.content
        end
      end

      # Locates nodes for a template: a plain String is the CSS-locator
      # shortcut; 'root' restarts the search from the top of the whole
      # document (@doc) instead of the current parent node.
      def get_node(doc, template)
        if template.is_a?(String)
          doc.css(template)
        elsif template['root']
          @doc.css(template['root']).css(template['css'])
        else
          doc.css(template['css'])
        end
      end

      # A leaf is either a CSS-shortcut String or a hash containing only
      # service keys (nothing left to map onto a model).
      def leaf_node?(template)
        template.is_a?(String) || template.reject { |key, _| SK.include?(key) }.empty?
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Rumba
  module Crawler
    # Public gem version, also read by the gemspec. Frozen: constant
    # strings should not be mutable.
    VERSION = "0.1".freeze
  end
end
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem specification for rumba-crawler. Loaded by `gem build` / Bundler;
# must run from the repository root (uses `git ls-files` for the manifest).
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rumba/crawler/version'

Gem::Specification.new do |spec|
  spec.name = "rumba-crawler"
  spec.version = Rumba::Crawler::VERSION
  spec.authors = ["Volodymyr Ladnik"]
  spec.email = ["Volodymyr.UA@gmail.com"]
  spec.summary = %q{Web crawler with JSON-based DSL and EventMachine-powered page fetching}
  spec.description = spec.summary
  spec.homepage = ""
  spec.license = "MIT"

  # Manifest comes from git, so uncommitted files are not packaged.
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies match the requires in lib/rumba/crawler.rb.
  spec.add_dependency "json", "~> 1.8.0"
  spec.add_dependency "em-http-request", "~> 1.0.3"
  spec.add_dependency "nokogiri", "~> 1.5.9"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "webmock"
end
@@ -0,0 +1,101 @@
1
require 'spec_helper'

# Unit specs for Rumba::Crawler::Parser. Each method is tested in isolation
# with its collaborators stubbed out.
# NOTE(review): written in legacy RSpec-2 `should`/`stub` syntax, which the
# unversioned rspec dependency happened to provide at release time; a newer
# rspec would need migration to `expect`/`allow`.
module Rumba::Crawler
  describe Parser do
    let(:parser) { Parser.new }

    describe '#process' do
      it "invokes parse_multi for Array template" do
        # Stub Nokogiri parsing so the raw HTML string never matters.
        Nokogiri::HTML::Document.stub(parse: :parsed_doc)
        parser.should_receive(:parse_multi).with(:parsed_doc, 'games').and_return([:result])
        parser.process('<html><html>','["games"]').should eq [:result]
      end

      it "invokes parse_node for Object template" do
        Nokogiri::HTML::Document.stub(parse: :parsed_doc)
        parser.should_receive(:parse_node).with(:parsed_doc, {"game" => 123}, 'game').and_return(:result)
        parser.process('<html><html>','{"game": 123}').should eq :result
      end
    end

    describe '#parse_multi' do
      it "creates object for every matching HTML node" do
        # Two matched nodes -> two created objects, in order.
        parser.stub(get_node: [:node, :node])
        parser.should_receive(:create_object).with(:key, :node, :value).twice.and_return(:result)
        parser.parse_multi(:doc, {key: :value}).should eq [:result, :result]
      end
    end

    describe '#parse_node' do
      it "gets content for template leaf node" do
        parser.stub(get_node: [:node])
        parser.stub(leaf_node?: true)
        parser.should_receive(:get_content).with(:node, :template).and_return('some content')
        parser.parse_node(:doc, :template, :name).should eq 'some content'
      end
      it "creates object for template parent node" do
        parser.stub(get_node: [:node])
        parser.stub(leaf_node?: false)
        parser.should_receive(:create_object).with(:name, :node, :template).and_return(:object)
        parser.parse_node(:doc, :template, :name).should eq :object
      end
    end

    describe "#create_object" do
      it "creates an object" do
        # create_object builds the model by calling `send(name)` on the parser.
        parser.should_receive(:name).and_return(:object)
        parser.create_object(:name, :node, {'css' => 'body'}).should eq :object
      end
      it "invokes a setter with parse_multi method for object attributes with Array value" do
        object = double("object")
        object.should_receive(:name=).with(:result)
        parser.should_receive(:parse_multi).with(:node, 'attr').and_return(:result)
        parser.stub(game: object)
        parser.create_object(:game, :node, {'css' => 'body', 'name' => ['attr']})
      end
      it "invokes a setter with parse_node method for object other attributes" do
        object = double("object")
        object.should_receive(:name=).with(:result)
        parser.should_receive(:parse_node).with(:node, {"attr"=>"value"}, "name").and_return(:result)
        parser.stub(game: object)
        parser.create_object(:game, :node, {'css' => 'body', 'name' => {'attr' => 'value'}})
      end
    end

    describe "#get_content" do
      it "get node content" do
        node = double("node")
        node.should_receive(:content).and_return('some content')
        parser.get_content(node, {}).should eq 'some content'
      end
      it "uses provided regexp to extract content" do
        node = double("node")
        node.should_receive(:content).and_return('content123')
        # Only the regexp-matched portion of the content is returned.
        parser.get_content(node, {'regexp' => '[a-z]+'}).should eq 'content'
      end
    end

    describe "#get_node" do
      it "uses css locator provided" do
        node = double("node")
        node.should_receive(:css).with('locator').and_return(:node)
        parser.get_node(node, {'css' => 'locator'}).should eq :node
      end
      it "uses shortcut css locator" do
        # A bare String template is treated directly as a CSS locator.
        node = double("node")
        node.should_receive(:css).with('locator').and_return(:node)
        parser.get_node(node, 'locator').should eq :node
      end
    end

    describe "#leaf_node?" do
      it "identifies shortcut locator" do
        parser.leaf_node?('locator').should eq true
      end
      it "identifies non-service attributes" do
        # Hashes with only service keys are leaves; any other key means children.
        parser.leaf_node?({'css' => 'locator'}).should eq true
        parser.leaf_node?({'name' => 'locator'}).should eq false
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
require 'spec_helper'

# Integration-style specs for Rumba::Crawler.get_data. WebMock (loaded in
# spec_helper) intercepts the EventMachine HTTP requests, so no network
# access occurs.
# NOTE(review): legacy RSpec-2 `should` syntax, same caveat as parser_spec.
module Rumba::Crawler
  describe '#get_data' do
    it "makes GET request to specified url with specified parameters" do
      stub = stub_request(:get, "http://example.com/?date=now").to_return(status: 200, body: '', headers: {})
      Rumba::Crawler.get_data('http://example.com', {date: 'now'}) { |response| }
      stub.should have_been_requested
    end

    it "invokes callback method with response data" do
      # The block passed to get_data must receive the response body.
      callback = double("callback")
      callback.should_receive(:process_response).with('response data')
      stub_request(:get, "http://example.com/?date=now").to_return(status: 200, body: 'response data', headers: {})
      Rumba::Crawler.get_data('http://example.com', {date: 'now'}) { |response| callback.process_response(response)}
    end

    it "should raise exception on bad response" do
      # Any status >= 400 is surfaced as Exceptions::BadResponse.
      stub_request(:get, "http://example.com/?date=now").to_return(status: 400)
      expect {Rumba::Crawler.get_data('http://example.com', {date: 'now'}) { |response| }}.to raise_error(Exceptions::BadResponse)
    end
  end
end
@@ -0,0 +1,9 @@
1
require 'rspec'
# WebMock must be loaded before the library so all HTTP requests in specs
# are intercepted rather than hitting the network.
require 'webmock/rspec'
require 'rumba/crawler'


RSpec.configure do |config|
  # NOTE(review): `color_enabled` is the RSpec-2-era setting (later renamed
  # to `color`); kept because the gemspec does not pin an rspec version.
  config.color_enabled = true
  config.formatter = :documentation
end
metadata ADDED
@@ -0,0 +1,160 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rumba-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Volodymyr Ladnik
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-06-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: json
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.8.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: em-http-request
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 1.0.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 1.5.9
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.5.9
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '1.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rake
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: webmock
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Web crawler with JSON-based DSL and EventMachine-powered page fetching
112
+ email:
113
+ - Volodymyr.UA@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - .travis.yml
120
+ - Gemfile
121
+ - LICENSE.txt
122
+ - README.md
123
+ - Rakefile
124
+ - lib/rumba/crawler.rb
125
+ - lib/rumba/crawler/exceptions.rb
126
+ - lib/rumba/crawler/models.rb
127
+ - lib/rumba/crawler/parser.rb
128
+ - lib/rumba/crawler/version.rb
129
+ - rumba-crawler.gemspec
130
+ - spec/crawler/parser_spec.rb
131
+ - spec/crawler_spec.rb
132
+ - spec/spec_helper.rb
133
+ homepage: ''
134
+ licenses:
135
+ - MIT
136
+ metadata: {}
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ requirements:
148
+ - - '>='
149
+ - !ruby/object:Gem::Version
150
+ version: '0'
151
+ requirements: []
152
+ rubyforge_project:
153
+ rubygems_version: 2.0.3
154
+ signing_key:
155
+ specification_version: 4
156
+ summary: Web crawler with JSON-based DSL and EventMachine-powered page fetching
157
+ test_files:
158
+ - spec/crawler/parser_spec.rb
159
+ - spec/crawler_spec.rb
160
+ - spec/spec_helper.rb