caule 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format Fivemat
3
+ --order rand
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Rafael Souza
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,29 @@
1
+ # Caule
2
+
3
+ DSL to build crawlers easily.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'caule'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install caule
18
+
19
+ ## Usage
20
+
21
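+ Build a crawler with `Caule.start(url) { |bot| ... }` (or `Caule::Bot.new(url)` followed by `run`), registering hooks on the bot with `on_every_page`, `on_pages_like` and `focus_crawl`.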
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/caule/version', __FILE__)
3
+
# Gem specification for caule: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  gem.authors       = ["Rafael Souza"]
  gem.email         = ["me@rafaelss.com"]
  gem.description   = %q{DSL to build crawlers easily}
  gem.summary       = %q{DSL to build crawlers easily}
  gem.homepage      = "http://github.com/rafaelss/caule"

  # `git ls-files` prints one path per line. Split on "\n" explicitly: the
  # previous separator, $\, is nil by default, which makes String#split break
  # on ANY whitespace and therefore tears apart file names containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "caule"
  gem.require_paths = ["lib"]
  gem.version       = Caule::VERSION

  gem.add_dependency "mechanize", "~> 2.3"
  gem.add_development_dependency "rspec", "~> 2.9.0"
  gem.add_development_dependency "fivemat", "~> 1.0.0"
  gem.add_development_dependency "webmock", "~> 1.8.6"
end
@@ -0,0 +1,10 @@
# Scratch script that drives the crawler DSL against a live site.
# The original required an empty path (LoadError) and left a dangling
# `page.` expression, so it could not even be parsed.
require "caule"

crawler = Caule::Bot.new("http://sfbay.craigslist.org/")

# Hook for pages whose URI matches /forums/.
# NOTE(review): the original body was the unfinished expression `page.`;
# printing the URI is a placeholder - replace with the intended behaviour.
crawler.on_pages_like(/forums/) do |page|
  puts page.uri
end

# Hook for every crawled page; intentionally empty, as in the original.
crawler.on_every_page do |_page|
end

crawler.run
@@ -0,0 +1,11 @@
1
+ require "caule/version"
2
+
# Top-level namespace of the caule crawler DSL.
module Caule
  # Bot is loaded from "caule/bot" the first time the constant is referenced.
  autoload :Bot, "caule/bot"

  # Builds a Bot for the given seed URLs, yields it so hooks can be
  # registered, then starts the crawl.
  #
  # The block is optional: calling without one simply runs the bot with no
  # extra configuration (previously this raised LocalJumpError).
  #
  # @param urls [Array<String>] seed URLs, forwarded untouched to Bot.new
  # @yieldparam bot [Caule::Bot] the bot, before it starts crawling
  # @return [Object] whatever Bot#run returns
  def self.start(*urls)
    bot = Bot.new(*urls)
    yield bot if block_given?
    bot.run
  end
end
@@ -0,0 +1,79 @@
1
+ require "mechanize"
2
+
module Caule
  # Crawler that fetches a list of seed URLs, keeps following the links found
  # on every fetched page, and calls user-registered hooks for each page whose
  # URI matches the hook's pattern.
  class Bot
    # urls  - the flattened list of seed URLs given to the constructor.
    # agent - the Mechanize instance used for every request.
    attr_reader :urls, :agent

    # @param urls [Array] seed URLs; nested arrays are flattened
    def initialize(*urls)
      @urls = urls.flatten
      @agent = Mechanize.new do |a|
        a.user_agent_alias = "Windows IE 7"
      end

      # pattern => list of blocks; each new pattern gets its own empty list.
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
    end

    # Registers a hook that runs for every page with a non-empty URI.
    #
    # @yieldparam page the fetched page
    # @raise [ArgumentError] when no block is given
    def on_every_page(&block)
      on_pages_like(/.+/, &block)
    end

    # Registers a hook for pages whose URI string matches any of +patterns+.
    # The block is stored once per pattern, so a page matching several
    # patterns triggers it several times.
    #
    # @param patterns [Array<Regexp>] patterns tested with `=~` against the URI
    # @yieldparam page the fetched page
    # @raise [ArgumentError] when no block is given; storing nil would only
    #   surface later, as a NoMethodError in the middle of a crawl
    def on_pages_like(*patterns, &block)
      raise ArgumentError, "a block is required" unless block

      patterns.each { |pattern| @on_pages_like_blocks[pattern] << block }
    end

    # Crawls every seed URL in order: fetches it, runs the hooks, then works
    # through a FIFO queue of links that is refilled from each fetched page.
    #
    # Mechanize::ResponseCodeError raised while following a link is reported
    # on stderr and the crawl continues. An error raised while fetching a seed
    # URL is not rescued and propagates to the caller.
    #
    # @return [Array] the seed URLs
    def run
      urls.each do |url|
        page = agent.get(url)
        do_page_blocks(page)

        queue = links_to_follow(page)
        while (link = queue.shift)
          begin
            page = link.click
            # Anything that is not a Mechanize::Page is skipped entirely.
            next unless Mechanize::Page === page

            do_page_blocks(page)
            queue.push(*links_to_follow(page))
          rescue Mechanize::ResponseCodeError => ex
            # Plain `warn`: the previous `ex.message.red` relied on a
            # String#red that no declared dependency defines, so the handler
            # itself blew up with NoMethodError.
            warn ex.message
          end
        end
      end
    end

    # Restricts which links are followed. The block receives each fetched
    # page and returns the links to follow from it; nil (or nil entries) means
    # "nothing to follow". Without a focus block, all links of a page are
    # candidates.
    #
    # @yieldparam page the fetched page
    # @yieldreturn [Array, nil] links to follow from that page
    def focus_crawl(&block)
      @focus_crawl_block = block
    end

    protected

    # Calls every registered block whose pattern matches the page's URI,
    # in registration order of the patterns.
    def do_page_blocks(page)
      uri = page.uri.to_s

      @on_pages_like_blocks.each do |pattern, blocks|
        next unless uri =~ pattern

        blocks.each { |block| block.call(page) }
      end
    end

    # Returns copies of the links on +page+ that should be followed next.
    def links_to_follow(page)
      candidates =
        if @focus_crawl_block
          # Array() accepts nil, a single link or a list; #compact (instead of
          # #compact!) leaves the caller's own array untouched, so frozen
          # arrays work too.
          Array(@focus_crawl_block.call(page)).compact
        else
          page.links
        end

      candidates.select { |link| visit_link?(link) }.map { |link| link.dup }
    end

    # A link is followed only if the agent has not visited its href yet.
    def visit_link?(link)
      !agent.visited?(link.href)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Caule # namespace shared with lib/caule.rb and lib/caule/bot.rb
2
+ VERSION = "0.0.1" # gem version; the gemspec assigns it to gem.version
3
+ end
@@ -0,0 +1,59 @@
1
+ require "spec_helper"
2
+
# Specs for the Bot hooks. Each example records the URIs its hook receives and
# asserts on them after the crawl, so an example fails when a hook is never
# called (expectations placed inside the hook would pass vacuously).
describe Caule::Bot do
  let(:root_url)   { "http://sfbay.craigslist.org/" }
  let(:about_url)  { "http://sfbay.craigslist.org/about" }
  let(:forums_url) { "http://sfbay.craigslist.org/forums" }

  # Stubs a GET to +url+ that answers 200 text/html with +body+.
  def stub_html(url, body = "")
    stub_request(:get, url).
      to_return(:status => 200, :body => body, :headers => { "Content-Type" => "text/html" })
  end

  context "hooks" do
    describe "on_every_page" do
      subject { described_class.new(root_url, about_url) }

      it "runs on every page" do
        stub_html(root_url)
        stub_html(about_url)

        visited = []
        subject.on_every_page { |page| visited << page.uri.to_s }
        subject.run

        visited.should == [root_url, about_url]
      end
    end

    describe "on_pages_like" do
      subject { described_class.new(root_url, about_url) }

      it "runs on pages that match the regular expression passed" do
        stub_html(root_url)
        stub_html(about_url)

        matched = []
        subject.on_pages_like(/about/) { |page| matched << page.uri.to_s }
        subject.run

        matched.should == [about_url]
      end
    end

    describe "focus_crawl" do
      subject { described_class.new(root_url) }

      it "crawls only links returned by the block" do
        # /about is linked from the root page but deliberately left unstubbed:
        # it must never be requested.
        stub_html(root_url, "<a href='/about'>about</a><a href='/forums'></a>")
        stub_html(forums_url)

        visited = []
        subject.on_every_page { |page| visited << page.uri.to_s }
        subject.focus_crawl { |page| page.links_with(:href => /forums$/) }
        subject.run

        visited.should == [root_url, forums_url]
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require "spec_helper"
2
+
3
+ describe Caule do # spec for the module-level Caule.start helper
4
+ it "instantiates bot, yield the instance and run the crawler" do
5
+ bot = nil # will hold whatever Caule.start yields
6
+ Caule.start { |b| bot = b } # called with no seed URLs
7
+ bot.should be_instance_of(Caule::Bot)
8
+ end
9
+ end
@@ -0,0 +1,8 @@
1
+ require "caule"
2
+ require "webmock/rspec"
3
+
4
+ RSpec.configure do |config| # suite-wide RSpec settings
5
+ config.treat_symbols_as_metadata_keys_with_true_values = true # presumably lets a bare :focus tag mean :focus => true - confirm against the RSpec 2.x docs
6
+ config.run_all_when_everything_filtered = true # presumably runs the whole suite when the filter matches nothing - confirm
7
+ config.filter_run :focus # filters the run on the :focus tag
8
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: caule
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Rafael Souza
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &70237564894140 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.3'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70237564894140
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ requirement: &70237564893620 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 2.9.0
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70237564893620
36
+ - !ruby/object:Gem::Dependency
37
+ name: fivemat
38
+ requirement: &70237564893140 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 1.0.0
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70237564893140
47
+ - !ruby/object:Gem::Dependency
48
+ name: webmock
49
+ requirement: &70237564892680 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 1.8.6
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70237564892680
58
+ description: DSL to build crawlers easily
59
+ email:
60
+ - me@rafaelss.com
61
+ executables: []
62
+ extensions: []
63
+ extra_rdoc_files: []
64
+ files:
65
+ - .gitignore
66
+ - .rspec
67
+ - Gemfile
68
+ - LICENSE
69
+ - README
70
+ - Rakefile
71
+ - caule.gemspec
72
+ - crawler_test.rb
73
+ - lib/caule.rb
74
+ - lib/caule/bot.rb
75
+ - lib/caule/version.rb
76
+ - spec/caule/bot_spec.rb
77
+ - spec/caule_spec.rb
78
+ - spec/spec_helper.rb
79
+ homepage: http://github.com/rafaelss/caule
80
+ licenses: []
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project:
99
+ rubygems_version: 1.8.11
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: DSL to build crawlers easily
103
+ test_files:
104
+ - spec/caule/bot_spec.rb
105
+ - spec/caule_spec.rb
106
+ - spec/spec_helper.rb