klepto 0.1.0
- data/.gitignore +21 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +129 -0
- data/Rakefile +7 -0
- data/klepto.gemspec +26 -0
- data/lib/klepto.rb +26 -0
- data/lib/klepto/bot.rb +59 -0
- data/lib/klepto/browser.rb +18 -0
- data/lib/klepto/crawler.rb +72 -0
- data/lib/klepto/tasks.rb +15 -0
- data/lib/klepto/version.rb +3 -0
- data/samples/example.rb +49 -0
- data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
- data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
- data/spec/lib/klepto/bot_spec.rb +40 -0
- data/spec/lib/klepto/browser_spec.rb +15 -0
- data/spec/lib/klepto/crawler_spec.rb +88 -0
- data/spec/lib/klepto/dsl_spec.rb +6 -0
- data/spec/lib/klepto_spec.rb +64 -0
- data/spec/orm/active_record.rb +36 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +32 -0
- metadata +157 -0
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+log
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+spec/orm/database.yml
+.DS_Store
+**/.DS_Store
data/.rspec
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
+rvm use ruby-1.9.3-p194@klepto
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in klepto.gemspec
+gemspec
+gem 'rspec', '2.12.0'
+gem 'rb-fsevent'
+gem "guard", '1.6.1'
+gem "guard-bundler"
+gem "guard-rspec"
+gem 'debugger', '1.2.0'
+gem 'vcr'
+gem 'fakeweb'
+#gem 'activerecord', "~>3.2.0"
+gem 'activerecord'
+gem 'mysql2'
+gem 'rb-fsevent'
+gem 'ruby_gntp'
+gem 'simplecov', :require => false
data/Guardfile
ADDED
@@ -0,0 +1,11 @@
+guard 'bundler' do
+  watch('Gemfile')
+  # Uncomment next line if Gemfile contain `gemspec' command
+  # watch(/^.+\.gemspec/)
+end
+
+guard 'rspec', :version => 2 do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb') { "spec" }
+end
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 Cory O'Daniel
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,129 @@
+# Klepto
+
+A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data into your Rails app.
+
+## Features
+
+* CSS or XPath Syntax
+* Full javascript processing via phantomjs / poltergeist
+* All the fun of capybara
+* Scrape multiple pages with a single bot
+* Scrape individual pages with multiple 'crawlers', see Bieber example.
+* Test coverage!
+
+## Usage
+Say you want a bunch of Bieb tweets! How is there not profit in that?
+
+```ruby
+# Make a bot
+@bot = Klepto::Bot.new do
+  # Set your selector syntax. You can change to :xpath if you are 40+ or love C#.
+  syntax :css
+
+  # Send some headers, confuse everyone.
+  headers 'Referer' => 'http://www.twitter.com'
+
+  # The more the merrier. It takes a *splat.
+  urls 'https://twitter.com/justinbieber'
+
+  # Crawl the body of the page to get the user info
+  crawl 'body' do
+    # The default handler is to call .text on the scraped node.
+    scrape "h1.fullname", :name
+
+    # Scrape finds the first matching element for the given selector within
+    # the scope above (here: 'body')
+    scrape '.username span.screen-name', :username
+
+    # Scrape all matching elements with #scrape_all
+    scrape_all 'span.url a' do |nodes|
+      {
+        links: nodes.map{|n| n[:href]}
+      }
+    end
+
+    # Each 'match' of the crawler's selector (here: 'body') will have the
+    # content from 'scrape' passed in as a hash
+    save do |params|
+      user = User.find_by_name(params[:username]) || User.new
+      user.update_attributes params
+    end
+  end
+
+  # Get dem tweets
+  crawl 'li.stream-item' do
+    # Passing no parameters to scrape will set the context to be the
+    # outer matched crawled element (here: 'li.stream-item')
+    scrape do |node|
+      {:twitter_id => node['data-item-id']}
+    end
+
+    # Put '.content p' into params[:content]
+    scrape '.content p', :content
+
+    # Pass a block for more control
+    scrape '._timestamp' do |node|
+      {timestamp: node['data-time']}
+    end
+
+    scrape '.time a' do |node|
+      {permalink: node[:href]}
+    end
+
+    # Each 'match' of the crawler's selector (here: 'li.stream-item') will have the
+    # content from 'scrape' passed in as a hash
+    save do |params|
+      tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
+      tweet.update_attributes params
+    end
+  end
+end
+
+# Start that bot for some sweet victory, heart throb.
+@bot.start!
+```
+
+All your content are belong to us.
+
+
+
+## Stuff I'm going to add
+
+event handlers...
+--------------------
+
+on_http_status(500,404) do |response, bot|
+  email('admin@example.com', bot.status, bot.summary)
+end
+on_assertion_failure{ |response, bot| }
+on_invalid_resource{ |resource, bot| }
+
+Pre-req Steps
+--------------------
+
+prepare [
+  [:GET, 'http://example.com'],
+  [:POST, 'http://example.com/login', {username: 'cory', password: '123456'}],
+]
+
+Page Assertions
+--------------------
+
+assertions do
+  present 'li.offer'
+  present 'h3 a', :present => [:href]
+  within 'li.offer' do
+    present 'h3'
+  end
+
+  scrape 'h3 a' do |node|
+    node.is_a_link_to_someplace_we_like
+  end
+end
+
+Cookie Stuffing
+-------------------
+
+cookies({
+  'Has Fun' => true
+})
data/Rakefile
ADDED
data/klepto.gemspec
ADDED
@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'klepto/version'
+
+Gem::Specification.new do |gem|
+  gem.name          = "klepto"
+  gem.version       = Klepto::VERSION
+  gem.authors       = ["Cory O'Daniel"]
+  gem.email         = ["github@coryodaniel.com"]
+  gem.description   = "Tearing up web pages into ActiveRecord resources"
+  gem.summary       = "Tearing up web pages into ActiveRecord resources"
+  gem.homepage      = "http://github.com/coryodaniel/klepto"
+
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.add_dependency "docile"
+  gem.add_dependency "poltergeist", '1.1.0'
+  gem.add_dependency "capybara", '2.0.2'
+  #gem.add_dependency "thor"
+  gem.add_dependency "nokogiri", '~> 1.5.6'
+  gem.add_dependency "activesupport"
+  gem.add_dependency 'multi_json', '~> 1.0'
+end
data/lib/klepto.rb
ADDED
@@ -0,0 +1,26 @@
+require 'docile'
+require 'open-uri'
+require 'logger'
+require "capybara"
+require "capybara/dsl"
+require 'capybara/poltergeist'
+require 'pp'
+
+Capybara.run_server = false
+
+Capybara.register_driver :poltergeist do |app|
+  Capybara::Poltergeist::Driver.new(app, {
+    js_errors: false
+  })
+end
+Capybara.current_driver = :poltergeist
+
+module Klepto
+  LOG = Logger.new(STDOUT)
+  LOG.level = Logger::WARN
+end
+
+require 'klepto/version'
+require 'klepto/crawler'
+require 'klepto/browser'
+require 'klepto/bot'
data/lib/klepto/bot.rb
ADDED
@@ -0,0 +1,59 @@
+module Klepto
+  class Bot
+    def initialize(*args, &block)
+      @syntax     = :css
+      @is_dry_run = false
+      @urls       = []
+      @crawlers   = []
+      @browser    = Klepto::Browser.new
+      Docile.dsl_eval(self, &block) if block_given?
+    end
+
+    attr_reader :browser, :crawlers
+
+    def dry_run!
+      @is_dry_run = true
+    end
+
+    def dry_run?
+      !!@is_dry_run
+    end
+
+    def syntax(kind=nil)
+      @syntax = kind unless kind.nil?
+      @syntax
+    end
+
+    def headers(_headers)
+      @browser.set_headers(_headers)
+    end
+
+    def url(*args)
+      @urls += args
+    end
+    alias :urls :url
+
+    def crawl(scope, options={}, &block)
+      options[:syntax] = @syntax
+      @crawlers << Klepto::Crawler.new(scope, options, &block)
+    end
+
+    def start!
+      @urls.each do |url|
+        browser.fetch!(url)
+        @crawlers.each do |crawler|
+          crawler.crawl browser.page
+        end
+      end
+
+      @crawlers.each do |crawler|
+        if dry_run?
+          pp crawler.resources
+        else
+          crawler.persist!
+        end
+      end
+    end
+
+  end
+end
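Aside on bot.rb above: `dry_run!` flips a flag so that `start!` pretty-prints each crawler's scraped resources instead of calling `persist!`. A minimal sketch of that mode, reusing the selector and URL from the README example (nothing is written anywhere):

```ruby
require 'klepto'

bot = Klepto::Bot.new do
  syntax :css
  urls 'https://twitter.com/justinbieber'

  crawl 'body' do
    scrape 'h1.fullname', :name
  end
end

bot.dry_run!   # start! will now pp each crawler's resources...
bot.start!     # ...instead of invoking the crawlers' save blocks
```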
data/lib/klepto/crawler.rb
ADDED
@@ -0,0 +1,72 @@
+require 'docile'
+module Klepto
+  class Crawler
+
+    def initialize(scope,options={},&block)
+      @resources    = []
+      @limit        = options[:limit]
+      @skip         = options[:skip]
+      @syntax       = options[:syntax]
+      @scope        = scope
+      @designations = []
+
+      Docile.dsl_eval(self, &block) if block_given?
+    end
+    attr_accessor :resources
+    attr_reader :scope, :syntax
+
+    def scrape(selector=nil, assignee=nil, &block)
+      raise Exception if assignee.nil? && !block_given?
+      raise Exception if !assignee.nil? && block_given?
+      designate(:first, selector, assignee, &block)
+    end
+
+    def scrape_all(selector, assignee=nil, &block)
+      raise Exception if assignee.nil? && !block_given?
+      raise Exception if !assignee.nil? && block_given?
+      designate(:all, selector, assignee, &block)
+    end
+
+    def save(&block)
+      @resource_handler = block
+    end
+
+    def crawl(page)
+      page.all(syntax, scope).each do |selection|
+        params = {}
+        @designations.each do |first_or_all, selector, assignee, handler|
+          if selector.nil?
+            attribs = handler.call selection
+            params.merge!( attribs )
+          elsif first_or_all == :first
+            node = selection.first(syntax, selector)
+            if assignee
+              params[assignee] = node.try(:text)
+            else
+              attribs = handler.call node
+              params.merge!( attribs )
+            end
+          else
+            nodes = selection.all(syntax, selector)
+            attribs = handler.call nodes
+            params.merge!( attribs )
+          end
+        end
+        @resources << params
+      end
+
+      @resources
+    end
+
+    def persist!
+      if @resource_handler
+        @resources.each {|resource| @resource_handler.call(resource)}
+      end
+    end
+
+    protected
+    def designate(count, selector, assignee, &block)
+      @designations << [count, selector, assignee, block]
+    end
+  end
+end
data/lib/klepto/tasks.rb
ADDED
@@ -0,0 +1,15 @@
+# require this file to load the tasks
+require 'rake'
+
+# noop
+=begin
+This is here as a start point for adding rake tasks that can be 'required' by another project
+Just add: require 'klepto/tasks' to your Rakefile
+=end
+
+namespace :klepto do
+  desc "Example task"
+  task :example do
+    puts "I'm a task"
+  end
+end
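The banner comment in tasks.rb says these tasks can be pulled into another project's Rakefile; a minimal sketch of that consuming Rakefile (the host project itself is hypothetical):

```ruby
# Rakefile of a hypothetical project that wants klepto's rake tasks.
require 'klepto/tasks'

# `rake klepto:example` now prints "I'm a task".
```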