klepto 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +120 -89
- data/lib/klepto.rb +2 -2
- data/lib/klepto/config.rb +18 -0
- data/lib/klepto/structure.rb +88 -0
- data/lib/klepto/version.rb +1 -1
- data/samples/bieber.html +4859 -0
- data/samples/concept.rb +50 -0
- data/spec/lib/klepto/config_spec.rb +23 -0
- data/spec/lib/klepto/structure_spec.rb +105 -0
- data/spec/lib/klepto_spec.rb +1 -58
- data/spec/orm/active_record.rb +2 -2
- data/spec/spec_helper.rb +1 -1
- metadata +22 -23
- data/lib/klepto/bot.rb +0 -59
- data/lib/klepto/crawler.rb +0 -72
- data/samples/example.rb +0 -49
- data/spec/lib/klepto/bot_spec.rb +0 -40
- data/spec/lib/klepto/crawler_spec.rb +0 -88
- data/spec/lib/klepto/dsl_spec.rb +0 -6
data/lib/klepto/crawler.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
require 'docile'
|
2
|
-
module Klepto
|
3
|
-
class Crawler
|
4
|
-
|
5
|
-
def initialize(scope,options={},&block)
|
6
|
-
@resources = []
|
7
|
-
@limit = options[:limit]
|
8
|
-
@skip = options[:skip]
|
9
|
-
@syntax = options[:syntax]
|
10
|
-
@scope = scope
|
11
|
-
@designations = []
|
12
|
-
|
13
|
-
Docile.dsl_eval(self, &block) if block_given?
|
14
|
-
end
|
15
|
-
attr_accessor :resources
|
16
|
-
attr_reader :scope, :syntax
|
17
|
-
|
18
|
-
def scrape(selector=nil, assignee=nil, &block)
|
19
|
-
raise Exception if assignee.nil? && !block_given?
|
20
|
-
raise Exception if !assignee.nil? && block_given?
|
21
|
-
designate(:first, selector, assignee, &block)
|
22
|
-
end
|
23
|
-
|
24
|
-
def scrape_all(selector, assignee=nil, &block)
|
25
|
-
raise Exception if assignee.nil? && !block_given?
|
26
|
-
raise Exception if !assignee.nil? && block_given?
|
27
|
-
designate(:all, selector, assignee, &block)
|
28
|
-
end
|
29
|
-
|
30
|
-
def save(&block)
|
31
|
-
@resource_handler = block
|
32
|
-
end
|
33
|
-
|
34
|
-
def crawl(page)
|
35
|
-
page.all(syntax, scope).each do |selection|
|
36
|
-
params = {}
|
37
|
-
@designations.each do |first_or_all, selector, assignee, handler|
|
38
|
-
if selector.nil?
|
39
|
-
attribs = handler.call selection
|
40
|
-
params.merge!( attribs )
|
41
|
-
elsif first_or_all == :first
|
42
|
-
node = selection.first(syntax, selector)
|
43
|
-
if assignee
|
44
|
-
params[assignee] = node.try(:text)
|
45
|
-
else
|
46
|
-
attribs = handler.call node
|
47
|
-
params.merge!( attribs )
|
48
|
-
end
|
49
|
-
else
|
50
|
-
nodes = selection.all(syntax, selector)
|
51
|
-
attribs = handler.call nodes
|
52
|
-
params.merge!( attribs )
|
53
|
-
end
|
54
|
-
end
|
55
|
-
@resources << params
|
56
|
-
end
|
57
|
-
|
58
|
-
@resources
|
59
|
-
end
|
60
|
-
|
61
|
-
def persist!
|
62
|
-
if @resource_handler
|
63
|
-
@resources.each {|resource| @resource_handler.call(resource)}
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
protected
|
68
|
-
def designate(count, selector, assignee, &block)
|
69
|
-
@designations << [count, selector, assignee, block]
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
data/samples/example.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
#! /usr/bin/env ruby
|
2
|
-
require 'bundler/setup'
|
3
|
-
require 'klepto'
|
4
|
-
|
5
|
-
@bot = Klepto::Bot.new do
|
6
|
-
syntax :css
|
7
|
-
dry_run!
|
8
|
-
|
9
|
-
headers({
|
10
|
-
'Referer' => 'http://www.twitter.com',
|
11
|
-
'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
|
12
|
-
|
13
|
-
})
|
14
|
-
|
15
|
-
# Lootin' them bieb tweets
|
16
|
-
urls 'https://twitter.com/justinbieber'
|
17
|
-
|
18
|
-
crawl 'body' do
|
19
|
-
scrape "h1.fullname", :name
|
20
|
-
scrape '.username span.screen-name', :username
|
21
|
-
save do |params|
|
22
|
-
user = User.find_by_name(params[:username]) || User.new
|
23
|
-
user.update_attributes params
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
crawl 'li.stream-item' do
|
28
|
-
scrape do |node|
|
29
|
-
{:twitter_id => node['data-item-id']}
|
30
|
-
end
|
31
|
-
|
32
|
-
scrape '.content p', :content
|
33
|
-
|
34
|
-
scrape '._timestamp' do |node|
|
35
|
-
{timestamp: node['data-time']}
|
36
|
-
end
|
37
|
-
|
38
|
-
scrape '.time a' do |node|
|
39
|
-
{permalink: node[:href]}
|
40
|
-
end
|
41
|
-
|
42
|
-
save do |params|
|
43
|
-
tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
|
44
|
-
tweet.update_attributes params
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
@bot.start!
|
data/spec/lib/klepto/bot_spec.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Klepto::Bot, :vcr => {:record => :new_episodes} do
|
4
|
-
before(:each) do
|
5
|
-
@bot = Klepto::Bot.new
|
6
|
-
end
|
7
|
-
|
8
|
-
it 'should know if it is a dry run' do
|
9
|
-
@bot.dry_run?.should be false
|
10
|
-
@bot.dry_run!
|
11
|
-
@bot.dry_run?.should be true
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'should be able to set the selection syntax' do
|
15
|
-
@bot.syntax(:xpath)
|
16
|
-
@bot.syntax.should be(:xpath)
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'should be able to read the selection syntax' do
|
20
|
-
@bot.syntax.should be(:css)
|
21
|
-
end
|
22
|
-
|
23
|
-
it 'should be able to set request headers' do
|
24
|
-
@bot.should respond_to(:headers)
|
25
|
-
end
|
26
|
-
|
27
|
-
it 'should be able to set a list of URLs to crawl' do
|
28
|
-
@bot.url 'http://www.google.com'
|
29
|
-
@bot.urls.should include('http://www.google.com')
|
30
|
-
@bot.urls 'http://twitter.com', 'http://facebook.com'
|
31
|
-
@bot.urls.should include('http://twitter.com')
|
32
|
-
@bot.urls.should include('http://facebook.com')
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'should be able to add crawlers' do
|
36
|
-
@bot.crawl('div'){}
|
37
|
-
@bot.instance_variable_get("@crawlers").should have(1).crawler
|
38
|
-
@bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
|
39
|
-
end
|
40
|
-
end
|
data/spec/lib/klepto/crawler_spec.rb
DELETED
@@ -1,88 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'open-uri'
|
3
|
-
|
4
|
-
describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
|
5
|
-
describe 'dsl interaction' do
|
6
|
-
before(:each) do
|
7
|
-
@page = page("http://www.iana.org")
|
8
|
-
@crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
|
9
|
-
scrape 'h1', :title
|
10
|
-
|
11
|
-
scrape '#intro p' do |node|
|
12
|
-
{description: node.text}
|
13
|
-
end
|
14
|
-
|
15
|
-
scrape_all '.home-panel h2' do |nodes|
|
16
|
-
{ sections: nodes.map{|n| n.text} }
|
17
|
-
end
|
18
|
-
end
|
19
|
-
@resources = @crawler.crawl @page
|
20
|
-
end #end before
|
21
|
-
|
22
|
-
it 'should crawl the resource' do
|
23
|
-
@resources.should have(1).resource
|
24
|
-
@resources.first[:title].should match('Internet Assigned Numbers Authority')
|
25
|
-
@resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
|
26
|
-
@resources.first[:sections].should have(3).sections
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
describe 'standard interaction' do
|
31
|
-
before(:each) do
|
32
|
-
@page = page()
|
33
|
-
@crawler = Klepto::Crawler.new 'body', {:syntax => :css}
|
34
|
-
end
|
35
|
-
it 'should have a CSS scope' do
|
36
|
-
@crawler.scope.should eq 'body'
|
37
|
-
end
|
38
|
-
it 'should have a desired syntax' do
|
39
|
-
@crawler.syntax.should == :css
|
40
|
-
end
|
41
|
-
|
42
|
-
it 'should be able to scrape the node that the crawler is scoped to' do
|
43
|
-
@crawler.scrape do |node|
|
44
|
-
{:name => node.native.name}
|
45
|
-
end
|
46
|
-
resources = @crawler.crawl( @page )
|
47
|
-
resources.should have(1).resource
|
48
|
-
resources.first[:name].should eq('body')
|
49
|
-
end
|
50
|
-
|
51
|
-
it 'should be able to designate scraping of a single node with a symbol' do
|
52
|
-
@crawler.scrape 'h1', :title
|
53
|
-
resources = @crawler.crawl( @page )
|
54
|
-
resources.should have(1).resource
|
55
|
-
resources.first[:title].should eq('Example Domain')
|
56
|
-
end
|
57
|
-
|
58
|
-
it 'should be able to designate scraping of a single node with a block' do
|
59
|
-
@crawler.scrape 'h1' do |node|
|
60
|
-
{title: node.text}
|
61
|
-
end
|
62
|
-
|
63
|
-
resources = @crawler.crawl( @page )
|
64
|
-
resources.should have(1).resource
|
65
|
-
resources.first[:title].should eq('Example Domain')
|
66
|
-
end
|
67
|
-
|
68
|
-
it 'should be able to designate scraping of a set of nodes' do
|
69
|
-
@crawler.scrape_all 'p' do |nodes|
|
70
|
-
{
|
71
|
-
paragraphs: [
|
72
|
-
nodes.first.text,
|
73
|
-
nodes.last.text
|
74
|
-
]
|
75
|
-
}
|
76
|
-
end
|
77
|
-
resources = @crawler.crawl( @page )
|
78
|
-
resources.should have(1).resource
|
79
|
-
resources.first[:paragraphs].should be_kind_of(Array)
|
80
|
-
resources.first[:paragraphs].last.should eq("More information...")
|
81
|
-
end
|
82
|
-
|
83
|
-
pending 'should be able to save a set of resources'
|
84
|
-
pending 'should be able to specify a limit'
|
85
|
-
pending 'should be able to specify a skip'
|
86
|
-
end
|
87
|
-
|
88
|
-
end
|