klepto 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32)
  1. data/.gitignore +21 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +18 -0
  5. data/Guardfile +11 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +129 -0
  8. data/Rakefile +7 -0
  9. data/klepto.gemspec +26 -0
  10. data/lib/klepto.rb +26 -0
  11. data/lib/klepto/bot.rb +59 -0
  12. data/lib/klepto/browser.rb +18 -0
  13. data/lib/klepto/crawler.rb +72 -0
  14. data/lib/klepto/tasks.rb +15 -0
  15. data/lib/klepto/version.rb +3 -0
  16. data/samples/example.rb +49 -0
  17. data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
  18. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
  19. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
  20. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
  21. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
  22. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
  23. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
  24. data/spec/lib/klepto/bot_spec.rb +40 -0
  25. data/spec/lib/klepto/browser_spec.rb +15 -0
  26. data/spec/lib/klepto/crawler_spec.rb +88 -0
  27. data/spec/lib/klepto/dsl_spec.rb +6 -0
  28. data/spec/lib/klepto_spec.rb +64 -0
  29. data/spec/orm/active_record.rb +36 -0
  30. data/spec/orm/database.example.yml +15 -0
  31. data/spec/spec_helper.rb +32 -0
  32. metadata +157 -0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:03 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:04 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:04 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:04 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto::Bot, :vcr => {:record => :new_episodes} do
4
+ before(:each) do
5
+ @bot = Klepto::Bot.new
6
+ end
7
+
8
+ it 'should know if it is a dry run' do
9
+ @bot.dry_run?.should be false
10
+ @bot.dry_run!
11
+ @bot.dry_run?.should be true
12
+ end
13
+
14
+ it 'should be able to set the selection syntax' do
15
+ @bot.syntax(:xpath)
16
+ @bot.syntax.should be(:xpath)
17
+ end
18
+
19
+ it 'should be able to read the selection syntax' do
20
+ @bot.syntax.should be(:css)
21
+ end
22
+
23
+ it 'should be able to set request headers' do
24
+ @bot.should respond_to(:headers)
25
+ end
26
+
27
+ it 'should be able to set a list of URLs to crawl' do
28
+ @bot.url 'http://www.google.com'
29
+ @bot.urls.should include('http://www.google.com')
30
+ @bot.urls 'http://twitter.com', 'http://facebook.com'
31
+ @bot.urls.should include('http://twitter.com')
32
+ @bot.urls.should include('http://facebook.com')
33
+ end
34
+
35
+ it 'should be able to add crawlers' do
36
+ @bot.crawl('div'){}
37
+ @bot.instance_variable_get("@crawlers").should have(1).crawler
38
+ @bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
39
+ end
40
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto::Browser, :vcr => {:record => :new_episodes} do
4
+ before(:each) do
5
+ @browser = Klepto::Browser.new
6
+ @browser.set_headers({
7
+ 'Referer' => 'http://www.example.com'
8
+ })
9
+ end
10
+
11
+ it 'should be able to fetch a page' do
12
+ @page = @browser.fetch! 'http://www.example.com'
13
+ @page.status_code.should be(200)
14
+ end
15
+ end
@@ -0,0 +1,88 @@
1
+ require 'spec_helper'
2
+ require 'open-uri'
3
+
4
+ describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
5
+ describe 'dsl interaction' do
6
+ before(:each) do
7
+ @page = page("http://www.iana.org")
8
+ @crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
9
+ scrape 'h1', :title
10
+
11
+ scrape '#intro p' do |node|
12
+ {description: node.text}
13
+ end
14
+
15
+ scrape_all '.home-panel h2' do |nodes|
16
+ { sections: nodes.map{|n| n.text} }
17
+ end
18
+ end
19
+ @resources = @crawler.crawl @page
20
+ end #end before
21
+
22
+ it 'should crawl the resource' do
23
+ @resources.should have(1).resource
24
+ @resources.first[:title].should match('Internet Assigned Numbers Authority')
25
+ @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
26
+ @resources.first[:sections].should have(3).sections
27
+ end
28
+ end
29
+
30
+ describe 'standard interaction' do
31
+ before(:each) do
32
+ @page = page()
33
+ @crawler = Klepto::Crawler.new 'body', {:syntax => :css}
34
+ end
35
+ it 'should have a CSS scope' do
36
+ @crawler.scope.should eq 'body'
37
+ end
38
+ it 'should have a desired syntax' do
39
+ @crawler.syntax.should == :css
40
+ end
41
+
42
+ it 'should be able to scrape the node that the crawler is scoped to' do
43
+ @crawler.scrape do |node|
44
+ {:name => node.native.name}
45
+ end
46
+ resources = @crawler.crawl( @page )
47
+ resources.should have(1).resource
48
+ resources.first[:name].should eq('body')
49
+ end
50
+
51
+ it 'should be able to designate scraping of a single node with a symbol' do
52
+ @crawler.scrape 'h1', :title
53
+ resources = @crawler.crawl( @page )
54
+ resources.should have(1).resource
55
+ resources.first[:title].should eq('Example Domain')
56
+ end
57
+
58
+ it 'should be able to designate scraping of a single node with a block' do
59
+ @crawler.scrape 'h1' do |node|
60
+ {title: node.text}
61
+ end
62
+
63
+ resources = @crawler.crawl( @page )
64
+ resources.should have(1).resource
65
+ resources.first[:title].should eq('Example Domain')
66
+ end
67
+
68
+ it 'should be able to designate scraping of a set of nodes' do
69
+ @crawler.scrape_all 'p' do |nodes|
70
+ {
71
+ paragraphs: [
72
+ nodes.first.text,
73
+ nodes.last.text
74
+ ]
75
+ }
76
+ end
77
+ resources = @crawler.crawl( @page )
78
+ resources.should have(1).resource
79
+ resources.first[:paragraphs].should be_kind_of(Array)
80
+ resources.first[:paragraphs].last.should eq("More information...")
81
+ end
82
+
83
+ pending 'should be able to save a set of resources'
84
+ pending 'should be able to specify a limit'
85
+ pending 'should be able to specify a skip'
86
+ end
87
+
88
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+
4
+
5
+ describe 'Klepto DSL' do
6
+ end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto do
4
+ it "should have a version" do
5
+ Klepto::VERSION.should_not be_nil
6
+ end
7
+ end
8
+
9
+ describe 'Scraping pages', :skip => false do
10
+ before(:each) do
11
+ @bot = Klepto::Bot.new do
12
+ syntax :css
13
+
14
+ headers({
15
+ 'Referer' => 'https://twitter.com',
16
+ 'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
17
+ })
18
+
19
+ urls 'https://twitter.com/justinbieber'
20
+
21
+ crawl 'body' do
22
+ scrape "h1.fullname", :name
23
+ scrape '.username span.screen-name', :username
24
+ save do |params|
25
+ user = User.find_by_name(params[:username]) || User.new
26
+ user.update_attributes params
27
+ end
28
+ end
29
+
30
+ crawl 'li.stream-item' do
31
+ scrape do |node|
32
+ {:twitter_id => node['data-item-id']}
33
+ end
34
+
35
+ scrape '.content p', :content
36
+
37
+ scrape '._timestamp' do |node|
38
+ {timestamp: node['data-time']}
39
+ end
40
+
41
+ scrape '.time a' do |node|
42
+ {permalink: node[:href]}
43
+ end
44
+
45
+ save do |params|
46
+ tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
47
+ tweet.update_attributes params
48
+ end
49
+ end
50
+ end
51
+
52
+ @bot.start!
53
+ end
54
+
55
+ it 'should have collected some resources' do
56
+ @bot.crawlers.should have(2).crawlers
57
+ @bot.crawlers.first.resources.should have(1).user
58
+ end
59
+
60
+ it 'should persist resources' do
61
+ User.count.should be(1)
62
+ Tweet.count.should_not be(0)
63
+ end
64
+ end