klepto 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32):
  1. data/.gitignore +21 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +18 -0
  5. data/Guardfile +11 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +129 -0
  8. data/Rakefile +7 -0
  9. data/klepto.gemspec +26 -0
  10. data/lib/klepto.rb +26 -0
  11. data/lib/klepto/bot.rb +59 -0
  12. data/lib/klepto/browser.rb +18 -0
  13. data/lib/klepto/crawler.rb +72 -0
  14. data/lib/klepto/tasks.rb +15 -0
  15. data/lib/klepto/version.rb +3 -0
  16. data/samples/example.rb +49 -0
  17. data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
  18. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
  19. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
  20. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
  21. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
  22. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
  23. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
  24. data/spec/lib/klepto/bot_spec.rb +40 -0
  25. data/spec/lib/klepto/browser_spec.rb +15 -0
  26. data/spec/lib/klepto/crawler_spec.rb +88 -0
  27. data/spec/lib/klepto/dsl_spec.rb +6 -0
  28. data/spec/lib/klepto_spec.rb +64 -0
  29. data/spec/orm/active_record.rb +36 -0
  30. data/spec/orm/database.example.yml +15 -0
  31. data/spec/spec_helper.rb +32 -0
  32. metadata +157 -0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:03 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:04 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:04 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:03 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:04 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,40 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto::Bot, :vcr => {:record => :new_episodes} do
4
+ before(:each) do
5
+ @bot = Klepto::Bot.new
6
+ end
7
+
8
+ it 'should know if it is a dry run' do
9
+ @bot.dry_run?.should be false
10
+ @bot.dry_run!
11
+ @bot.dry_run?.should be true
12
+ end
13
+
14
+ it 'should be able to set the selection syntax' do
15
+ @bot.syntax(:xpath)
16
+ @bot.syntax.should be(:xpath)
17
+ end
18
+
19
+ it 'should be able to read the selection syntax' do
20
+ @bot.syntax.should be(:css)
21
+ end
22
+
23
+ it 'should be able to set request headers' do
24
+ @bot.should respond_to(:headers)
25
+ end
26
+
27
+ it 'should be able to set a list of URLs to crawl' do
28
+ @bot.url 'http://www.google.com'
29
+ @bot.urls.should include('http://www.google.com')
30
+ @bot.urls 'http://twitter.com', 'http://facebook.com'
31
+ @bot.urls.should include('http://twitter.com')
32
+ @bot.urls.should include('http://facebook.com')
33
+ end
34
+
35
+ it 'should be able to add crawlers' do
36
+ @bot.crawl('div'){}
37
+ @bot.instance_variable_get("@crawlers").should have(1).crawler
38
+ @bot.instance_variable_get("@crawlers").first.should be_kind_of(Klepto::Crawler)
39
+ end
40
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto::Browser, :vcr => {:record => :new_episodes} do
4
+ before(:each) do
5
+ @browser = Klepto::Browser.new
6
+ @browser.set_headers({
7
+ 'Referer' => 'http://www.example.com'
8
+ })
9
+ end
10
+
11
+ it 'should be able to fetch a page' do
12
+ @page = @browser.fetch! 'http://www.example.com'
13
+ @page.status_code.should be(200)
14
+ end
15
+ end
@@ -0,0 +1,88 @@
1
+ require 'spec_helper'
2
+ require 'open-uri'
3
+
4
+ describe Klepto::Crawler, :vcr => {:record => :new_episodes} do
5
+ describe 'dsl interaction' do
6
+ before(:each) do
7
+ @page = page("http://www.iana.org")
8
+ @crawler = Klepto::Crawler.new('body',{:syntax => :css}) do
9
+ scrape 'h1', :title
10
+
11
+ scrape '#intro p' do |node|
12
+ {description: node.text}
13
+ end
14
+
15
+ scrape_all '.home-panel h2' do |nodes|
16
+ { sections: nodes.map{|n| n.text} }
17
+ end
18
+ end
19
+ @resources = @crawler.crawl @page
20
+ end #end before
21
+
22
+ it 'should crawl the resource' do
23
+ @resources.should have(1).resource
24
+ @resources.first[:title].should match('Internet Assigned Numbers Authority')
25
+ @resources.first[:description].should match(/^The Internet Assigned Numbers Authority/i)
26
+ @resources.first[:sections].should have(3).sections
27
+ end
28
+ end
29
+
30
+ describe 'standard interaction' do
31
+ before(:each) do
32
+ @page = page()
33
+ @crawler = Klepto::Crawler.new 'body', {:syntax => :css}
34
+ end
35
+ it 'should have a CSS scope' do
36
+ @crawler.scope.should eq 'body'
37
+ end
38
+ it 'should have a desired syntax' do
39
+ @crawler.syntax.should == :css
40
+ end
41
+
42
+ it 'should be able to scrape the node that the crawler is scoped to' do
43
+ @crawler.scrape do |node|
44
+ {:name => node.native.name}
45
+ end
46
+ resources = @crawler.crawl( @page )
47
+ resources.should have(1).resource
48
+ resources.first[:name].should eq('body')
49
+ end
50
+
51
+ it 'should be able to designate scraping of a single node with a symbol' do
52
+ @crawler.scrape 'h1', :title
53
+ resources = @crawler.crawl( @page )
54
+ resources.should have(1).resource
55
+ resources.first[:title].should eq('Example Domain')
56
+ end
57
+
58
+ it 'should be able to designate scraping of a single node with a block' do
59
+ @crawler.scrape 'h1' do |node|
60
+ {title: node.text}
61
+ end
62
+
63
+ resources = @crawler.crawl( @page )
64
+ resources.should have(1).resource
65
+ resources.first[:title].should eq('Example Domain')
66
+ end
67
+
68
+ it 'should be able to designate scraping of a set of nodes' do
69
+ @crawler.scrape_all 'p' do |nodes|
70
+ {
71
+ paragraphs: [
72
+ nodes.first.text,
73
+ nodes.last.text
74
+ ]
75
+ }
76
+ end
77
+ resources = @crawler.crawl( @page )
78
+ resources.should have(1).resource
79
+ resources.first[:paragraphs].should be_kind_of(Array)
80
+ resources.first[:paragraphs].last.should eq("More information...")
81
+ end
82
+
83
+ pending 'should be able to save a set of resources'
84
+ pending 'should be able to specify a limit'
85
+ pending 'should be able to specify a skip'
86
+ end
87
+
88
+ end
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+
4
+
5
+ describe 'Klepto DSL' do
6
+ end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
+ describe Klepto do
4
+ it "should have a version" do
5
+ Klepto::VERSION.should_not be_nil
6
+ end
7
+ end
8
+
9
+ describe 'Scraping pages', :skip => false do
10
+ before(:each) do
11
+ @bot = Klepto::Bot.new do
12
+ syntax :css
13
+
14
+ headers({
15
+ 'Referer' => 'https://twitter.com',
16
+ 'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22"
17
+ })
18
+
19
+ urls 'https://twitter.com/justinbieber'
20
+
21
+ crawl 'body' do
22
+ scrape "h1.fullname", :name
23
+ scrape '.username span.screen-name', :username
24
+ save do |params|
25
+ user = User.find_by_name(params[:username]) || User.new
26
+ user.update_attributes params
27
+ end
28
+ end
29
+
30
+ crawl 'li.stream-item' do
31
+ scrape do |node|
32
+ {:twitter_id => node['data-item-id']}
33
+ end
34
+
35
+ scrape '.content p', :content
36
+
37
+ scrape '._timestamp' do |node|
38
+ {timestamp: node['data-time']}
39
+ end
40
+
41
+ scrape '.time a' do |node|
42
+ {permalink: node[:href]}
43
+ end
44
+
45
+ save do |params|
46
+ tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
47
+ tweet.update_attributes params
48
+ end
49
+ end
50
+ end
51
+
52
+ @bot.start!
53
+ end
54
+
55
+ it 'should have collected some resources' do
56
+ @bot.crawlers.should have(2).crawlers
57
+ @bot.crawlers.first.resources.should have(1).user
58
+ end
59
+
60
+ it 'should persist resources' do
61
+ User.count.should be(1)
62
+ Tweet.count.should_not be(0)
63
+ end
64
+ end