klepto 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. data/.gitignore +21 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +18 -0
  5. data/Guardfile +11 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +129 -0
  8. data/Rakefile +7 -0
  9. data/klepto.gemspec +26 -0
  10. data/lib/klepto.rb +26 -0
  11. data/lib/klepto/bot.rb +59 -0
  12. data/lib/klepto/browser.rb +18 -0
  13. data/lib/klepto/crawler.rb +72 -0
  14. data/lib/klepto/tasks.rb +15 -0
  15. data/lib/klepto/version.rb +3 -0
  16. data/samples/example.rb +49 -0
  17. data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
  18. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
  19. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
  20. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
  21. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
  22. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
  23. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
  24. data/spec/lib/klepto/bot_spec.rb +40 -0
  25. data/spec/lib/klepto/browser_spec.rb +15 -0
  26. data/spec/lib/klepto/crawler_spec.rb +88 -0
  27. data/spec/lib/klepto/dsl_spec.rb +6 -0
  28. data/spec/lib/klepto_spec.rb +64 -0
  29. data/spec/orm/active_record.rb +36 -0
  30. data/spec/orm/database.example.yml +15 -0
  31. data/spec/spec_helper.rb +32 -0
  32. metadata +157 -0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:05 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:05 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:05 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:05 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:05 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:05 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:05 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 06:51:05 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 06:51:05 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 06:51:04 GMT
114
+ recorded_with: VCR 2.4.0
@@ -0,0 +1,114 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://example.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ accept:
11
+ - ! '*/*'
12
+ user-agent:
13
+ - Ruby
14
+ response:
15
+ status:
16
+ code: 302
17
+ message: Found
18
+ headers:
19
+ location:
20
+ - http://www.iana.org/domains/example/
21
+ server:
22
+ - BigIP
23
+ connection:
24
+ - Keep-Alive
25
+ content-length:
26
+ - '0'
27
+ body:
28
+ encoding: US-ASCII
29
+ string: ''
30
+ http_version: '1.0'
31
+ recorded_at: Wed, 10 Apr 2013 07:08:32 GMT
32
+ - request:
33
+ method: get
34
+ uri: http://www.iana.org/domains/example/
35
+ body:
36
+ encoding: US-ASCII
37
+ string: ''
38
+ headers:
39
+ accept:
40
+ - ! '*/*'
41
+ user-agent:
42
+ - Ruby
43
+ response:
44
+ status:
45
+ code: 302
46
+ message: FOUND
47
+ headers:
48
+ date:
49
+ - Wed, 10 Apr 2013 07:08:35 GMT
50
+ server:
51
+ - Apache/2.2.3 (CentOS)
52
+ location:
53
+ - http://www.iana.org/domains/example
54
+ content-length:
55
+ - '0'
56
+ connection:
57
+ - close
58
+ content-type:
59
+ - text/html; charset=utf-8
60
+ body:
61
+ encoding: US-ASCII
62
+ string: ''
63
+ http_version: '1.1'
64
+ recorded_at: Wed, 10 Apr 2013 07:08:35 GMT
65
+ - request:
66
+ method: get
67
+ uri: http://www.iana.org/domains/example
68
+ body:
69
+ encoding: US-ASCII
70
+ string: ''
71
+ headers:
72
+ accept:
73
+ - ! '*/*'
74
+ user-agent:
75
+ - Ruby
76
+ response:
77
+ status:
78
+ code: 200
79
+ message: OK
80
+ headers:
81
+ date:
82
+ - Wed, 10 Apr 2013 07:08:36 GMT
83
+ server:
84
+ - Apache/2.2.3 (CentOS)
85
+ last-modified:
86
+ - Fri, 04 Jan 2013 01:17:22 GMT
87
+ vary:
88
+ - Accept-Encoding
89
+ connection:
90
+ - close
91
+ transfer-encoding:
92
+ - chunked
93
+ content-type:
94
+ - text/html; charset=UTF-8
95
+ body:
96
+ encoding: US-ASCII
97
+ string: ! "<!doctype html>\n<html>\n<head>\n\t<title>Example Domain</title>\n\n\t<meta
98
+ charset=\"utf-8\" />\n\t<meta http-equiv=\"Content-type\" content=\"text/html;
99
+ charset=utf-8\" />\n\t<meta name=\"viewport\" content=\"width=device-width,
100
+ initial-scale=1\" />\n\t<style type=\"text/css\">\n\tbody {\n\t\tbackground-color:
101
+ #f0f0f2;\n\t\tmargin: 0;\n\t\tpadding: 0;\n\t\tfont-family: \"Open Sans\",
102
+ \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n\t\t\n\t}\n\tdiv {\n\t\twidth:
103
+ 600px;\n\t\tmargin: 5em auto;\n\t\tpadding: 3em;\n\t\tbackground-color: #fff;\n\t\tborder-radius:
104
+ 1em;\n\t}\n\ta:link, a:visited {\n\t\tcolor: #38488f;\n\t\ttext-decoration:
105
+ none;\n\t}\n\t@media (max-width: 600px) {\n\t\tbody {\n\t\t\tbackground-color:
106
+ #fff;\n\t\t}\n\t\tdiv {\n\t\t\twidth: auto;\n\t\t\tmargin: 0 auto;\n\t\t\tborder-radius:
107
+ 0;\n\t\t\tpadding: 1em;\n\t\t}\n\t}\n\t</style>\t\n</head>\n\n<body>\n<div>\n\t<h1>Example
108
+ Domain</h1>\n\t<p>This domain is established to be used for illustrative examples
109
+ in documents. You do not need to\n\t\tcoordinate or ask for permission to
110
+ use this domain in examples, and it is not available for\n\t\tregistration.</p>\n\t<p><a
111
+ href=\"http://www.iana.org/domains/special\">More information...</a></p>\n</div>\n</body>\n</html>\n"
112
+ http_version: '1.1'
113
+ recorded_at: Wed, 10 Apr 2013 07:08:36 GMT
114
+ recorded_with: VCR 2.4.0