wombat 2.4.0 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile.lock +52 -50
- data/README.md +4 -4
- data/VERSION +1 -1
- data/examples/iterator.rb +25 -25
- data/examples/list.rb +22 -22
- data/examples/xml.rb +15 -15
- data/fixtures/vcr_cassettes/make_post_request.yml +69 -0
- data/lib/wombat/crawler.rb +13 -7
- data/lib/wombat/dsl/metadata.rb +8 -0
- data/lib/wombat/processing/parser.rb +22 -7
- data/lib/wombat/property/locators/factory.rb +28 -28
- data/lib/wombat/property/locators/follow.rb +15 -38
- data/lib/wombat/property/locators/list.rb +13 -13
- data/lib/wombat.rb +8 -8
- data/spec/crawler_spec.rb +64 -42
- data/spec/integration/crawler_inheritance_spec.rb +63 -0
- data/spec/integration/integration_spec.rb +30 -0
- data/spec/processing/parser_spec.rb +35 -0
- data/spec/property/locators/factory_spec.rb +14 -14
- data/spec/property/locators/html_spec.rb +18 -18
- data/spec/property/locators/list_spec.rb +8 -8
- data/spec/property/locators/text_spec.rb +44 -44
- data/spec/wombat_spec.rb +36 -36
- data/wombat.gemspec +7 -3
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db6d749bf5076bdc369503d50594f59eac418c56
|
4
|
+
data.tar.gz: 5f39e89a2a7f6de5d23ade5ee835af4268904aa4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4e2da502d91d6557f777685b0935b14e497b3a63f4b2f0f07dbb3355135b4eaac38ca23b9e87fe00c12f0a8869578ecf950551aceea970b8c48037c40c392ab
|
7
|
+
data.tar.gz: 8eb05ced9d437c33a4aa5dab76b4d3f23ee63b6a0b817358242bd9a9eed3002dd7fa641e54020f65d21a2e27ed65c7104f49a3507ddd708732b621fa0c0741db
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
### version 2.5.0 (2016-01-26)
|
2
|
+
|
3
|
+
* Updates gem dependencies
|
4
|
+
* [PR #52](https://github.com/felipecsl/wombat/pull/52) Allow passing the URL as the `Wombat#crawl` argument
|
5
|
+
* [PR #51](https://github.com/felipecsl/wombat/pull/51) Allow crawler classes inheritance
|
6
|
+
* [PR #50](https://github.com/felipecsl/wombat/pull/50) Add HTTP methods support (`POST`, `PUT`, `HEAD`, etc)
|
7
|
+
|
8
|
+
### version 2.4.0
|
9
|
+
|
10
|
+
* Updates gem dependencies
|
11
|
+
* [Adds `user_agent` and `user_agent_alias` config options to `Wombat.configure`](https://github.com/felipecsl/wombat/pull/45)
|
12
|
+
|
13
|
+
### version 2.3.0
|
14
|
+
|
15
|
+
* Updates gem dependencies
|
16
|
+
* [Adds content-type=text/html header to Mechanize if missing](https://github.com/felipecsl/wombat/pull/40)
|
17
|
+
* [Retry page.click on relative links](https://github.com/felipecsl/wombat/pull/32)
|
18
|
+
|
19
|
+
### version 2.2.1
|
20
|
+
|
21
|
+
* Adds ability to crawl a prefetched Mechanize page (thanks to @dsjbirch)
|
22
|
+
|
23
|
+
### version 2.1.2
|
24
|
+
|
25
|
+
* Added support for hash-based property selectors (e.g. `css: '.header'` instead of `'css=.header'`)
|
26
|
+
|
27
|
+
### version 2.1.1
|
28
|
+
|
29
|
+
* Updated gem dependencies
|
30
|
+
|
31
|
+
### version 2.1.0
|
32
|
+
|
33
|
+
* [Added header properties](https://github.com/felipecsl/wombat/pull/11) (thanks to @kdridi)
|
34
|
+
* [Fixed bug in selectors that used XPath functions like `concat`](https://github.com/felipecsl/wombat/pull/10) (thanks to @viniciusdaniel)
|
35
|
+
|
36
|
+
### version 2.0.1
|
37
|
+
|
38
|
+
* Added proxy settings configuration (thanks to @phortx)
|
39
|
+
* Fixed minor bug in HTML property locator
|
40
|
+
|
41
|
+
### version 2.0.0
|
42
|
+
|
43
|
+
This version contains some breaking changes (not backwards compatible), most notably to `for_each` that is now specified through the option `:iterator` and nested block parameters that are gone.
|
44
|
+
|
45
|
+
* Added syntactic sugar methods `Wombat.scrape` and `Crawler#scrape` that alias to their respective `crawl` method implementation;
|
46
|
+
* Gem internals underwent a major refactoring that removed code duplication;
|
47
|
+
* DSL syntax simplified for nested properties. Now the nested block takes **no arguments**;
|
48
|
+
* DSL syntax changed for iterated properties. Iterators can now be named just like other properties and won't be automatically named as `iterator#{i}` anymore. Specified through the `:iterator` option;
|
49
|
+
* `Crawler#list_page` is now called `Crawler#path`;
|
50
|
+
* Added new `:follow` property type that crawls links in pages.
|
51
|
+
|
52
|
+
### version 1.0.0
|
53
|
+
|
54
|
+
* **Breaking change**: `Metadata#format` renamed to `Metadata#document_format` due to method name clash with [Kernel#format](http://www.ruby-doc.org/core-1.9.3/Kernel.html#method-i-format)
|
55
|
+
|
56
|
+
### version 0.5.0
|
57
|
+
|
58
|
+
* [Fixed a bug on malformed selectors](https://github.com/felipecsl/wombat/commit/e0f4eec20e1e2bb07a1813a1edd019933edeceaa)
|
59
|
+
* [Fixed a bug where multiple calls to #crawl would not clean up previously iterated array results and yield repeated results](https://github.com/felipecsl/wombat/commit/40b09a5bf8b9ba08aa51b6f41f706b7c3c4e4252)
|
60
|
+
|
61
|
+
### version 0.4.0
|
62
|
+
|
63
|
+
* Added utility method `Wombat.crawl` that eliminates the need to have a ruby class instance to use Wombat. Now you can use just `Wombat.crawl` and start working. The class based format still works as before though.
|
64
|
+
|
65
|
+
### version 0.3.1
|
66
|
+
|
67
|
+
* Added the ability to provide a block to Crawler#crawl and override the default crawler properties for a one-off run (thanks to @danielnc)
|
data/Gemfile.lock
CHANGED
@@ -1,40 +1,40 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
activesupport (4.2.
|
4
|
+
activesupport (4.2.5.1)
|
5
5
|
i18n (~> 0.7)
|
6
6
|
json (~> 1.7, >= 1.7.7)
|
7
7
|
minitest (~> 5.1)
|
8
8
|
thread_safe (~> 0.3, >= 0.3.4)
|
9
9
|
tzinfo (~> 1.1)
|
10
|
-
addressable (2.
|
10
|
+
addressable (2.4.0)
|
11
11
|
builder (3.2.2)
|
12
|
-
coveralls (0.
|
13
|
-
|
12
|
+
coveralls (0.8.10)
|
13
|
+
json (~> 1.8)
|
14
14
|
rest-client (>= 1.6.8, < 2)
|
15
|
-
simplecov (~> 0.
|
15
|
+
simplecov (~> 0.11.0)
|
16
16
|
term-ansicolor (~> 1.3)
|
17
17
|
thor (~> 0.19.1)
|
18
|
+
tins (~> 1.6.0)
|
18
19
|
descendants_tracker (0.0.4)
|
19
20
|
thread_safe (~> 0.3, >= 0.3.1)
|
20
21
|
diff-lcs (1.2.5)
|
21
22
|
docile (1.1.5)
|
22
|
-
domain_name (0.5.
|
23
|
+
domain_name (0.5.25)
|
23
24
|
unf (>= 0.0.5, < 1.0.0)
|
24
25
|
fakeweb (1.3.0)
|
25
|
-
faraday (0.9.
|
26
|
+
faraday (0.9.2)
|
26
27
|
multipart-post (>= 1.2, < 3)
|
27
28
|
git (1.2.9.1)
|
28
|
-
github_api (0.
|
29
|
-
addressable (~> 2.
|
29
|
+
github_api (0.13.1)
|
30
|
+
addressable (~> 2.4.0)
|
30
31
|
descendants_tracker (~> 0.0.4)
|
31
32
|
faraday (~> 0.8, < 0.10)
|
32
|
-
hashie (>= 3.
|
33
|
+
hashie (>= 3.4)
|
33
34
|
multi_json (>= 1.7.5, < 2.0)
|
34
|
-
nokogiri (~> 1.6.3)
|
35
35
|
oauth2
|
36
|
-
hashie (3.4.
|
37
|
-
highline (1.7.
|
36
|
+
hashie (3.4.3)
|
37
|
+
highline (1.7.8)
|
38
38
|
http-cookie (1.0.2)
|
39
39
|
domain_name (~> 0.5)
|
40
40
|
i18n (0.7.0)
|
@@ -47,28 +47,28 @@ GEM
|
|
47
47
|
nokogiri (>= 1.5.10)
|
48
48
|
rake
|
49
49
|
rdoc
|
50
|
-
json (1.8.
|
51
|
-
jwt (1.
|
52
|
-
mechanize (2.7.
|
50
|
+
json (1.8.3)
|
51
|
+
jwt (1.5.2)
|
52
|
+
mechanize (2.7.4)
|
53
53
|
domain_name (~> 0.5, >= 0.5.1)
|
54
54
|
http-cookie (~> 1.0)
|
55
|
-
mime-types (
|
55
|
+
mime-types (>= 1.17.2, < 3)
|
56
56
|
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
57
57
|
net-http-persistent (~> 2.5, >= 2.5.2)
|
58
|
-
nokogiri (~> 1.
|
58
|
+
nokogiri (~> 1.6)
|
59
59
|
ntlm-http (~> 0.1, >= 0.1.1)
|
60
60
|
webrobots (>= 0.0.9, < 0.2)
|
61
|
-
mime-types (2.
|
62
|
-
|
63
|
-
minitest (5.
|
64
|
-
multi_json (1.11.
|
61
|
+
mime-types (2.99)
|
62
|
+
mini_portile2 (2.0.0)
|
63
|
+
minitest (5.8.4)
|
64
|
+
multi_json (1.11.2)
|
65
65
|
multi_xml (0.5.5)
|
66
66
|
multipart-post (2.0.0)
|
67
67
|
net-http-digest_auth (1.4)
|
68
68
|
net-http-persistent (2.9.4)
|
69
|
-
netrc (0.
|
70
|
-
nokogiri (1.6.
|
71
|
-
|
69
|
+
netrc (0.11.0)
|
70
|
+
nokogiri (1.6.7.2)
|
71
|
+
mini_portile2 (~> 2.0.0.rc2)
|
72
72
|
ntlm-http (0.1.1)
|
73
73
|
oauth2 (1.0.0)
|
74
74
|
faraday (>= 0.8, < 0.10)
|
@@ -76,42 +76,44 @@ GEM
|
|
76
76
|
multi_json (~> 1.3)
|
77
77
|
multi_xml (~> 0.5)
|
78
78
|
rack (~> 1.2)
|
79
|
-
rack (1.6.
|
80
|
-
rake (10.
|
81
|
-
rdoc (4.2.
|
82
|
-
|
79
|
+
rack (1.6.4)
|
80
|
+
rake (10.5.0)
|
81
|
+
rdoc (4.2.1)
|
82
|
+
json (~> 1.4)
|
83
|
+
rest-client (1.8.0)
|
84
|
+
http-cookie (>= 1.0.2, < 2.0)
|
83
85
|
mime-types (>= 1.16, < 3.0)
|
84
86
|
netrc (~> 0.7)
|
85
|
-
rspec (3.
|
86
|
-
rspec-core (~> 3.
|
87
|
-
rspec-expectations (~> 3.
|
88
|
-
rspec-mocks (~> 3.
|
89
|
-
rspec-core (3.2
|
90
|
-
rspec-support (~> 3.
|
91
|
-
rspec-expectations (3.
|
87
|
+
rspec (3.4.0)
|
88
|
+
rspec-core (~> 3.4.0)
|
89
|
+
rspec-expectations (~> 3.4.0)
|
90
|
+
rspec-mocks (~> 3.4.0)
|
91
|
+
rspec-core (3.4.2)
|
92
|
+
rspec-support (~> 3.4.0)
|
93
|
+
rspec-expectations (3.4.0)
|
92
94
|
diff-lcs (>= 1.2.0, < 2.0)
|
93
|
-
rspec-support (~> 3.
|
94
|
-
rspec-mocks (3.
|
95
|
+
rspec-support (~> 3.4.0)
|
96
|
+
rspec-mocks (3.4.1)
|
95
97
|
diff-lcs (>= 1.2.0, < 2.0)
|
96
|
-
rspec-support (~> 3.
|
97
|
-
rspec-support (3.
|
98
|
-
simplecov (0.
|
98
|
+
rspec-support (~> 3.4.0)
|
99
|
+
rspec-support (3.4.1)
|
100
|
+
simplecov (0.11.1)
|
99
101
|
docile (~> 1.1.0)
|
100
|
-
|
101
|
-
simplecov-html (~> 0.
|
102
|
-
simplecov-html (0.
|
103
|
-
term-ansicolor (1.3.
|
102
|
+
json (~> 1.8)
|
103
|
+
simplecov-html (~> 0.10.0)
|
104
|
+
simplecov-html (0.10.0)
|
105
|
+
term-ansicolor (1.3.2)
|
104
106
|
tins (~> 1.0)
|
105
107
|
thor (0.19.1)
|
106
|
-
thread_safe (0.3.
|
107
|
-
tins (1.
|
108
|
+
thread_safe (0.3.5)
|
109
|
+
tins (1.6.0)
|
108
110
|
tzinfo (1.2.2)
|
109
111
|
thread_safe (~> 0.1)
|
110
112
|
unf (0.1.4)
|
111
113
|
unf_ext
|
112
|
-
unf_ext (0.0.
|
113
|
-
vcr (
|
114
|
-
webrobots (0.1.
|
114
|
+
unf_ext (0.0.7.1)
|
115
|
+
vcr (3.0.1)
|
116
|
+
webrobots (0.1.2)
|
115
117
|
yard (0.8.7.6)
|
116
118
|
|
117
119
|
PLATFORMS
|
data/README.md
CHANGED
@@ -12,11 +12,11 @@ Web scraper with an elegant DSL that parses structured data from web pages.
|
|
12
12
|
|
13
13
|
## Usage:
|
14
14
|
|
15
|
-
|
15
|
+
`gem install wombat`
|
16
16
|
|
17
17
|
## Scraping a page:
|
18
18
|
|
19
|
-
The simplest way to use Wombat is by calling
|
19
|
+
The simplest way to use Wombat is by calling `Wombat.crawl` and passing it a block:
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
require 'wombat'
|
@@ -40,11 +40,11 @@ Wombat.crawl do
|
|
40
40
|
blog css: '.blog'
|
41
41
|
end
|
42
42
|
end
|
43
|
-
|
43
|
+
```
|
44
44
|
|
45
45
|
###### The code above is gonna return the following hash:
|
46
46
|
|
47
|
-
|
47
|
+
```ruby
|
48
48
|
{
|
49
49
|
"headline"=>"Build software better, together.",
|
50
50
|
"subheading"=>"Powerful collaboration, code review, and code management for open source and private projects. Need private repositories? Upgraded plans start at $7/mo.",
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.5.0
|
data/examples/iterator.rb
CHANGED
@@ -17,30 +17,30 @@ end
|
|
17
17
|
p IteratorCrawler.new.crawl
|
18
18
|
{"repos"=>
|
19
19
|
[
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
},
|
24
|
-
{
|
25
|
-
"repo"=>"ZeitOnline / briefkasten",
|
26
|
-
"description"=>"a reasonably secure web application for submitting content anonymously"
|
27
|
-
},
|
28
|
-
{
|
29
|
-
"repo"=>"nothingmagical / cheddar-ios",
|
30
|
-
"description"=>"Cheddar for iOS"
|
31
|
-
},
|
32
|
-
{
|
33
|
-
"repo"=>"nathanmarz / storm-mesos",
|
34
|
-
"description"=>"Run Storm on top of the Mesos cluster resource manager"
|
35
|
-
},
|
36
|
-
{
|
37
|
-
"repo"=>"Netflix / SimianArmy",
|
38
|
-
"description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
|
20
|
+
{
|
21
|
+
"repo"=>"bernii / gauge.js",
|
22
|
+
"description"=>"100% native and cool looking JavaScript gauge"
|
39
23
|
},
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
24
|
+
{
|
25
|
+
"repo"=>"ZeitOnline / briefkasten",
|
26
|
+
"description"=>"a reasonably secure web application for submitting content anonymously"
|
27
|
+
},
|
28
|
+
{
|
29
|
+
"repo"=>"nothingmagical / cheddar-ios",
|
30
|
+
"description"=>"Cheddar for iOS"
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"repo"=>"nathanmarz / storm-mesos",
|
34
|
+
"description"=>"Run Storm on top of the Mesos cluster resource manager"
|
35
|
+
},
|
36
|
+
{
|
37
|
+
"repo"=>"Netflix / SimianArmy",
|
38
|
+
"description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"repo"=>nil,
|
42
|
+
"description"=>nil
|
43
|
+
}
|
44
|
+
]
|
45
45
|
}
|
46
|
-
=end
|
46
|
+
=end
|
data/examples/list.rb
CHANGED
@@ -17,28 +17,28 @@ end
|
|
17
17
|
=begin
|
18
18
|
pp ListCrawler.new.crawl
|
19
19
|
{
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
20
|
+
"gems"=>{
|
21
|
+
"new"=>[
|
22
|
+
"buffer (0.0.1)",
|
23
|
+
"resque-telework (0.2.0)",
|
24
|
+
"my_string_extend_lyk (0.0.1)",
|
25
|
+
"specr (0.0.1)",
|
26
|
+
"array-frequency (1.0.0)"
|
27
|
+
],
|
28
|
+
"most_downloaded"=> [
|
29
|
+
"rake-0.9.2.2 (7,128)",
|
30
|
+
"mime-types-1.19 (5,331)",
|
31
|
+
"tilt-1.3.3 (5,146)",
|
32
|
+
"rack-1.4.1 (5,124)",
|
33
|
+
"multi_json-1.3.6 (5,093)"
|
34
|
+
],
|
35
|
+
"just_updated"=>[
|
36
|
+
"wombat (2.0.0)",
|
37
|
+
"pdf-reader-turtletext (0.2.1)",
|
38
|
+
"minitest-reporters (0.10.0)",
|
39
|
+
"cloudprint (0.1.3)",
|
40
|
+
"greenletters (0.2.0)"
|
41
41
|
]
|
42
42
|
}
|
43
43
|
}
|
44
|
-
=end
|
44
|
+
=end
|
data/examples/xml.rb
CHANGED
@@ -6,7 +6,7 @@ class XmlCrawler
|
|
6
6
|
|
7
7
|
base_url "http://ws.audioscrobbler.com"
|
8
8
|
path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=ENV['API_KEY']"
|
9
|
-
|
9
|
+
|
10
10
|
document_format :xml
|
11
11
|
|
12
12
|
title "xpath=//event/title"
|
@@ -21,18 +21,18 @@ end
|
|
21
21
|
pp XmlCrawler.new.crawl
|
22
22
|
|
23
23
|
{
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
24
|
+
"title"=>"Sinéad O'Connor",
|
25
|
+
"locations"=>[
|
26
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
27
|
+
{"latitude"=>"37.76213", "longitude"=>"-122.419032"},
|
28
|
+
{"latitude"=>"37.771491", "longitude"=>"-122.413241"},
|
29
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"},
|
30
|
+
{"latitude"=>"37.766588", "longitude"=>"-122.430391"},
|
31
|
+
{"latitude"=>"37.788978", "longitude"=>"-122.40664"},
|
32
|
+
{"latitude"=>"37.769715", "longitude"=>"-122.420427"},
|
33
|
+
{"latitude"=>"37.78832", "longitude"=>"-122.446692"},
|
34
|
+
{"latitude"=>"37.787583", "longitude"=>"-122.421665"},
|
35
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"}
|
36
|
+
]
|
37
37
|
}
|
38
|
-
=end
|
38
|
+
=end
|
data/fixtures/vcr_cassettes/make_post_request.yml
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: post
|
5
|
+
uri: http://hroch486.icpf.cas.cz/cgi-bin/echo.pl
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: your_name=Name
|
9
|
+
headers:
|
10
|
+
accept-encoding:
|
11
|
+
- gzip,deflate,identity
|
12
|
+
accept:
|
13
|
+
- "*/*"
|
14
|
+
user-agent:
|
15
|
+
- Mechanize/2.7.3 Ruby/2.2.2p95 (http://github.com/sparklemotion/mechanize/)
|
16
|
+
accept-charset:
|
17
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
18
|
+
accept-language:
|
19
|
+
- en-us,en;q=0.5
|
20
|
+
host:
|
21
|
+
- hroch486.icpf.cas.cz
|
22
|
+
content-type:
|
23
|
+
- application/x-www-form-urlencoded
|
24
|
+
content-length:
|
25
|
+
- '14'
|
26
|
+
connection:
|
27
|
+
- keep-alive
|
28
|
+
keep-alive:
|
29
|
+
- 300
|
30
|
+
response:
|
31
|
+
status:
|
32
|
+
code: 200
|
33
|
+
message: OK
|
34
|
+
headers:
|
35
|
+
date:
|
36
|
+
- Thu, 13 Aug 2015 07:37:36 GMT
|
37
|
+
server:
|
38
|
+
- Apache/2.2.23 (Fedora)
|
39
|
+
keep-alive:
|
40
|
+
- timeout=15, max=100
|
41
|
+
connection:
|
42
|
+
- Keep-Alive
|
43
|
+
transfer-encoding:
|
44
|
+
- chunked
|
45
|
+
content-type:
|
46
|
+
- text/html; charset=UTF-8
|
47
|
+
body:
|
48
|
+
encoding: UTF-8
|
49
|
+
string: "<HTML>\n<HEAD>\n<TITLE>CGI/1.0 test script report:</TITLE>\n</HEAD>\n<BODY>\n<H1>CGI/1.0
|
50
|
+
test script report:</H1>\nThis CGI script was written in Perl 5 by <a href=\"mailto:wagner@mbox.cesnet.cz\">Z.
|
51
|
+
Wagner</a>.<p>\n<H2>General information</H2>\n<UL>\n<LI>SERVER_SOFTWARE =
|
52
|
+
Apache/2.2.23 (Fedora)\n<LI>SERVER_NAME = hroch486.icpf.cas.cz\n<LI>GATEWAY_INTERFACE
|
53
|
+
= CGI/1.1\n<LI>SERVER_PROTOCOL = HTTP/1.1\n<LI>SERVER_PORT = 80\n<LI>REQUEST_METHOD
|
54
|
+
= POST\n<LI>HTTP_ACCEPT = */*\n<LI>HTTP_REFERER = \n<LI>HTTP_USER_AGENT =
|
55
|
+
Mechanize/2.7.3 Ruby/2.2.2p95 (http://github.com/sparklemotion/mechanize/)\n<LI>PATH_INFO
|
56
|
+
= \n<LI>PATH_TRANSLATED = \n<LI>REQUEST_URI = /cgi-bin/echo.pl\n<LI>SCRIPT_NAME
|
57
|
+
= /cgi-bin/echo.pl\n<LI>QUERY_STRING = \n<LI>REMOTE_HOST = \n<LI>REMOTE_ADDR
|
58
|
+
= 88.147.153.25\n<LI>REMOTE_PORT = 54361\n<LI>REMOTE_USER = \n<LI>AUTH_TYPE
|
59
|
+
= \n<LI>CONTENT_TYPE = application/x-www-form-urlencoded\n<LI>CONTENT_LENGTH
|
60
|
+
= 14\n<LI>HTTP_COOKIES = \n</UL>\n<H2>Apache extensions</H2>\n<UL>\n<LI>SERVER_ADMIN
|
61
|
+
= root@localhost\n<LI>SERVER_SIGNATURE = <address>Apache/2.2.23 (Fedora) Server
|
62
|
+
at hroch486.icpf.cas.cz Port 80</address>\n\n<LI>QUERY_STRING_UNESCAPED =
|
63
|
+
\n<LI>REDIRECT_QUERY_STRING = \n<LI>REDIRECT_URL = \n<LI>UNIQUE_ID = \n<LI>SCRIPT_FILENAME
|
64
|
+
<i>is intentionally hidden (full file name of the script)</i>\n<LI>DOCUMENT_ROOT
|
65
|
+
<i>is intentionally hidden</i>\n</UL>\n<H2>Query</H2>\n<P><code>your_name=Name</code>\n<H2>Parsed
|
66
|
+
values</H2>\n<UL>\n<LI>your_name = Name\n</UL>\n<H2>No cookies!</H2></BODY>\n</HTML>\n"
|
67
|
+
http_version: '1.1'
|
68
|
+
recorded_at: Thu, 13 Aug 2015 07:37:36 GMT
|
69
|
+
recorded_with: VCR 2.9.3
|
data/lib/wombat/crawler.rb
CHANGED
@@ -10,7 +10,14 @@ module Wombat
|
|
10
10
|
include Processing::Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
|
-
|
13
|
+
included do
|
14
|
+
class << self
|
15
|
+
attr_accessor :metadata
|
16
|
+
end
|
17
|
+
self.metadata = DSL::Metadata.new
|
18
|
+
end
|
19
|
+
|
20
|
+
def crawl(url = nil, &block)
|
14
21
|
if block
|
15
22
|
@metadata_dup = self.class.send(:metadata).clone
|
16
23
|
instance_eval do
|
@@ -20,14 +27,14 @@ module Wombat
|
|
20
27
|
end
|
21
28
|
end
|
22
29
|
self.instance_eval &block
|
23
|
-
parsed = parse
|
30
|
+
parsed = parse(@metadata_dup, url)
|
24
31
|
instance_eval do
|
25
32
|
alias :method_missing :old_method_missing
|
26
33
|
remove_instance_variable :@metadata_dup
|
27
34
|
end
|
28
35
|
parsed
|
29
36
|
else
|
30
|
-
parse
|
37
|
+
parse(self.class.send(:metadata), url)
|
31
38
|
end
|
32
39
|
end
|
33
40
|
|
@@ -45,10 +52,9 @@ module Wombat
|
|
45
52
|
def to_ary
|
46
53
|
end
|
47
54
|
|
48
|
-
|
49
|
-
|
50
|
-
@metadata ||= DSL::Metadata.new
|
55
|
+
def inherited(subclass)
|
56
|
+
subclass.metadata = self.metadata.clone
|
51
57
|
end
|
52
58
|
end
|
53
59
|
end
|
54
|
-
end
|
60
|
+
end
|
data/lib/wombat/dsl/metadata.rb
CHANGED
@@ -10,11 +10,17 @@ module Nokogiri
|
|
10
10
|
attr_accessor :headers
|
11
11
|
end
|
12
12
|
end
|
13
|
+
module HTML
|
14
|
+
class Document
|
15
|
+
attr_accessor :mechanize_page
|
16
|
+
end
|
17
|
+
end
|
13
18
|
end
|
14
19
|
|
15
20
|
module Wombat
|
16
21
|
module Processing
|
17
22
|
module Parser
|
23
|
+
HTTP_METHODS = [:get, :post, :put, :patch, :delete, :head]
|
18
24
|
attr_accessor :mechanize, :context, :response_code, :page
|
19
25
|
|
20
26
|
def initialize
|
@@ -31,26 +37,30 @@ module Wombat
|
|
31
37
|
@mechanize.user_agent_alias = Wombat.user_agent_alias if Wombat.user_agent_alias
|
32
38
|
end
|
33
39
|
|
34
|
-
def parse(metadata)
|
35
|
-
@context = parser_for
|
40
|
+
def parse(metadata, url=nil)
|
41
|
+
@context = parser_for(metadata, url)
|
36
42
|
|
37
43
|
Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
|
38
44
|
end
|
39
45
|
|
40
46
|
private
|
41
|
-
def parser_for(metadata)
|
42
|
-
url
|
47
|
+
def parser_for(metadata, url)
|
48
|
+
url ||= "#{metadata[:base_url]}#{metadata[:path]}"
|
43
49
|
page = nil
|
44
50
|
parser = nil
|
51
|
+
_method = method_from(metadata[:http_method])
|
52
|
+
data = metadata[:data]
|
53
|
+
args = [url, data].compact
|
45
54
|
begin
|
46
55
|
@page = metadata[:page]
|
47
56
|
|
48
57
|
if metadata[:document_format] == :html
|
49
|
-
@page = @mechanize.
|
50
|
-
parser = @page.parser
|
58
|
+
@page = @mechanize.public_send(_method, *args) unless @page
|
59
|
+
parser = @page.parser # Nokogiri::HTML::Document
|
60
|
+
parser.mechanize_page = @page # Mechanize::Page
|
51
61
|
parser.headers = @page.header
|
52
62
|
else
|
53
|
-
@page = RestClient.
|
63
|
+
@page = RestClient.public_send(_method, *args) unless @page
|
54
64
|
parser = Nokogiri::XML @page
|
55
65
|
parser.headers = @page.headers
|
56
66
|
end
|
@@ -65,6 +75,11 @@ module Wombat
|
|
65
75
|
raise $!
|
66
76
|
end
|
67
77
|
end
|
78
|
+
|
79
|
+
def method_from(_method)
|
80
|
+
return :get if _method.nil?
|
81
|
+
HTTP_METHODS.detect(->{:get}){ |i| i == _method.downcase.to_sym }
|
82
|
+
end
|
68
83
|
end
|
69
84
|
end
|
70
85
|
end
|