wombat 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +52 -2
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +517 -508
- data/lib/wombat/crawler.rb +2 -6
- data/lib/wombat/iterator.rb +9 -0
- data/lib/wombat/metadata.rb +13 -0
- data/lib/wombat/parser.rb +5 -3
- data/lib/wombat/property_container.rb +1 -1
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +15 -0
- data/spec/integration/integration_spec.rb +7 -1
- data/spec/metadata_spec.rb +6 -0
- data/spec/parser_spec.rb +7 -5
- data/spec/property_container_spec.rb +3 -2
- data/spec/property_locator_spec.rb +3 -3
- data/wombat.gemspec +3 -2
- metadata +27 -26
data/README.md
CHANGED
@@ -2,9 +2,59 @@
|
|
2
2
|
|
3
3
|
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)](http://travis-ci.org/felipecsl/wombat)
|
4
4
|
|
5
|
-
Generic Web crawler with a DSL that parses
|
5
|
+
Generic Web crawler with a DSL that parses structured data from web pages.
|
6
|
+
|
7
|
+
## Usage:
|
8
|
+
|
9
|
+
``gem install wombat``
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
|
13
|
+
# => github_crawler.rb
|
14
|
+
|
15
|
+
#coding: utf-8
|
16
|
+
require 'wombat'
|
17
|
+
|
18
|
+
class GithubCrawler
|
19
|
+
include Wombat::Crawler
|
20
|
+
|
21
|
+
base_url "http://www.github.com"
|
22
|
+
list_page "/"
|
23
|
+
|
24
|
+
headline "xpath=//h1"
|
25
|
+
|
26
|
+
what_is "css=.column.secondary p", :html
|
27
|
+
|
28
|
+
explore "xpath=//ul/li[2]/a" do |e|
|
29
|
+
e.gsub(/Explore/, "LOVE")
|
30
|
+
end
|
31
|
+
|
32
|
+
benefits do |b|
|
33
|
+
b.first_benefit "css=.column.leftmost h3"
|
34
|
+
b.second_benefit "css=.column.leftmid h3"
|
35
|
+
b.third_benefit "css=.column.rightmid h3"
|
36
|
+
b.fourth_benefit "css=.column.rightmost h3"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
```
|
40
|
+
```ruby
|
41
|
+
irb> GithubCrawler.new.crawl
|
42
|
+
=>
|
43
|
+
{
|
44
|
+
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
45
|
+
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
46
|
+
"explore" => "LOVE GitHub",
|
47
|
+
"benefits" => {
|
48
|
+
"first_benefit" => "Team management",
|
49
|
+
"second_benefit" => "Code review",
|
50
|
+
"third_benefit" => "Reliable code hosting",
|
51
|
+
"fourth_benefit" => "Open source collaboration"
|
52
|
+
}
|
53
|
+
}
|
54
|
+
```
|
55
|
+
|
56
|
+
### More advanced constructs like loops, following links, callbacks, etc. to be added/documented soon.
|
6
57
|
|
7
|
-
Still under heavy development, it is being rewritten from scratch as a gem from an already existing project.
|
8
58
|
|
9
59
|
## Contributing to Wombat
|
10
60
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.5
|
1
|
+
0.1.6
|