wombat 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +52 -2
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/basic_crawler_page.yml +517 -508
- data/lib/wombat/crawler.rb +2 -6
- data/lib/wombat/iterator.rb +9 -0
- data/lib/wombat/metadata.rb +13 -0
- data/lib/wombat/parser.rb +5 -3
- data/lib/wombat/property_container.rb +1 -1
- data/lib/wombat/property_locator.rb +3 -3
- data/spec/crawler_spec.rb +15 -0
- data/spec/integration/integration_spec.rb +7 -1
- data/spec/metadata_spec.rb +6 -0
- data/spec/parser_spec.rb +7 -5
- data/spec/property_container_spec.rb +3 -2
- data/spec/property_locator_spec.rb +3 -3
- data/wombat.gemspec +3 -2
- metadata +27 -26
data/README.md
CHANGED
@@ -2,9 +2,59 @@
|
|
2
2
|
|
3
3
|
[Build Status](http://travis-ci.org/felipecsl/wombat)
|
4
4
|
|
5
|
-
Generic Web crawler with a DSL that parses
|
5
|
+
Generic Web crawler with a DSL that parses structured data from web pages.
|
6
|
+
|
7
|
+
## Usage:
|
8
|
+
|
9
|
+
``gem install wombat``
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
|
13
|
+
# => github_crawler.rb
|
14
|
+
|
15
|
+
#coding: utf-8
|
16
|
+
require 'wombat'
|
17
|
+
|
18
|
+
class GithubCrawler
|
19
|
+
include Wombat::Crawler
|
20
|
+
|
21
|
+
base_url "http://www.github.com"
|
22
|
+
list_page "/"
|
23
|
+
|
24
|
+
headline "xpath=//h1"
|
25
|
+
|
26
|
+
what_is "css=.column.secondary p", :html
|
27
|
+
|
28
|
+
explore "xpath=//ul/li[2]/a" do |e|
|
29
|
+
e.gsub(/Explore/, "LOVE")
|
30
|
+
end
|
31
|
+
|
32
|
+
benefits do |b|
|
33
|
+
b.first_benefit "css=.column.leftmost h3"
|
34
|
+
b.second_benefit "css=.column.leftmid h3"
|
35
|
+
b.third_benefit "css=.column.rightmid h3"
|
36
|
+
b.fourth_benefit "css=.column.rightmost h3"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
```
|
40
|
+
```ruby
|
41
|
+
irb> GithubCrawler.new.crawl
|
42
|
+
=>
|
43
|
+
{
|
44
|
+
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
45
|
+
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
46
|
+
"explore" => "LOVE GitHub",
|
47
|
+
"benefits" => {
|
48
|
+
"first_benefit" => "Team management",
|
49
|
+
"second_benefit" => "Code review",
|
50
|
+
"third_benefit" => "Reliable code hosting",
|
51
|
+
"fourth_benefit" => "Open source collaboration"
|
52
|
+
}
|
53
|
+
}
|
54
|
+
```
|
55
|
+
|
56
|
+
### More advanced constructs like loops, following links, callbacks, etc. to be added/documented soon.
|
6
57
|
|
7
|
-
Still under heavy development, it is being rewritten from scratch as a gem from an already existing project.
|
8
58
|
|
9
59
|
## Contributing to Wombat
|
10
60
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.5
|
1
|
+
0.1.6
|