object-scraper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +4 -0
- data/Rakefile +1 -1
- data/lib/object-scraper/scraper.rb +6 -0
- data/object-scraper.gemspec +2 -2
- data/spec/object-scraper/scraper_spec.rb +16 -0
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -31,6 +31,10 @@ extraction of ruby objects from various web sites.
|
|
31
31
|
|
32
32
|
@objects = Scraper.parse(:twitter)
|
33
33
|
|
34
|
+
If you define multiple scrapers, you can collect all their objects with one simple method
|
35
|
+
|
36
|
+
@objects = Scraper.parse_all
|
37
|
+
|
34
38
|
== Advanced Example
|
35
39
|
|
36
40
|
It is possible to use other existing HTML parsers instead of hpricot.
|
data/Rakefile
CHANGED
@@ -2,7 +2,7 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('object-scraper', '0.0.2') do |p|
|
5
|
+
Echoe.new('object-scraper', '0.0.3') do |p|
|
6
6
|
p.summary = "Recipe like object extraction from HTML sources"
|
7
7
|
p.description = "Object scraper is a thin wrapper for hpricot to enable recipe-like extraction of ruby objects from various web sites."
|
8
8
|
p.url = "http://github.com/enricogenauck/object-scraper"
|
@@ -42,6 +42,12 @@ class Scraper
|
|
42
42
|
def self.parse(name)
|
43
43
|
scraper_by_name(name).parse
|
44
44
|
end
|
45
|
+
|
46
|
+
def self.parse_all
|
47
|
+
objects = []
|
48
|
+
scrapers.each_value { |s| objects << s.parse }
|
49
|
+
objects.flatten
|
50
|
+
end
|
45
51
|
|
46
52
|
def parse
|
47
53
|
doc = open(@scraper_source) { |f| Scraper.scrape_source_with.call(f) }
|
data/object-scraper.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{object-scraper}
|
5
|
-
s.version = "0.0.2"
|
5
|
+
s.version = "0.0.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Enrico Genauck"]
|
9
|
-
s.date = %q{
|
9
|
+
s.date = %q{2010-02-03}
|
10
10
|
s.description = %q{Object scraper is a thin wrapper for hpricot to enable recipe-like extraction of ruby objects from various web sites.}
|
11
11
|
s.email = %q{kontakt@enricogenauck.de}
|
12
12
|
s.extra_rdoc_files = ["README.rdoc", "lib/object-scraper.rb", "lib/object-scraper/scraper.rb"]
|
@@ -43,6 +43,22 @@ describe Scraper do
|
|
43
43
|
@objects.first.date.should == DateTime.parse("Mon Nov 30 04:10:51 +0000 2009")
|
44
44
|
end
|
45
45
|
|
46
|
+
it "should get the objects from multiple scrapers" do
|
47
|
+
Scraper.define(:twitter_1, :class => :entry, :source => @uri, :node => @pattern) do |s|
|
48
|
+
s.text { |node| node.at(".entry-content").inner_html }
|
49
|
+
s.date { |node| DateTime.parse(node.at(".timestamp")[:data][/\'.*\'/].delete("'")) }
|
50
|
+
end
|
51
|
+
|
52
|
+
Scraper.define(:twitter_2, :class => :entry, :source => @uri, :node => @pattern) do |s|
|
53
|
+
s.text { |node| node.at(".entry-content").inner_html }
|
54
|
+
s.date { |node| DateTime.parse(node.at(".timestamp")[:data][/\'.*\'/].delete("'")) }
|
55
|
+
end
|
56
|
+
|
57
|
+
|
58
|
+
@objects = Scraper.parse_all
|
59
|
+
@objects.size.should == 40
|
60
|
+
end
|
61
|
+
|
46
62
|
it "should use a different html parser" do
|
47
63
|
require 'nokogiri'
|
48
64
|
Scraper.scrape_source_with = Proc.new { |source| Nokogiri::HTML(source) }
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: object-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Enrico Genauck
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-02-03 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|