object-scraper 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +1 -1
- data/Rakefile +1 -1
- data/lib/object-scraper/scraper.rb +5 -1
- data/object-scraper.gemspec +2 -2
- data/spec/data/incomplete_objects.html +12 -0
- data/spec/object-scraper/scraper_spec.rb +12 -0
- metadata +3 -2
data/Manifest
CHANGED
data/Rakefile
CHANGED
@@ -2,7 +2,7 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('object-scraper', '0.0.3') do |p|
|
5
|
+
Echoe.new('object-scraper', '0.0.4') do |p|
|
6
6
|
p.summary = "Recipe like object extraction from HTML sources"
|
7
7
|
p.description = "Object scraper is a thin wrapper for hpricot to enable recipe-like extraction of ruby objects from various web sites."
|
8
8
|
p.url = "http://github.com/enricogenauck/object-scraper"
|
@@ -66,7 +66,11 @@ class Scraper
|
|
66
66
|
|
67
67
|
def method_missing(symbol, *args, &block)
|
68
68
|
if block_given?
|
69
|
-
@current_object.send("#{symbol}=", yield(@current_node))
|
69
|
+
@current_object.send("#{symbol}=", begin
|
70
|
+
yield(@current_node)
|
71
|
+
rescue
|
72
|
+
puts "Warning, parsing failed at #{@current_node.inspect}"
|
73
|
+
end)
|
70
74
|
else
|
71
75
|
@current_object.send("#{symbol}=", args.first)
|
72
76
|
end
|
data/object-scraper.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{object-scraper}
|
5
|
-
s.version = "0.0.3"
|
5
|
+
s.version = "0.0.4"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Enrico Genauck"]
|
@@ -10,7 +10,7 @@ Gem::Specification.new do |s|
|
|
10
10
|
s.description = %q{Object scraper is a thin wrapper for hpricot to enable recipe-like extraction of ruby objects from various web sites.}
|
11
11
|
s.email = %q{kontakt@enricogenauck.de}
|
12
12
|
s.extra_rdoc_files = ["README.rdoc", "lib/object-scraper.rb", "lib/object-scraper/scraper.rb"]
|
13
|
-
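s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/object-scraper.rb", "lib/object-scraper/scraper.rb", "object-scraper.gemspec", "spec/data/twitter.html", "spec/object-scraper/scraper_spec.rb", "spec/spec.opts", "spec/spec_helper.rb"]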
|
13
|
+
s.files = ["Manifest", "README.rdoc", "Rakefile", "lib/object-scraper.rb", "lib/object-scraper/scraper.rb", "spec/data/incomplete_objects.html", "spec/data/twitter.html", "spec/object-scraper/scraper_spec.rb", "spec/spec.opts", "spec/spec_helper.rb", "object-scraper.gemspec"]
|
14
14
|
s.homepage = %q{http://github.com/enricogenauck/object-scraper}
|
15
15
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Object-scraper", "--main", "README.rdoc"]
|
16
16
|
s.require_paths = ["lib"]
|
@@ -3,6 +3,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec_helper'))
|
|
3
3
|
describe Scraper do
|
4
4
|
before :all do
|
5
5
|
@uri = File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'twitter.html' ))
|
6
|
+
@faulty_source = File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'incomplete_objects.html' ))
|
6
7
|
@pattern = ".status"
|
7
8
|
class Entry < Object
|
8
9
|
attr_accessor :text, :date
|
@@ -74,5 +75,16 @@ describe Scraper do
|
|
74
75
|
@objects.first.date.should == DateTime.parse("Mon Nov 30 04:10:51 +0000 2009")
|
75
76
|
end
|
76
77
|
|
78
|
+
it "should get the objects despite of parse errors" do
|
79
|
+
Scraper.define(:errors, :class => :entry, :source => @faulty_source, :node => @pattern) do |s|
|
80
|
+
s.text { |node| node.at("h1").inner_html }
|
81
|
+
s.date { |node| node.at("p").inner_html }
|
82
|
+
end
|
83
|
+
|
84
|
+
@objects = Scraper.parse(:errors)
|
85
|
+
@objects[0].date.should == "content"
|
86
|
+
@objects[1].date.should be_nil
|
87
|
+
end
|
88
|
+
|
77
89
|
end
|
78
90
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: object-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Enrico Genauck
|
@@ -38,11 +38,12 @@ files:
|
|
38
38
|
- Rakefile
|
39
39
|
- lib/object-scraper.rb
|
40
40
|
- lib/object-scraper/scraper.rb
|
41
|
-
- object-scraper.gemspec
|
41
|
+
- spec/data/incomplete_objects.html
|
42
42
|
- spec/data/twitter.html
|
43
43
|
- spec/object-scraper/scraper_spec.rb
|
44
44
|
- spec/spec.opts
|
45
45
|
- spec/spec_helper.rb
|
46
|
+
- object-scraper.gemspec
|
46
47
|
has_rdoc: true
|
47
48
|
homepage: http://github.com/enricogenauck/object-scraper
|
48
49
|
licenses: []
|