scrappy 0.1.17 → 0.1.18
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/README.rdoc +4 -1
- data/Rakefile +1 -1
- data/bin/scrappy +1 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +2 -2
- data/lib/scrappy/agent/blind_agent.rb +13 -7
- data/lib/scrappy/agent/extractor.rb +1 -1
- data/scrappy.gemspec +5 -5
- metadata +5 -4
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -125,8 +125,11 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
125
125
|
# Parse a knowledge base
|
126
126
|
kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
|
127
127
|
|
128
|
+
# Set kb as default knowledge base
|
129
|
+
Scrappy::Agent::Options.kb = kb
|
130
|
+
|
128
131
|
# Create an agent
|
129
|
-
agent = Scrappy::Agent.create
|
132
|
+
agent = Scrappy::Agent.create
|
130
133
|
|
131
134
|
# Get RDF output
|
132
135
|
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1.9'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -87,11 +87,11 @@ module Scrappy
|
|
87
87
|
|
88
88
|
# Enqueue subresources
|
89
89
|
# Pages are enqueued without reducing depth
|
90
|
-
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(
|
90
|
+
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
|
91
91
|
|
92
92
|
# All other URIS are enqueued with depth reduced
|
93
93
|
uris = if depth != 0
|
94
|
-
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(
|
94
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
|
95
95
|
else
|
96
96
|
[]
|
97
97
|
end
|
data/lib/scrappy/agent/blind_agent.rb
CHANGED
@@ -12,14 +12,20 @@ module Scrappy
|
|
12
12
|
|
13
13
|
def uri= uri
|
14
14
|
synchronize do
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
retries = 10
|
16
|
+
@loaded = false
|
17
|
+
while retries > 0 and !@loaded
|
18
|
+
begin
|
19
|
+
@mechanize.get uri
|
20
|
+
@loaded = true
|
21
|
+
rescue Timeout::Error
|
22
|
+
@loaded = false
|
23
|
+
rescue
|
24
|
+
@loaded = false
|
25
|
+
end
|
26
|
+
retries -= 1 unless @loaded
|
22
27
|
end
|
28
|
+
@loaded
|
23
29
|
end
|
24
30
|
end
|
25
31
|
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.17"
|
5
|
+
s.version = "0.1.18"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-18}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<camping>, ["= 2.0"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.1.9"])
|
36
36
|
s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
|
37
37
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
38
38
|
else
|
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_dependency(%q<camping>, ["= 2.0"])
|
42
42
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
43
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
44
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
|
45
45
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
end
|
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
|
|
51
51
|
s.add_dependency(%q<camping>, ["= 2.0"])
|
52
52
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
53
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
54
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
|
55
55
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
56
56
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
57
57
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 17
|
9
|
-
version: 0.1.17
|
8
|
+
- 18
|
9
|
+
version: 0.1.18
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-18 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -96,7 +96,8 @@ dependencies:
|
|
96
96
|
segments:
|
97
97
|
- 0
|
98
98
|
- 1
|
99
|
-
|
99
|
+
- 9
|
100
|
+
version: 0.1.9
|
100
101
|
type: :runtime
|
101
102
|
version_requirements: *id006
|
102
103
|
- !ruby/object:Gem::Dependency
|