scrappy 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/README.rdoc +4 -1
- data/Rakefile +1 -1
- data/bin/scrappy +1 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +2 -2
- data/lib/scrappy/agent/blind_agent.rb +13 -7
- data/lib/scrappy/agent/extractor.rb +1 -1
- data/scrappy.gemspec +5 -5
- metadata +5 -4
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -125,8 +125,11 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
125
125
|
# Parse a knowledge base
|
126
126
|
kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
|
127
127
|
|
128
|
+
# Set kb as default knowledge base
|
129
|
+
Scrappy::Agent::Options.kb = kb
|
130
|
+
|
128
131
|
# Create an agent
|
129
|
-
agent = Scrappy::Agent.create
|
132
|
+
agent = Scrappy::Agent.create
|
130
133
|
|
131
134
|
# Get RDF output
|
132
135
|
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1.9'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -87,11 +87,11 @@ module Scrappy
|
|
87
87
|
|
88
88
|
# Enqueue subresources
|
89
89
|
# Pages are enqueued without reducing depth
|
90
|
-
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(
|
90
|
+
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
|
91
91
|
|
92
92
|
# All other URIS are enqueued with depth reduced
|
93
93
|
uris = if depth != 0
|
94
|
-
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(
|
94
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
|
95
95
|
else
|
96
96
|
[]
|
97
97
|
end
|
data/lib/scrappy/agent/blind_agent.rb
CHANGED
@@ -12,14 +12,20 @@ module Scrappy
|
|
12
12
|
|
13
13
|
def uri= uri
|
14
14
|
synchronize do
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
retries = 10
|
16
|
+
@loaded = false
|
17
|
+
while retries > 0 and !@loaded
|
18
|
+
begin
|
19
|
+
@mechanize.get uri
|
20
|
+
@loaded = true
|
21
|
+
rescue Timeout::Error
|
22
|
+
@loaded = false
|
23
|
+
rescue
|
24
|
+
@loaded = false
|
25
|
+
end
|
26
|
+
retries -= 1 unless @loaded
|
22
27
|
end
|
28
|
+
@loaded
|
23
29
|
end
|
24
30
|
end
|
25
31
|
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.17"
|
5
|
+
s.version = "0.1.18"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-02-
|
9
|
+
s.date = %q{2011-02-18}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<camping>, ["= 2.0"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.1.9"])
|
36
36
|
s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
|
37
37
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
38
38
|
else
|
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
|
|
41
41
|
s.add_dependency(%q<camping>, ["= 2.0"])
|
42
42
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
43
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
44
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
|
45
45
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
46
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
47
|
end
|
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
|
|
51
51
|
s.add_dependency(%q<camping>, ["= 2.0"])
|
52
52
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
53
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
54
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
|
55
55
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
56
56
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
57
57
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 17
|
9
|
-
version: 0.1.17
|
8
|
+
- 18
|
9
|
+
version: 0.1.18
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-02-
|
17
|
+
date: 2011-02-18 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -96,7 +96,8 @@ dependencies:
|
|
96
96
|
segments:
|
97
97
|
- 0
|
98
98
|
- 1
|
99
|
-
|
99
|
+
- 9
|
100
|
+
version: 0.1.9
|
100
101
|
type: :runtime
|
101
102
|
version_requirements: *id006
|
102
103
|
- !ruby/object:Gem::Dependency
|