scrappy 0.1.17 → 0.1.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.18 2011-02-18
2
+
3
+ * Retry support when opening URLs
4
+ * Update for latest lightRDF version
5
+
1
6
  === 0.1.17 2011-02-15
2
7
 
3
8
  * Enabling headless yarf serialization in shell mode
data/README.rdoc CHANGED
@@ -125,8 +125,11 @@ scrappy offers many different interfaces to get RDF data from a web page:
125
125
  # Parse a knowledge base
126
126
  kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
127
127
 
128
+ # Set kb as default knowledge base
129
+ Scrappy::Agent::Options.kb = kb
130
+
128
131
  # Create an agent
129
- agent = Scrappy::Agent.create :kb=>kb
132
+ agent = Scrappy::Agent.create
130
133
 
131
134
  # Get RDF output
132
135
  output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
11
11
  p.email = "joseignacio.fernandez@gmail.com"
12
12
  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
13
13
  p.ignore_pattern = ["pkg/*"]
14
- p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
14
+ p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1.9'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
15
15
  end
16
16
 
17
17
  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -153,7 +153,7 @@ Copyright
153
153
  open(cache_file, "w") { |f| Marshal.dump(data, f) }
154
154
  data
155
155
  end
156
- RDF::QURI.ns.merge! Agent::Options.kb.ns
156
+ RDF::ID.ns.merge! Agent::Options.kb.ns
157
157
  end
158
158
  end
159
159
 
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
21
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
22
22
 
23
23
  module Scrappy
24
- VERSION = '0.1.17'
24
+ VERSION = '0.1.18'
25
25
  end
26
26
 
27
27
  # Require selectors
@@ -87,11 +87,11 @@ module Scrappy
87
87
 
88
88
  # Enqueue subresources
89
89
  # Pages are enqueued without reducing depth
90
- pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
90
+ pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
91
91
 
92
92
  # All other URIS are enqueued with depth reduced
93
93
  uris = if depth != 0
94
- (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
94
+ (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
95
95
  else
96
96
  []
97
97
  end
@@ -12,14 +12,20 @@ module Scrappy
12
12
 
13
13
  def uri= uri
14
14
  synchronize do
15
- begin
16
- @mechanize.get uri
17
- @loaded = true
18
- rescue Timeout::Error
19
- @loaded = false
20
- rescue
21
- @loaded = false
15
+ retries = 10
16
+ @loaded = false
17
+ while retries > 0 and !@loaded
18
+ begin
19
+ @mechanize.get uri
20
+ @loaded = true
21
+ rescue Timeout::Error
22
+ @loaded = false
23
+ rescue
24
+ @loaded = false
25
+ end
26
+ retries -= 1 unless @loaded
22
27
  end
28
+ @loaded
23
29
  end
24
30
  end
25
31
 
@@ -121,7 +121,7 @@ module Scrappy
121
121
  def parse_uri(uri, rel_uri)
122
122
  return ID('*') if rel_uri.nil?
123
123
  begin
124
- ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri))
124
+ ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
125
125
  rescue
126
126
  ID('*')
127
127
  end
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.17"
5
+ s.version = "0.1.18"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-02-15}
9
+ s.date = %q{2011-02-18}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  s.add_runtime_dependency(%q<camping>, ["= 2.0"])
33
33
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
34
34
  s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
35
- s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
35
+ s.add_runtime_dependency(%q<lightrdf>, [">= 0.1.9"])
36
36
  s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
37
37
  s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
38
38
  else
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
41
41
  s.add_dependency(%q<camping>, ["= 2.0"])
42
42
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
43
43
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
44
- s.add_dependency(%q<lightrdf>, [">= 0.1"])
44
+ s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
45
45
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
46
46
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
47
47
  end
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
51
51
  s.add_dependency(%q<camping>, ["= 2.0"])
52
52
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
53
53
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
54
- s.add_dependency(%q<lightrdf>, [">= 0.1"])
54
+ s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
55
55
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
56
56
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
57
57
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 17
9
- version: 0.1.17
8
+ - 18
9
+ version: 0.1.18
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-15 00:00:00 +01:00
17
+ date: 2011-02-18 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -96,7 +96,8 @@ dependencies:
96
96
  segments:
97
97
  - 0
98
98
  - 1
99
- version: "0.1"
99
+ - 9
100
+ version: 0.1.9
100
101
  type: :runtime
101
102
  version_requirements: *id006
102
103
  - !ruby/object:Gem::Dependency