scrappy 0.1.17 → 0.1.18

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.18 2011-02-18
2
+
3
+ * Retry support when opening URLs
4
+ * Update for latest lightRDF version
5
+
1
6
  === 0.1.17 2011-02-15
2
7
 
3
8
  * Enabling headless yarf serialization in shell mode
data/README.rdoc CHANGED
@@ -125,8 +125,11 @@ scrappy offers many different interfaces to get RDF data from a web page:
125
125
  # Parse a knowledge base
126
126
  kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
127
127
 
128
+ # Set kb as default knowledge base
129
+ Scrappy::Agent::Options.kb = kb
130
+
128
131
  # Create an agent
129
- agent = Scrappy::Agent.create :kb=>kb
132
+ agent = Scrappy::Agent.create
130
133
 
131
134
  # Get RDF output
132
135
  output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
11
11
  p.email = "joseignacio.fernandez@gmail.com"
12
12
  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
13
13
  p.ignore_pattern = ["pkg/*"]
14
- p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
14
+ p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1.9'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
15
15
  end
16
16
 
17
17
  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -153,7 +153,7 @@ Copyright
153
153
  open(cache_file, "w") { |f| Marshal.dump(data, f) }
154
154
  data
155
155
  end
156
- RDF::QURI.ns.merge! Agent::Options.kb.ns
156
+ RDF::ID.ns.merge! Agent::Options.kb.ns
157
157
  end
158
158
  end
159
159
 
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
21
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
22
22
 
23
23
  module Scrappy
24
- VERSION = '0.1.17'
24
+ VERSION = '0.1.18'
25
25
  end
26
26
 
27
27
  # Require selectors
@@ -87,11 +87,11 @@ module Scrappy
87
87
 
88
88
  # Enqueue subresources
89
89
  # Pages are enqueued without reducing depth
90
- pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
90
+ pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
91
91
 
92
92
  # All other URIS are enqueued with depth reduced
93
93
  uris = if depth != 0
94
- (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
94
+ (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(Symbol)}
95
95
  else
96
96
  []
97
97
  end
@@ -12,14 +12,20 @@ module Scrappy
12
12
 
13
13
  def uri= uri
14
14
  synchronize do
15
- begin
16
- @mechanize.get uri
17
- @loaded = true
18
- rescue Timeout::Error
19
- @loaded = false
20
- rescue
21
- @loaded = false
15
+ retries = 10
16
+ @loaded = false
17
+ while retries > 0 and !@loaded
18
+ begin
19
+ @mechanize.get uri
20
+ @loaded = true
21
+ rescue Timeout::Error
22
+ @loaded = false
23
+ rescue
24
+ @loaded = false
25
+ end
26
+ retries -= 1 unless @loaded
22
27
  end
28
+ @loaded
23
29
  end
24
30
  end
25
31
 
@@ -121,7 +121,7 @@ module Scrappy
121
121
  def parse_uri(uri, rel_uri)
122
122
  return ID('*') if rel_uri.nil?
123
123
  begin
124
- ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri))
124
+ ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
125
125
  rescue
126
126
  ID('*')
127
127
  end
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.17"
5
+ s.version = "0.1.18"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-02-15}
9
+ s.date = %q{2011-02-18}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
32
32
  s.add_runtime_dependency(%q<camping>, ["= 2.0"])
33
33
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
34
34
  s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
35
- s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
35
+ s.add_runtime_dependency(%q<lightrdf>, [">= 0.1.9"])
36
36
  s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
37
37
  s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
38
38
  else
@@ -41,7 +41,7 @@ Gem::Specification.new do |s|
41
41
  s.add_dependency(%q<camping>, ["= 2.0"])
42
42
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
43
43
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
44
- s.add_dependency(%q<lightrdf>, [">= 0.1"])
44
+ s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
45
45
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
46
46
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
47
47
  end
@@ -51,7 +51,7 @@ Gem::Specification.new do |s|
51
51
  s.add_dependency(%q<camping>, ["= 2.0"])
52
52
  s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
53
53
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
54
- s.add_dependency(%q<lightrdf>, [">= 0.1"])
54
+ s.add_dependency(%q<lightrdf>, [">= 0.1.9"])
55
55
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
56
56
  s.add_dependency(%q<i18n>, [">= 0.4.2"])
57
57
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 17
9
- version: 0.1.17
8
+ - 18
9
+ version: 0.1.18
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-15 00:00:00 +01:00
17
+ date: 2011-02-18 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -96,7 +96,8 @@ dependencies:
96
96
  segments:
97
97
  - 0
98
98
  - 1
99
- version: "0.1"
99
+ - 9
100
+ version: 0.1.9
100
101
  type: :runtime
101
102
  version_requirements: *id006
102
103
  - !ruby/object:Gem::Dependency