scrappy 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,10 @@
1
+ === 0.1.11 2010-12-23
2
+
3
+ * Handling of timeout errors
4
+ * Support for paged resources with sc:Page
5
+ * Added missing gem dependency (i18n)
6
+ * Added debug mode
7
+
1
8
  === 0.1.10 2010-12-20
2
9
 
3
10
  * Fixed gem dependencies
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
11
11
  p.email = "joseignacio.fernandez@gmail.com"
12
12
  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
13
13
  p.ignore_pattern = ["pkg/*"]
14
- p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
14
+ p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
15
15
  end
16
16
 
17
17
  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -37,6 +37,7 @@ module Scrappy
37
37
  opts.on('-h', '--help') { output_help; exit 0 }
38
38
  opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
39
39
  opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
40
+ opts.on('-u', '--debug') { Agent::Options.debug = true }
40
41
  opts.on('-i', '--interactive') { Options.shell = true }
41
42
  opts.on('-s', '--server') { Options.server = true }
42
43
  opts.on('-S', '--proxy-server') { Options.proxy = true }
@@ -98,6 +99,7 @@ Options
98
99
  -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
99
100
  -l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
100
101
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
102
+ -u, --debug Shows debugging traces
101
103
  -i, --interactive Runs interactive shell
102
104
  -s, --server Runs web server
103
105
  -S, --proxy-server Runs web proxy
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.10'
22
+ VERSION = '0.1.11'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -44,23 +44,28 @@ module Scrappy
44
44
  depth = args[:depth]
45
45
  request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
46
46
 
47
- # Expire cache
48
- cache.expire! 300 # 5 minutes
49
-
50
47
  # Lookup in cache
51
48
  triples = if cache[request]
49
+ puts "Retrieving cached #{request[:uri]}...done!" if options.debug
50
+
52
51
  cache[request][:response]
53
52
  else
54
53
  # Perform the request
55
54
 
56
55
  sleep 0.001 * options.delay.to_f # Sleep if requested
57
-
56
+
57
+ if options.debug
58
+ print "Opening #{request[:uri]}..."; $stdout.flush
59
+ end
60
+
58
61
  if request[:method] == :get
59
62
  self.uri = request[:uri]
60
63
  else
61
64
  raise Exception, 'POST requests not supported yet'
62
65
  end
63
66
 
67
+ puts 'done!' if options.debug
68
+
64
69
  response = if self.html_data?
65
70
  add_visual_data! if options.referenceable # Adds tags including visual information
66
71
  extract self.uri, html, options.referenceable # Extract data
@@ -76,8 +81,21 @@ module Scrappy
76
81
  end
77
82
 
78
83
  # Enqueue subresources
79
- if depth > 0
80
- items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
84
+ if depth >= 0
85
+ # Pages are enqueued without reducing depth
86
+ pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
87
+
88
+ # All other URIS are enqueued with depth reduced
89
+ uris = if depth > 0
90
+ (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
91
+ else
92
+ []
93
+ end
94
+
95
+ items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
96
+
97
+ items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
98
+
81
99
  if queue.nil?
82
100
  triples += process items
83
101
  else
@@ -89,18 +107,38 @@ module Scrappy
89
107
  end
90
108
 
91
109
  def reduce results
110
+ if options.debug
111
+ print "Merging results..."; $stdout.flush
112
+ end
113
+
92
114
  triples = []; results.each { |result| triples += result }
115
+
116
+ puts 'done!'if options.debug
117
+
93
118
  triples
94
119
  end
95
120
 
96
121
  def request args={}
97
- RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or o!=Node('sc:Index') })
122
+ # Expire cache
123
+ cache.expire! 300 # 5 minutes
124
+
125
+ RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
98
126
  end
99
127
 
100
128
  def proxy args={}
101
129
  request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
130
+
131
+ response = self.request(request)
132
+
133
+ if options.debug
134
+ print "Serializing..."; $stdout.flush
135
+ end
136
+
137
+ output = response.serialize(request[:format])
138
+
139
+ puts 'done!'if options.debug
102
140
 
103
- OpenStruct.new :output => self.request(request).serialize(request[:format]),
141
+ OpenStruct.new :output => output,
104
142
  :content_type => ContentTypes[request[:format]] || 'text/plain',
105
143
  :uri => self.uri,
106
144
  :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
@@ -14,6 +14,8 @@ module Scrappy
14
14
  begin
15
15
  @mechanize.get uri
16
16
  @loaded = true
17
+ rescue Timeout::Error
18
+ @loaded = false
17
19
  rescue
18
20
  @loaded = false
19
21
  end
@@ -3,6 +3,10 @@ require 'digest/md5'
3
3
  module Scrappy
4
4
  module Extractor
5
5
  def extract uri, html, referenceable=nil
6
+ if options.debug
7
+ print "Extracting #{uri}..."; $stdout.flush
8
+ end
9
+
6
10
  triples = []
7
11
  content = Nokogiri::HTML(html, nil, 'utf-8')
8
12
 
@@ -21,6 +25,8 @@ module Scrappy
21
25
 
22
26
  add_referenceable_data content, triples, referenceable if referenceable
23
27
 
28
+ puts "done!" if options.debug
29
+
24
30
  triples
25
31
  end
26
32
 
@@ -40,7 +46,7 @@ module Scrappy
40
46
 
41
47
  nodes.each do |node|
42
48
  # Build the object
43
- object = if fragment.sc::type.first == Node('rdf:Literal')
49
+ object = if fragment.sc::type.include?(Node('rdf:Literal'))
44
50
  value = doc[:value].to_s.strip
45
51
  if options[:referenceable]
46
52
  bnode = Node(nil)
@@ -52,15 +58,13 @@ module Scrappy
52
58
  value
53
59
  end
54
60
  else
55
- if fragment.sc::type.first and fragment.sc::type.first != Node('rdf:Resource')
56
- options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
57
- end
61
+ fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
58
62
  fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
59
63
  fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
60
64
  node
61
65
  end
62
66
  fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
63
-
67
+
64
68
  # Add referenceable data if requested
65
69
  if options[:referenceable]
66
70
  sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
@@ -81,7 +85,7 @@ module Scrappy
81
85
  # From "BaseUriSelector" to "base_uri"
82
86
  class_name = selector.rdf::type.first.to_s.split('#').last
83
87
 
84
- if !selector.sc::debug.empty?
88
+ if !selector.sc::debug.empty? and options.debug
85
89
  puts '== DEBUG'
86
90
  puts '== Selector:'
87
91
  puts selector.serialize(:yarf, false)
@@ -94,7 +98,7 @@ module Scrappy
94
98
  # Process selector
95
99
  results = Kernel.const_get(class_name).filter selector, doc
96
100
 
97
- if !selector.sc::debug.empty?
101
+ if !selector.sc::debug.empty? and options.debug
98
102
  puts "== No results" if results.empty?
99
103
  results.each_with_index do |result, i|
100
104
  puts "== Result ##{i}:"
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.10"
5
+ s.version = "0.1.11"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-12-20}
9
+ s.date = %q{2010-12-23}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
34
34
  s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
35
35
  s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
36
36
  s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
37
+ s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
37
38
  else
38
39
  s.add_dependency(%q<activesupport>, [">= 2.3.5"])
39
40
  s.add_dependency(%q<markaby>, [">= 0.7.1"])
@@ -42,6 +43,7 @@ Gem::Specification.new do |s|
42
43
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
43
44
  s.add_dependency(%q<lightrdf>, [">= 0.1"])
44
45
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
46
+ s.add_dependency(%q<i18n>, [">= 0.4.2"])
45
47
  end
46
48
  else
47
49
  s.add_dependency(%q<activesupport>, [">= 2.3.5"])
@@ -51,5 +53,6 @@ Gem::Specification.new do |s|
51
53
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
52
54
  s.add_dependency(%q<lightrdf>, [">= 0.1"])
53
55
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
56
+ s.add_dependency(%q<i18n>, [">= 0.4.2"])
54
57
  end
55
58
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 10
9
- version: 0.1.10
8
+ - 11
9
+ version: 0.1.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-20 00:00:00 +01:00
17
+ date: 2010-12-23 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -113,6 +113,20 @@ dependencies:
113
113
  version: 1.1.5
114
114
  type: :runtime
115
115
  version_requirements: *id007
116
+ - !ruby/object:Gem::Dependency
117
+ name: i18n
118
+ prerelease: false
119
+ requirement: &id008 !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ segments:
124
+ - 0
125
+ - 4
126
+ - 2
127
+ version: 0.4.2
128
+ type: :runtime
129
+ version_requirements: *id008
116
130
  description: RDF web scraper
117
131
  email: joseignacio.fernandez@gmail.com
118
132
  executables: