scrappy 0.1.10 → 0.1.11

data/History.txt CHANGED
@@ -1,3 +1,10 @@
+ === 0.1.11 2010-12-23
+
+ * Handling of timeout errors
+ * Support for paged resources with sc:Page
+ * Added missing gem dependency (i18n)
+ * Added debug mode
+
  === 0.1.10 2010-12-20

  * Fixed gem dependencies
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
  p.email = "joseignacio.fernandez@gmail.com"
  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
  p.ignore_pattern = ["pkg/*"]
- p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
+ p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
  end

  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -37,6 +37,7 @@ module Scrappy
  opts.on('-h', '--help') { output_help; exit 0 }
  opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
  opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
+ opts.on('-u', '--debug') { Agent::Options.debug = true }
  opts.on('-i', '--interactive') { Options.shell = true }
  opts.on('-s', '--server') { Options.server = true }
  opts.on('-S', '--proxy-server') { Options.proxy = true }
@@ -98,6 +99,7 @@ Options
  -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
  -l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
+ -u, --debug Shows debugging traces
  -i, --interactive Runs interactive shell
  -s, --server Runs web server
  -S, --proxy-server Runs web proxy
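As a quick illustration of the new switch: `-u` simply flips `Agent::Options.debug`, and every trace in the agent is guarded by that flag. A minimal sketch, not part of the gem, assuming an OpenStruct-based options object like the CLI's; the URL is hypothetical:

```ruby
require 'ostruct'

options = OpenStruct.new(:debug => false)
options.debug = true  # the equivalent effect of passing -u/--debug

# Traces throughout the agent follow this guard pattern:
puts "Opening http://example.org..." if options.debug  # hypothetical URL
```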
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

  module Scrappy
- VERSION = '0.1.10'
+ VERSION = '0.1.11'
  end

  # Require selectors
@@ -44,23 +44,28 @@ module Scrappy
  depth = args[:depth]
  request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }

- # Expire cache
- cache.expire! 300 # 5 minutes
-
  # Lookup in cache
  triples = if cache[request]
+ puts "Retrieving cached #{request[:uri]}...done!" if options.debug
+
  cache[request][:response]
  else
  # Perform the request

  sleep 0.001 * options.delay.to_f # Sleep if requested
-
+
+ if options.debug
+ print "Opening #{request[:uri]}..."; $stdout.flush
+ end
+
  if request[:method] == :get
  self.uri = request[:uri]
  else
  raise Exception, 'POST requests not supported yet'
  end

+ puts 'done!' if options.debug
+
  response = if self.html_data?
  add_visual_data! if options.referenceable # Adds tags including visual information
  extract self.uri, html, options.referenceable # Extract data
@@ -76,8 +81,21 @@ module Scrappy
  end

  # Enqueue subresources
- if depth > 0
- items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
+ if depth >= 0
+ # Pages are enqueued without reducing depth
+ pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+
+ # All other URIs are enqueued with depth reduced
+ uris = if depth > 0
+ (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+ else
+ []
+ end
+
+ items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
+
+ items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
+
  if queue.nil?
  triples += process items
  else
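This hunk is the core of the new sc:Page support: resources typed sc:Page are re-enqueued at the same depth, so a paginated listing can be followed to its end without exhausting the crawl budget, while every other URI costs one recursion level. A simplified standalone sketch of that rule, with plain strings standing in for RDF nodes and hypothetical URLs:

```ruby
# Pages keep the current depth; all other URIs are enqueued one level deeper.
def enqueue_items(pages, other_uris, depth)
  return [] if depth < 0
  page_items = pages.map { |uri| { :uri => uri, :depth => depth } }
  uri_items  = depth > 0 ? other_uris.map { |uri| { :uri => uri, :depth => depth - 1 } } : []
  (page_items + uri_items).uniq
end

enqueue_items(['http://example.org/list?page=2'],  # an sc:Page resource
              ['http://example.org/item/1'],       # an ordinary resource
              1)
# => [{:uri=>"http://example.org/list?page=2", :depth=>1},
#     {:uri=>"http://example.org/item/1", :depth=>0}]
```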
@@ -89,18 +107,38 @@ module Scrappy
  end

  def reduce results
+ if options.debug
+ print "Merging results..."; $stdout.flush
+ end
+
  triples = []; results.each { |result| triples += result }
+
+ puts 'done!' if options.debug
+
  triples
  end

  def request args={}
- RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or o!=Node('sc:Index') })
+ # Expire cache
+ cache.expire! 300 # 5 minutes
+
+ RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
  end

  def proxy args={}
  request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
+
+ response = self.request(request)
+
+ if options.debug
+ print "Serializing..."; $stdout.flush
+ end
+
+ output = response.serialize(request[:format])
+
+ puts 'done!' if options.debug

- OpenStruct.new :output => self.request(request).serialize(request[:format]),
+ OpenStruct.new :output => output,
  :content_type => ContentTypes[request[:format]] || 'text/plain',
  :uri => self.uri,
  :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
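Note that `request` now strips both `sc:Index` and `sc:Page` typing triples from the graph handed back to the caller; they are crawl bookkeeping rather than scraped data. A standalone sketch of that filter, with plain arrays standing in for RDF triples and made-up data:

```ruby
# Drop rdf:type triples whose object is an internal crawl marker.
INTERNAL_TYPES = ['sc:Index', 'sc:Page']

triples = [
  ['http://example.org/list?page=2', 'rdf:type', 'sc:Page'],   # hypothetical
  ['http://example.org/item/1',      'rdf:type', 'sioc:Post'], # hypothetical
]

public_triples = triples.reject do |_s, p, o|
  p == 'rdf:type' && INTERNAL_TYPES.include?(o)
end
# => only the sioc:Post triple remains
```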
@@ -14,6 +14,8 @@ module Scrappy
  begin
  @mechanize.get uri
  @loaded = true
+ rescue Timeout::Error
+ @loaded = false
  rescue
  @loaded = false
  end
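The explicit `rescue Timeout::Error` matters because on the Ruby versions current at the time (1.8.x), `Timeout::Error` descended from `Interrupt` rather than `StandardError`, so the bare `rescue` below it never caught timeouts and a slow host could abort the crawl. A minimal standalone sketch of the pattern, assuming a Mechanize agent and a hypothetical URL:

```ruby
require 'mechanize'

agent  = Mechanize.new
loaded = begin
           agent.get('http://example.org/slow')  # hypothetical URL
           true
         rescue Timeout::Error   # not a StandardError on Ruby 1.8
           false
         rescue StandardError    # any other fetch failure
           false
         end
```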
@@ -3,6 +3,10 @@ require 'digest/md5'
  module Scrappy
  module Extractor
  def extract uri, html, referenceable=nil
+ if options.debug
+ print "Extracting #{uri}..."; $stdout.flush
+ end
+
  triples = []
  content = Nokogiri::HTML(html, nil, 'utf-8')

@@ -21,6 +25,8 @@ module Scrappy

  add_referenceable_data content, triples, referenceable if referenceable

+ puts "done!" if options.debug
+
  triples
  end

@@ -40,7 +46,7 @@ module Scrappy

  nodes.each do |node|
  # Build the object
- object = if fragment.sc::type.first == Node('rdf:Literal')
+ object = if fragment.sc::type.include?(Node('rdf:Literal'))
  value = doc[:value].to_s.strip
  if options[:referenceable]
  bnode = Node(nil)
@@ -52,15 +58,13 @@ module Scrappy
  value
  end
  else
- if fragment.sc::type.first and fragment.sc::type.first != Node('rdf:Resource')
- options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
- end
+ fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
  fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
  fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
  node
  end
  fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
-
+
  # Add referenceable data if requested
  if options[:referenceable]
  sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
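The rewrite above also changes behavior slightly: instead of emitting at most one `rdf:type` triple (the first declared type), the extractor now emits one per declared type, skipping only the generic `rdf:Resource`. A tiny sketch with made-up type names:

```ruby
# One rdf:type triple per declared type, except the generic rdf:Resource.
types   = ['sioc:Post', 'foaf:Document', 'rdf:Resource']  # hypothetical
triples = types.reject { |t| t == 'rdf:Resource' }
               .map    { |t| [:node, 'rdf:type', t] }
# => [[:node, "rdf:type", "sioc:Post"], [:node, "rdf:type", "foaf:Document"]]
```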
@@ -81,7 +85,7 @@ module Scrappy
  # From "BaseUriSelector" to "base_uri"
  class_name = selector.rdf::type.first.to_s.split('#').last

- if !selector.sc::debug.empty?
+ if !selector.sc::debug.empty? and options.debug
  puts '== DEBUG'
  puts '== Selector:'
  puts selector.serialize(:yarf, false)
@@ -94,7 +98,7 @@ module Scrappy
  # Process selector
  results = Kernel.const_get(class_name).filter selector, doc

- if !selector.sc::debug.empty?
+ if !selector.sc::debug.empty? and options.debug
  puts "== No results" if results.empty?
  results.each_with_index do |result, i|
  puts "== Result ##{i}:"
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@

  Gem::Specification.new do |s|
  s.name = %q{scrappy}
- s.version = "0.1.10"
+ s.version = "0.1.11"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jose Ignacio"]
- s.date = %q{2010-12-20}
+ s.date = %q{2010-12-23}
  s.default_executable = %q{scrappy}
  s.description = %q{RDF web scraper}
  s.email = %q{joseignacio.fernandez@gmail.com}
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
  s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
  s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
  s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
+ s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
  else
  s.add_dependency(%q<activesupport>, [">= 2.3.5"])
  s.add_dependency(%q<markaby>, [">= 0.7.1"])
@@ -42,6 +43,7 @@ Gem::Specification.new do |s|
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
  s.add_dependency(%q<lightrdf>, [">= 0.1"])
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+ s.add_dependency(%q<i18n>, [">= 0.4.2"])
  end
  else
  s.add_dependency(%q<activesupport>, [">= 2.3.5"])
@@ -51,5 +53,6 @@ Gem::Specification.new do |s|
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
  s.add_dependency(%q<lightrdf>, [">= 0.1"])
  s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+ s.add_dependency(%q<i18n>, [">= 0.4.2"])
  end
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
  segments:
  - 0
  - 1
- - 10
- version: 0.1.10
+ - 11
+ version: 0.1.11
  platform: ruby
  authors:
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-12-20 00:00:00 +01:00
+ date: 2010-12-23 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -113,6 +113,20 @@ dependencies:
  version: 1.1.5
  type: :runtime
  version_requirements: *id007
+ - !ruby/object:Gem::Dependency
+ name: i18n
+ prerelease: false
+ requirement: &id008 !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ segments:
+ - 0
+ - 4
+ - 2
+ version: 0.4.2
+ type: :runtime
+ version_requirements: *id008
  description: RDF web scraper
  email: joseignacio.fernandez@gmail.com
  executables: