scrappy 0.1.12 → 0.1.14

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
+=== 0.1.14 2011-01-30
+
+* Added missing file to Manifest
+
+=== 0.1.13 2011-01-27
+
+* Improved memory consumption
+* Fixed redirections in the web interface
+* Added dump-to-disk feature
+
 === 0.1.12 2011-01-10
 
 * Correction for Windows compatibility
data/Manifest CHANGED
@@ -9,6 +9,7 @@ lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
 lib/scrappy/agent/cache.rb
+lib/scrappy/agent/dumper.rb
 lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
data/bin/scrappy CHANGED
@@ -39,6 +39,7 @@ module Scrappy
       opts.on('-h', '--help')         { output_help; exit 0 }
       opts.on('-g URL', '--get URL')  { |url| Options.url = url; Options.http_method=:get }
       opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
+      opts.on('-D', '--dump')         { Agent::Options.dump = true; Agent::Options.format = :rdf }
       opts.on('-u', '--debug')        { Agent::Options.debug = true }
       opts.on('-i', '--interactive')  { Options.shell = true }
       opts.on('-s', '--server')       { Options.server = true }
@@ -101,6 +102,7 @@ Options
   -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
   -l, --levels VALUE      Sets recursion levels for resource crawling (default is 1)
   -d, --delay VALUE       Sets delay (in ms) between requests (default is 0)
+  -D, --dump              Dumps RDF data to disk
   -u, --debug             Shows debugging traces
   -i, --interactive       Runs interactive shell
   -s, --server            Runs web server
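The new `-D` switch follows the usual Ruby `OptionParser` pattern of one flag setting several related options at once (enabling dumping also forces the RDF format). A minimal sketch of that pattern, using a plain `OpenStruct` as a stand-in for scrappy's actual `Options` store:

```ruby
require 'optparse'
require 'ostruct'

# Stand-in for scrappy's Options object; only the pattern is the point here.
options = OpenStruct.new(dump: false, format: nil)

parser = OptionParser.new do |opts|
  # One switch can set several related options, as -D/--dump does above.
  opts.on('-D', '--dump') do
    options.dump   = true
    options.format = :rdf
  end
end

parser.parse!(['-D'])
puts "dump=#{options.dump} format=#{options.format}"  # => dump=true format=rdf
```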
data/lib/scrappy.rb CHANGED
@@ -14,12 +14,13 @@ require 'scrappy/support'
 require 'scrappy/agent/extractor'
 require 'scrappy/agent/map_reduce'
 require 'scrappy/agent/cache'
+require 'scrappy/agent/dumper'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.12'
+  VERSION = '0.1.14'
 end
 
 # Require selectors
data/lib/scrappy/agent/agent.rb CHANGED
@@ -44,6 +44,9 @@ module Scrappy
     depth = args[:depth]
     request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
 
+    # Expire cache
+    cache.expire! 300 # 5 minutes
+
     # Lookup in cache
     triples = if cache[request]
       puts "Retrieving cached #{request[:uri]}...done!" if options.debug
@@ -67,15 +70,17 @@ module Scrappy
       puts 'done!' if options.debug
 
       response = if self.html_data?
-        add_visual_data! if options.referenceable # Adds tags including visual information
-        extract self.uri, html, options.referenceable # Extract data
+        add_visual_data! if options.referenceable # Adds tags including visual information
+        extraction = extract self.uri, html, options.referenceable # Extract data
+        Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
+        extraction
       else
         []
       end
 
       # Cache the request
-      # cache[request] = { :time=>Time.now, :response=>response }
-      # cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
+      cache[request] = { :time=>Time.now, :response=>response }
+      cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
 
       response
     end
@@ -102,11 +107,13 @@ module Scrappy
         items.each { |item| queue << item }
       end
     end
-
-    triples
+
+    triples unless options.dump
   end
 
   def reduce results
+    return [] if options.dump
+
     if options.debug
       print "Merging results..."; $stdout.flush
     end
@@ -119,10 +126,7 @@ module Scrappy
   end
 
   def request args={}
-    # Expire cache
-    cache.expire! 300 # 5 minutes
-
-    RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
+    RDF::Graph.new clean(map(args) || [])
   end
 
   def proxy args={}
@@ -130,13 +134,19 @@ module Scrappy
 
     response = self.request(request)
 
-    if options.debug
-      print "Serializing..."; $stdout.flush
-    end
-
-    output = response.serialize(request[:format])
+    output = if options.dump
+      ""
+    else
+      if options.debug
+        print "Serializing..."; $stdout.flush
+      end
+
+      output = response.serialize request[:format]
 
-    puts 'done!'if options.debug
+      puts 'done!' if options.debug
+
+      output
+    end
 
     OpenStruct.new :output => output,
                    :content_type => ContentTypes[request[:format]] || 'text/plain',
@@ -149,5 +159,9 @@ module Scrappy
     uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
     uri
   end
+
+  def clean triples
+    triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+  end
 end
end
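Factoring the filter into `clean` lets both `request` and the new dump path share it: it removes duplicate triples and drops `rdf:type` statements whose object is `sc:Index` or `sc:Page`, which only mark pages visited during crawling. A standalone sketch of the same logic, with plain strings standing in for scrappy's `Node` objects:

```ruby
# Same shape as Agent#clean above, but with strings instead of Node objects
# so the example runs on its own.
def clean(triples)
  triples.uniq.select do |s, p, o|
    # Keep a triple unless it is an rdf:type statement tagging a crawl page.
    p != 'rdf:type' || !['sc:Index', 'sc:Page'].include?(o)
  end
end

triples = [
  ['http://example.org/a', 'rdf:type', 'sc:Page'],    # dropped
  ['http://example.org/a', 'dc:title', 'Example'],    # kept
  ['http://example.org/a', 'rdf:type', 'foaf:Agent'], # kept: type is not Index/Page
  ['http://example.org/a', 'dc:title', 'Example']     # dropped as a duplicate
]
p clean(triples).size # => 2
```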
data/lib/scrappy/agent/blind_agent.rb CHANGED
@@ -3,6 +3,7 @@ module Scrappy
   def initialize args={}
     super
     @mechanize = Mechanize.new
+    @mechanize.max_history = 20
   end
 
   def uri
data/lib/scrappy/agent/cache.rb CHANGED
@@ -19,10 +19,13 @@ module Scrappy
 
   class Cache < Hash
     include MonitorMixin
+    MAX_ELEMENTS = 100
 
     def expire! timeout
       synchronize do
-        keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
+        keys.each { |key| delete(key) if Time.now.to_i - self[key][:time].to_i > timeout }
+        sort_by { |key, value| value[:time].to_i }[0...size-MAX_ELEMENTS].each { |key, value| delete key } if size > MAX_ELEMENTS
+        self
       end
     end
 
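The `expire!` change adds a size bound on top of the existing time-based expiry, which is where the changelog's memory-consumption improvement comes from: drop entries older than the timeout, then trim the oldest entries until at most `MAX_ELEMENTS` remain. A self-contained sketch of the same policy (plain `Hash`, no `MonitorMixin`, and a lower limit so the trimming is visible):

```ruby
class BoundedCache < Hash
  MAX_ELEMENTS = 2  # the real Cache uses 100

  def expire!(timeout)
    # First pass: drop anything older than the timeout (in seconds).
    keys.each { |key| delete(key) if Time.now.to_i - self[key][:time].to_i > timeout }
    # Second pass: if still over the limit, delete the oldest entries.
    if size > MAX_ELEMENTS
      sort_by { |_, v| v[:time].to_i }[0...size - MAX_ELEMENTS].each { |key, _| delete(key) }
    end
    self
  end
end

cache = BoundedCache.new
cache[:stale] = { time: Time.now - 600 }  # older than the timeout
cache[:a]     = { time: Time.now - 30 }
cache[:b]     = { time: Time.now - 20 }
cache[:c]     = { time: Time.now - 10 }
cache.expire!(300)
p cache.keys  # => [:b, :c] -- :stale timed out, :a trimmed as the oldest survivor
```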
data/lib/scrappy/agent/dumper.rb ADDED
@@ -0,0 +1,13 @@
+module Scrappy
+  class Dumper
+    Mux = Mutex.new
+
+    def self.dump uri, triples, format
+      Mux.synchronize do
+        filename = uri.gsub("http://", "").gsub("https://", "").gsub("/", "-").gsub(".", "_").gsub("?", "+").gsub("&", "+") + ".#{format}"
+        data = RDF::Graph.new(triples).serialize(format)
+        File.open(filename, "w") { |f| f.write data }
+      end
+    end
+  end
+end
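`Dumper.dump` derives the output filename by flattening the page URI into something filesystem-safe. How the `gsub` chain above behaves for a typical URI (the URI here is illustrative):

```ruby
# Illustration of the filename derivation in Dumper.dump above.
uri = "http://example.org/news/index.html?page=2&lang=en"
filename = uri.gsub("http://", "").gsub("https://", "").gsub("/", "-").gsub(".", "_").gsub("?", "+").gsub("&", "+") + ".rdf"
puts filename # => example_org-news-index_html+page=2+lang=en.rdf
```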
data/lib/scrappy/selectors/slice.rb CHANGED
@@ -1,7 +1,7 @@
 module SliceSelector
   def self.filter selector, doc
     selector.rdf::value.map do |separator|
-      slices = doc[:content].text.split(separator)
+      slices = doc[:value].split(separator)
       selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
     end.flatten
   end
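The one-line fix makes `SliceSelector` split the document's already-extracted string `value` instead of re-reading text from the underlying DOM node, so slicing composes with whatever selector produced the value. A minimal illustration, with a plain hash standing in for the doc structure scrappy's selectors pass around:

```ruby
# Only the :value key matters for the splitting behaviour shown here.
doc = { uri: 'http://example.org/item', content: nil, value: '2011-01-30' }

slices = doc[:value].split('-')  # the fixed line: split the string value
p slices                # => ["2011", "01", "30"]
p slices[0].to_s.strip  # => "2011" -- what sc:index 0 would select
```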
data/lib/scrappy/server.rb CHANGED
@@ -51,10 +51,9 @@ module Scrappy
   def process_request method, format, url
     callback = @input['callback']
     response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
-
     case response.status
     when :redirect
-      redirect "/#{format}/#{response.uri}#{inputs}"
+      redirect "/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{inputs}"
     when :ok
       @headers['Content-Type'] = response.content_type
       callback ? "#{callback}(#{response.output})" : response.output
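The redirect fix URL-escapes the target URI before embedding it in the path, then restores `/` and `:` so the result still reads as a URL and path-based routing keeps working. What the expression produces for a URI containing a space and a query string (values are illustrative):

```ruby
require 'cgi'

uri = 'http://example.org/a page?q=1'
# Escape everything, then put '/' and ':' back, as in the redirect above.
escaped = CGI::escape(uri).gsub('%2F', '/').gsub('%3A', ':')
puts escaped # => http://example.org/a+page%3Fq%3D1
```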
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.12"
+  s.version = "0.1.14"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-01-10}
+  s.date = %q{2011-01-30}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
 
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 12
-  version: 0.1.12
+  - 14
+  version: 0.1.14
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-10 00:00:00 +01:00
+date: 2011-01-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -141,6 +141,7 @@ extra_rdoc_files:
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/dumper.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
@@ -170,6 +171,7 @@ files:
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/dumper.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb