scrappy 0.1.12 → 0.1.14

data/History.txt CHANGED
@@ -1,3 +1,13 @@
+=== 0.1.14 2011-01-30
+
+* Added missing file to Manifest
+
+=== 0.1.13 2011-01-27
+
+* Improvements in memory consumption
+* Corrected redirections in the web interface
+* Added dump-to-disk feature
+
 === 0.1.12 2011-01-10
 
 * Correction for Windows compatibility
data/Manifest CHANGED
@@ -9,6 +9,7 @@ lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
 lib/scrappy/agent/cache.rb
+lib/scrappy/agent/dumper.rb
 lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
data/bin/scrappy CHANGED
@@ -39,6 +39,7 @@ module Scrappy
       opts.on('-h', '--help')         { output_help; exit 0 }
       opts.on('-g URL', '--get URL')  { |url| Options.url = url; Options.http_method=:get }
       opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
+      opts.on('-D', '--dump')         { Agent::Options.dump = true; Agent::Options.format = :rdf }
       opts.on('-u', '--debug')        { Agent::Options.debug = true }
       opts.on('-i', '--interactive')  { Options.shell = true }
       opts.on('-s', '--server')       { Options.server = true }
@@ -101,6 +102,7 @@ Options
   -c, --concurrence VALUE    Sets number of concurrent connections for crawling (default is 10)
   -l, --levels VALUE         Sets recursion levels for resource crawling (default is 1)
   -d, --delay VALUE          Sets delay (in ms) between requests (default is 0)
+  -D, --dump                 Dumps RDF data to disk
   -u, --debug                Shows debugging traces
   -i, --interactive          Runs interactive shell
   -s, --server               Runs web server
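With -D enabled, crawled pages are written to disk as RDF files (see lib/scrappy/agent/dumper.rb below) instead of being merged into the printed response. A hypothetical invocation (the URL is illustrative):

    scrappy -g http://example.org -D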
data/lib/scrappy.rb CHANGED
@@ -14,12 +14,13 @@ require 'scrappy/support'
 require 'scrappy/agent/extractor'
 require 'scrappy/agent/map_reduce'
 require 'scrappy/agent/cache'
+require 'scrappy/agent/dumper'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.12'
+  VERSION = '0.1.14'
 end
 
 # Require selectors
data/lib/scrappy/agent/agent.rb CHANGED
@@ -44,6 +44,9 @@ module Scrappy
       depth = args[:depth]
       request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
 
+      # Expire cache
+      cache.expire! 300 # 5 minutes
+
       # Lookup in cache
       triples = if cache[request]
         puts "Retrieving cached #{request[:uri]}...done!" if options.debug
@@ -67,15 +70,17 @@ module Scrappy
       puts 'done!' if options.debug
 
       response = if self.html_data?
-        add_visual_data! if options.referenceable # Adds tags including visual information
-        extract self.uri, html, options.referenceable # Extract data
+        add_visual_data! if options.referenceable                               # Adds tags including visual information
+        extraction = extract self.uri, html, options.referenceable              # Extract data
+        Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
+        extraction
       else
         []
       end
 
       # Cache the request
-      # cache[request] = { :time=>Time.now, :response=>response }
-      # cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
+      cache[request] = { :time=>Time.now, :response=>response }
+      cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
 
       response
     end
@@ -102,11 +107,13 @@ module Scrappy
           items.each { |item| queue << item }
         end
       end
-
-      triples
+
+      triples unless options.dump
     end
 
     def reduce results
+      return [] if options.dump
+
       if options.debug
         print "Merging results..."; $stdout.flush
       end
@@ -119,10 +126,7 @@ module Scrappy
     end
 
     def request args={}
-      # Expire cache
-      cache.expire! 300 # 5 minutes
-
-      RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
+      RDF::Graph.new clean(map(args) || [])
     end
 
     def proxy args={}
@@ -130,13 +134,19 @@ module Scrappy
 
       response = self.request(request)
 
-      if options.debug
-        print "Serializing..."; $stdout.flush
-      end
-
-      output = response.serialize(request[:format])
+      output = if options.dump
+        ""
+      else
+        if options.debug
+          print "Serializing..."; $stdout.flush
+        end
+
+        output = response.serialize request[:format]
 
-      puts 'done!' if options.debug
+        puts 'done!' if options.debug
+
+        output
+      end
 
       OpenStruct.new :output => output,
                      :content_type => ContentTypes[request[:format]] || 'text/plain',
@@ -149,5 +159,9 @@
       uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
       uri
     end
+
+    def clean triples
+      triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+    end
   end
 end
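Two behavioral notes on this file: cache expiry now runs inside map, so every mapped request prunes stale entries, and responses are cached again under both the requested URI and the final (post-redirect) URI. A minimal sketch of the entry shape map writes (the request key is illustrative):

    triples = []  # extraction result for the page
    request = { :method=>:get, :uri=>'http://example.org', :inputs=>{} }
    cache[request] = { :time=>Time.now, :response=>triples }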
data/lib/scrappy/agent/blind_agent.rb CHANGED
@@ -3,6 +3,7 @@ module Scrappy
     def initialize args={}
       super
       @mechanize = Mechanize.new
+      @mechanize.max_history = 20
     end
 
     def uri
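Mechanize keeps every visited page in its history by default, so long crawls grow without bound; capping the history is part of the 0.1.13 memory improvements. The same setting in isolation:

    require 'mechanize'
    agent = Mechanize.new
    agent.max_history = 20  # retain at most the 20 most recent pages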
data/lib/scrappy/agent/cache.rb CHANGED
@@ -19,10 +19,13 @@ module Scrappy
 
   class Cache < Hash
     include MonitorMixin
+    MAX_ELEMENTS = 100
 
     def expire! timeout
       synchronize do
-        keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
+        keys.each { |key| delete(key) if Time.now.to_i - self[key][:time].to_i > timeout }
+        sort_by { |key, value| value[:time].to_i }[0...size-MAX_ELEMENTS].each { |key, value| delete key } if size > MAX_ELEMENTS
+        self
       end
     end
 
data/lib/scrappy/agent/dumper.rb ADDED
@@ -0,0 +1,13 @@
+module Scrappy
+  class Dumper
+    Mux = Mutex.new
+
+    def self.dump uri, triples, format
+      Mux.synchronize do
+        filename = uri.gsub("http://", "").gsub("https://", "").gsub("/", "-").gsub(".", "_").gsub("?", "+").gsub("&", "+") + ".#{format}"
+        data = RDF::Graph.new(triples).serialize(format)
+        File.open(filename, "w") { |f| f.write data }
+      end
+    end
+  end
+end
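Dumper derives the output filename from the URI by stripping the scheme and substituting path and query characters. An illustrative call (the URI and empty triples array are made up):

    triples = []  # [subject, predicate, object] triples from the extractor
    Scrappy::Dumper.dump "http://example.org/news?id=1", triples, :rdf
    # writes example_org-news+id=1.rdf in the working directory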
data/lib/scrappy/selectors/slice.rb CHANGED
@@ -1,7 +1,7 @@
 module SliceSelector
   def self.filter selector, doc
     selector.rdf::value.map do |separator|
-      slices = doc[:content].text.split(separator)
+      slices = doc[:value].split(separator)
       selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
     end.flatten
   end
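The fix makes the slice selector split the value produced by earlier selectors rather than re-reading the whole node's text, so chained selectors narrow the value progressively. The slicing step in isolation (the document hash is illustrative):

    doc = { :uri=>'http://example.org', :content=>nil, :value=>'2011-01-30' }
    slices = doc[:value].split('-')
    slices[1].to_s.strip  # => "01"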
data/lib/scrappy/server.rb CHANGED
@@ -51,10 +51,9 @@ module Scrappy
     def process_request method, format, url
       callback = @input['callback']
       response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
-
       case response.status
       when :redirect
-        redirect "/#{format}/#{response.uri}#{inputs}"
+        redirect "/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{inputs}"
       when :ok
         @headers['Content-Type'] = response.content_type
         callback ? "#{callback}(#{response.output})" : response.output
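The redirect fix percent-encodes the target URI but restores '/' and ':' afterwards, so the scheme and path stay readable while query characters are safely escaped. The transformation in isolation (the URL is illustrative):

    require 'cgi'
    CGI::escape("http://example.org/a b?x=1&y=2").gsub('%2F','/').gsub('%3A',':')
    # => "http://example.org/a+b%3Fx%3D1%26y%3D2"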
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.12"
+  s.version = "0.1.14"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-01-10}
+  s.date = %q{2011-01-30}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 12
-  version: 0.1.12
+  - 14
+  version: 0.1.14
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-10 00:00:00 +01:00
+date: 2011-01-30 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -141,6 +141,7 @@ extra_rdoc_files:
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/dumper.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
@@ -170,6 +171,7 @@ files:
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/dumper.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb