scrappy 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Manifest +1 -0
- data/bin/scrappy +2 -0
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +30 -16
- data/lib/scrappy/agent/blind_agent.rb +1 -0
- data/lib/scrappy/agent/cache.rb +4 -1
- data/lib/scrappy/agent/dumper.rb +13 -0
- data/lib/scrappy/selectors/slice.rb +1 -1
- data/lib/scrappy/server.rb +1 -2
- data/scrappy.gemspec +4 -4
- metadata +5 -3
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 0.1.14 2011-01-30
|
2
|
+
|
3
|
+
* Added missing file to Manifest
|
4
|
+
|
5
|
+
=== 0.1.13 2011-01-27
|
6
|
+
|
7
|
+
* Improvements on memory consumption
|
8
|
+
* Correction in redirections in the web interface
|
9
|
+
* Dump to disk feature
|
10
|
+
|
1
11
|
=== 0.1.12 2011-01-10
|
2
12
|
|
3
13
|
* Correction for Windows compatibility
|
data/Manifest
CHANGED
data/bin/scrappy
CHANGED
@@ -39,6 +39,7 @@ module Scrappy
|
|
39
39
|
opts.on('-h', '--help') { output_help; exit 0 }
|
40
40
|
opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
|
41
41
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
42
|
+
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
42
43
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
43
44
|
opts.on('-i', '--interactive') { Options.shell = true }
|
44
45
|
opts.on('-s', '--server') { Options.server = true }
|
@@ -101,6 +102,7 @@ Options
|
|
101
102
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
102
103
|
-l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
|
103
104
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
105
|
+
-D, --dump Dumps RDF data to disk
|
104
106
|
-u, --debug Shows debugging traces
|
105
107
|
-i, --interactive Runs interactive shell
|
106
108
|
-s, --server Runs web server
|
data/lib/scrappy.rb
CHANGED
@@ -14,12 +14,13 @@ require 'scrappy/support'
|
|
14
14
|
require 'scrappy/agent/extractor'
|
15
15
|
require 'scrappy/agent/map_reduce'
|
16
16
|
require 'scrappy/agent/cache'
|
17
|
+
require 'scrappy/agent/dumper'
|
17
18
|
require 'scrappy/agent/agent'
|
18
19
|
|
19
20
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
20
21
|
|
21
22
|
module Scrappy
|
22
|
-
VERSION = '0.1.12'
|
23
|
+
VERSION = '0.1.14'
|
23
24
|
end
|
24
25
|
|
25
26
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -44,6 +44,9 @@ module Scrappy
|
|
44
44
|
depth = args[:depth]
|
45
45
|
request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
|
46
46
|
|
47
|
+
# Expire cache
|
48
|
+
cache.expire! 300 # 5 minutes
|
49
|
+
|
47
50
|
# Lookup in cache
|
48
51
|
triples = if cache[request]
|
49
52
|
puts "Retrieving cached #{request[:uri]}...done!" if options.debug
|
@@ -67,15 +70,17 @@ module Scrappy
|
|
67
70
|
puts 'done!' if options.debug
|
68
71
|
|
69
72
|
response = if self.html_data?
|
70
|
-
add_visual_data! if options.referenceable
|
71
|
-
extract self.uri, html, options.referenceable
|
73
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
74
|
+
extraction = extract self.uri, html, options.referenceable # Extract data
|
75
|
+
Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
|
76
|
+
extraction
|
72
77
|
else
|
73
78
|
[]
|
74
79
|
end
|
75
80
|
|
76
81
|
# Cache the request
|
77
|
-
|
78
|
-
|
82
|
+
cache[request] = { :time=>Time.now, :response=>response }
|
83
|
+
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
|
79
84
|
|
80
85
|
response
|
81
86
|
end
|
@@ -102,11 +107,13 @@ module Scrappy
|
|
102
107
|
items.each { |item| queue << item }
|
103
108
|
end
|
104
109
|
end
|
105
|
-
|
106
|
-
triples
|
110
|
+
|
111
|
+
triples unless options.dump
|
107
112
|
end
|
108
113
|
|
109
114
|
def reduce results
|
115
|
+
return [] if options.dump
|
116
|
+
|
110
117
|
if options.debug
|
111
118
|
print "Merging results..."; $stdout.flush
|
112
119
|
end
|
@@ -119,10 +126,7 @@ module Scrappy
|
|
119
126
|
end
|
120
127
|
|
121
128
|
def request args={}
|
122
|
-
|
123
|
-
cache.expire! 300 # 5 minutes
|
124
|
-
|
125
|
-
RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
|
129
|
+
RDF::Graph.new clean(map(args) || [])
|
126
130
|
end
|
127
131
|
|
128
132
|
def proxy args={}
|
@@ -130,13 +134,19 @@ module Scrappy
|
|
130
134
|
|
131
135
|
response = self.request(request)
|
132
136
|
|
133
|
-
if options.
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
137
|
+
output = if options.dump
|
138
|
+
""
|
139
|
+
else
|
140
|
+
if options.debug
|
141
|
+
print "Serializing..."; $stdout.flush
|
142
|
+
end
|
143
|
+
|
144
|
+
output = response.serialize request[:format]
|
138
145
|
|
139
|
-
|
146
|
+
puts 'done!'if options.debug
|
147
|
+
|
148
|
+
output
|
149
|
+
end
|
140
150
|
|
141
151
|
OpenStruct.new :output => output,
|
142
152
|
:content_type => ContentTypes[request[:format]] || 'text/plain',
|
@@ -149,5 +159,9 @@ module Scrappy
|
|
149
159
|
uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
|
150
160
|
uri
|
151
161
|
end
|
162
|
+
|
163
|
+
def clean triples
|
164
|
+
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
165
|
+
end
|
152
166
|
end
|
153
167
|
end
|
data/lib/scrappy/agent/cache.rb
CHANGED
@@ -19,10 +19,13 @@ module Scrappy
|
|
19
19
|
|
20
20
|
class Cache < Hash
|
21
21
|
include MonitorMixin
|
22
|
+
MAX_ELEMENTS = 100
|
22
23
|
|
23
24
|
def expire! timeout
|
24
25
|
synchronize do
|
25
|
-
keys.each { |
|
26
|
+
keys.each { |key| delete(key) if Time.now.to_i - self[key][:time].to_i > timeout }
|
27
|
+
sort_by { |key, value| value[:time].to_i }[0...size-MAX_ELEMENTS].each { |key, value| delete key } if size > MAX_ELEMENTS
|
28
|
+
self
|
26
29
|
end
|
27
30
|
end
|
28
31
|
|
data/lib/scrappy/agent/dumper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrappy
|
2
|
+
class Dumper
|
3
|
+
Mux = Mutex.new
|
4
|
+
|
5
|
+
def self.dump uri, triples, format
|
6
|
+
Mux.synchronize do
|
7
|
+
filename = uri.gsub("http://", "").gsub("https://", "").gsub("/", "-").gsub(".", "_").gsub("?", "+").gsub("&", "+") + ".#{format}"
|
8
|
+
data = RDF::Graph.new(triples).serialize(format)
|
9
|
+
File.open(filename, "w") { |f| f.write data }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/scrappy/selectors/slice.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module SliceSelector
|
2
2
|
def self.filter selector, doc
|
3
3
|
selector.rdf::value.map do |separator|
|
4
|
-
slices = doc[:
|
4
|
+
slices = doc[:value].split(separator)
|
5
5
|
selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
|
6
6
|
end.flatten
|
7
7
|
end
|
data/lib/scrappy/server.rb
CHANGED
@@ -51,10 +51,9 @@ module Scrappy
|
|
51
51
|
def process_request method, format, url
|
52
52
|
callback = @input['callback']
|
53
53
|
response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
|
54
|
-
|
55
54
|
case response.status
|
56
55
|
when :redirect
|
57
|
-
redirect "/#{format}/#{response.uri}#{inputs}"
|
56
|
+
redirect "/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{inputs}"
|
58
57
|
when :ok
|
59
58
|
@headers['Content-Type'] = response.content_type
|
60
59
|
callback ? "#{callback}(#{response.output})" : response.output
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.12"
|
5
|
+
s.version = "0.1.14"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-01-
|
9
|
+
s.date = %q{2011-01-30}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 12
|
9
|
-
version: 0.1.12
|
8
|
+
- 14
|
9
|
+
version: 0.1.14
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-
|
17
|
+
date: 2011-01-30 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -141,6 +141,7 @@ extra_rdoc_files:
|
|
141
141
|
- lib/scrappy/agent/agent.rb
|
142
142
|
- lib/scrappy/agent/blind_agent.rb
|
143
143
|
- lib/scrappy/agent/cache.rb
|
144
|
+
- lib/scrappy/agent/dumper.rb
|
144
145
|
- lib/scrappy/agent/map_reduce.rb
|
145
146
|
- lib/scrappy/agent/extractor.rb
|
146
147
|
- lib/scrappy/agent/visual_agent.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- lib/scrappy/agent/agent.rb
|
171
172
|
- lib/scrappy/agent/blind_agent.rb
|
172
173
|
- lib/scrappy/agent/cache.rb
|
174
|
+
- lib/scrappy/agent/dumper.rb
|
173
175
|
- lib/scrappy/agent/map_reduce.rb
|
174
176
|
- lib/scrappy/agent/extractor.rb
|
175
177
|
- lib/scrappy/agent/visual_agent.rb
|