scrappy 0.1.12 → 0.1.14
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/Manifest +1 -0
- data/bin/scrappy +2 -0
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +30 -16
- data/lib/scrappy/agent/blind_agent.rb +1 -0
- data/lib/scrappy/agent/cache.rb +4 -1
- data/lib/scrappy/agent/dumper.rb +13 -0
- data/lib/scrappy/selectors/slice.rb +1 -1
- data/lib/scrappy/server.rb +1 -2
- data/scrappy.gemspec +4 -4
- metadata +5 -3
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 0.1.14 2011-01-30
|
2
|
+
|
3
|
+
* Added missing file to Manifest
|
4
|
+
|
5
|
+
=== 0.1.13 2011-01-27
|
6
|
+
|
7
|
+
* Improvements on memory consumption
|
8
|
+
* Correction in redirections in the web interface
|
9
|
+
* Dump to disk feature
|
10
|
+
|
1
11
|
=== 0.1.12 2011-01-10
|
2
12
|
|
3
13
|
* Correction for Windows compatibility
|
data/Manifest
CHANGED
data/bin/scrappy
CHANGED
@@ -39,6 +39,7 @@ module Scrappy
|
|
39
39
|
opts.on('-h', '--help') { output_help; exit 0 }
|
40
40
|
opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
|
41
41
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
42
|
+
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
42
43
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
43
44
|
opts.on('-i', '--interactive') { Options.shell = true }
|
44
45
|
opts.on('-s', '--server') { Options.server = true }
|
@@ -101,6 +102,7 @@ Options
|
|
101
102
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
102
103
|
-l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
|
103
104
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
105
|
+
-D, --dump Dumps RDF data to disk
|
104
106
|
-u, --debug Shows debugging traces
|
105
107
|
-i, --interactive Runs interactive shell
|
106
108
|
-s, --server Runs web server
|
data/lib/scrappy.rb
CHANGED
@@ -14,12 +14,13 @@ require 'scrappy/support'
|
|
14
14
|
require 'scrappy/agent/extractor'
|
15
15
|
require 'scrappy/agent/map_reduce'
|
16
16
|
require 'scrappy/agent/cache'
|
17
|
+
require 'scrappy/agent/dumper'
|
17
18
|
require 'scrappy/agent/agent'
|
18
19
|
|
19
20
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
20
21
|
|
21
22
|
module Scrappy
|
22
|
-
VERSION = '0.1.12'
|
23
|
+
VERSION = '0.1.14'
|
23
24
|
end
|
24
25
|
|
25
26
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -44,6 +44,9 @@ module Scrappy
|
|
44
44
|
depth = args[:depth]
|
45
45
|
request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
|
46
46
|
|
47
|
+
# Expire cache
|
48
|
+
cache.expire! 300 # 5 minutes
|
49
|
+
|
47
50
|
# Lookup in cache
|
48
51
|
triples = if cache[request]
|
49
52
|
puts "Retrieving cached #{request[:uri]}...done!" if options.debug
|
@@ -67,15 +70,17 @@ module Scrappy
|
|
67
70
|
puts 'done!' if options.debug
|
68
71
|
|
69
72
|
response = if self.html_data?
|
70
|
-
add_visual_data! if options.referenceable
|
71
|
-
extract self.uri, html, options.referenceable
|
73
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
74
|
+
extraction = extract self.uri, html, options.referenceable # Extract data
|
75
|
+
Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
|
76
|
+
extraction
|
72
77
|
else
|
73
78
|
[]
|
74
79
|
end
|
75
80
|
|
76
81
|
# Cache the request
|
77
|
-
|
78
|
-
|
82
|
+
cache[request] = { :time=>Time.now, :response=>response }
|
83
|
+
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
|
79
84
|
|
80
85
|
response
|
81
86
|
end
|
@@ -102,11 +107,13 @@ module Scrappy
|
|
102
107
|
items.each { |item| queue << item }
|
103
108
|
end
|
104
109
|
end
|
105
|
-
|
106
|
-
triples
|
110
|
+
|
111
|
+
triples unless options.dump
|
107
112
|
end
|
108
113
|
|
109
114
|
def reduce results
|
115
|
+
return [] if options.dump
|
116
|
+
|
110
117
|
if options.debug
|
111
118
|
print "Merging results..."; $stdout.flush
|
112
119
|
end
|
@@ -119,10 +126,7 @@ module Scrappy
|
|
119
126
|
end
|
120
127
|
|
121
128
|
def request args={}
|
122
|
-
|
123
|
-
cache.expire! 300 # 5 minutes
|
124
|
-
|
125
|
-
RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
|
129
|
+
RDF::Graph.new clean(map(args) || [])
|
126
130
|
end
|
127
131
|
|
128
132
|
def proxy args={}
|
@@ -130,13 +134,19 @@ module Scrappy
|
|
130
134
|
|
131
135
|
response = self.request(request)
|
132
136
|
|
133
|
-
if options.debug
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
137
|
+
output = if options.dump
|
138
|
+
""
|
139
|
+
else
|
140
|
+
if options.debug
|
141
|
+
print "Serializing..."; $stdout.flush
|
142
|
+
end
|
143
|
+
|
144
|
+
output = response.serialize request[:format]
|
138
145
|
|
139
|
-
|
146
|
+
puts 'done!'if options.debug
|
147
|
+
|
148
|
+
output
|
149
|
+
end
|
140
150
|
|
141
151
|
OpenStruct.new :output => output,
|
142
152
|
:content_type => ContentTypes[request[:format]] || 'text/plain',
|
@@ -149,5 +159,9 @@ module Scrappy
|
|
149
159
|
uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
|
150
160
|
uri
|
151
161
|
end
|
162
|
+
|
163
|
+
def clean triples
|
164
|
+
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
165
|
+
end
|
152
166
|
end
|
153
167
|
end
|
data/lib/scrappy/agent/cache.rb
CHANGED
@@ -19,10 +19,13 @@ module Scrappy
|
|
19
19
|
|
20
20
|
class Cache < Hash
|
21
21
|
include MonitorMixin
|
22
|
+
MAX_ELEMENTS = 100
|
22
23
|
|
23
24
|
def expire! timeout
|
24
25
|
synchronize do
|
25
|
-
keys.each { |
|
26
|
+
keys.each { |key| delete(key) if Time.now.to_i - self[key][:time].to_i > timeout }
|
27
|
+
sort_by { |key, value| value[:time].to_i }[0...size-MAX_ELEMENTS].each { |key, value| delete key } if size > MAX_ELEMENTS
|
28
|
+
self
|
26
29
|
end
|
27
30
|
end
|
28
31
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Scrappy
|
2
|
+
class Dumper
|
3
|
+
Mux = Mutex.new
|
4
|
+
|
5
|
+
def self.dump uri, triples, format
|
6
|
+
Mux.synchronize do
|
7
|
+
filename = uri.gsub("http://", "").gsub("https://", "").gsub("/", "-").gsub(".", "_").gsub("?", "+").gsub("&", "+") + ".#{format}"
|
8
|
+
data = RDF::Graph.new(triples).serialize(format)
|
9
|
+
File.open(filename, "w") { |f| f.write data }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module SliceSelector
|
2
2
|
def self.filter selector, doc
|
3
3
|
selector.rdf::value.map do |separator|
|
4
|
-
slices = doc[:
|
4
|
+
slices = doc[:value].split(separator)
|
5
5
|
selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
|
6
6
|
end.flatten
|
7
7
|
end
|
data/lib/scrappy/server.rb
CHANGED
@@ -51,10 +51,9 @@ module Scrappy
|
|
51
51
|
def process_request method, format, url
|
52
52
|
callback = @input['callback']
|
53
53
|
response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
|
54
|
-
|
55
54
|
case response.status
|
56
55
|
when :redirect
|
57
|
-
redirect "/#{format}/#{response.uri}#{inputs}"
|
56
|
+
redirect "/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{inputs}"
|
58
57
|
when :ok
|
59
58
|
@headers['Content-Type'] = response.content_type
|
60
59
|
callback ? "#{callback}(#{response.output})" : response.output
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.12"
|
5
|
+
s.version = "0.1.14"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-01-10}
|
9
|
+
s.date = %q{2011-01-30}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 12
|
9
|
-
version: 0.1.12
|
8
|
+
- 14
|
9
|
+
version: 0.1.14
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-01-10 00:00:00 +01:00
|
17
|
+
date: 2011-01-30 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -141,6 +141,7 @@ extra_rdoc_files:
|
|
141
141
|
- lib/scrappy/agent/agent.rb
|
142
142
|
- lib/scrappy/agent/blind_agent.rb
|
143
143
|
- lib/scrappy/agent/cache.rb
|
144
|
+
- lib/scrappy/agent/dumper.rb
|
144
145
|
- lib/scrappy/agent/map_reduce.rb
|
145
146
|
- lib/scrappy/agent/extractor.rb
|
146
147
|
- lib/scrappy/agent/visual_agent.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- lib/scrappy/agent/agent.rb
|
171
172
|
- lib/scrappy/agent/blind_agent.rb
|
172
173
|
- lib/scrappy/agent/cache.rb
|
174
|
+
- lib/scrappy/agent/dumper.rb
|
173
175
|
- lib/scrappy/agent/map_reduce.rb
|
174
176
|
- lib/scrappy/agent/extractor.rb
|
175
177
|
- lib/scrappy/agent/visual_agent.rb
|