scrappy 0.1.24 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest +1 -0
- data/README.rdoc +42 -2
- data/Rakefile +1 -1
- data/bin/scrappy +40 -12
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +112 -35
- data/lib/scrappy/agent/extractor.rb +28 -24
- data/lib/scrappy/repository.rb +34 -0
- data/scrappy.gemspec +10 -7
- metadata +24 -8
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -30,6 +30,7 @@ lib/scrappy/server/public/images/logo_small.png
|
|
30
30
|
lib/scrappy/server/public/stylesheets/application.css
|
31
31
|
lib/scrappy/server/views/home.haml
|
32
32
|
lib/scrappy/server/views/help.haml
|
33
|
+
lib/scrappy/repository.rb
|
33
34
|
lib/scrappy/shell.rb
|
34
35
|
lib/scrappy/support.rb
|
35
36
|
lib/scrappy/webkit/webkit.rb
|
data/README.rdoc
CHANGED
@@ -138,8 +138,43 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
138
138
|
titles = output.find([], Node('dc:title'), nil)
|
139
139
|
titles.each { |title| puts title }
|
140
140
|
|
141
|
-
|
141
|
+
* RDF repository:
|
142
|
+
|
143
|
+
Sesame functionality has been included in Scrappy. You can configure
|
144
|
+
the repository options by editing the file config.yml placed in the folder .scrappy, in your home dir.
|
145
|
+
An example of this file can be found at the end of this README.
|
146
|
+
|
147
|
+
You can get the data for a certain period of time, by using the time (-t, --time) option:
|
148
|
+
|
149
|
+
$ scrappy -g example.org -t 3
|
150
|
+
|
151
|
+
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
152
|
+
|
153
|
+
* Sample config.yml
|
142
154
|
|
155
|
+
# This is a sample configuration file, with the options to communicate with Sesame using Scrappy
|
156
|
+
repository:
|
157
|
+
# The host where Sesame is running. Do not add the trailing '/'
|
158
|
+
host: http://localhost
|
159
|
+
|
160
|
+
# The port for the connection
|
161
|
+
port: 8080
|
162
|
+
|
163
|
+
# The time to consider the data in the repository valid, in minutes
|
164
|
+
time: 15
|
165
|
+
|
166
|
+
# The name of the repository
|
167
|
+
repository: memory
|
168
|
+
|
169
|
+
# The format to communicate with the repository
|
170
|
+
format: ntriples
|
171
|
+
|
172
|
+
# You can use any of the following formats:
|
173
|
+
# rdfxml, ntriples, turtle, n3, trix, trig
|
174
|
+
|
175
|
+
|
176
|
+
== INSTALL:
|
177
|
+
|
143
178
|
Install it as any other gem:
|
144
179
|
|
145
180
|
$ gem install scrappy
|
@@ -153,10 +188,15 @@ Additionally, some extra libraries are needed for certain features:
|
|
153
188
|
|
154
189
|
* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
155
190
|
|
191
|
+
In order to use Sesame, you will need to install it. Further instructions can be found
|
192
|
+
on the openRDF website, more precisely at http://www.openrdf.org/doc/sesame2/users/ch06.html.
|
193
|
+
|
156
194
|
== CONTRIBUTORS:
|
157
195
|
|
158
196
|
* José Ignacio Fernández
|
159
197
|
|
198
|
+
* Alberto Mardomingo
|
199
|
+
|
160
200
|
* Jacobo Blasco
|
161
201
|
|
162
202
|
== LICENSE:
|
@@ -182,4 +222,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
182
222
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
183
223
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
184
224
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
185
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
225
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.1'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
# encoding: UTF-8
|
3
3
|
|
4
|
-
|
4
|
+
require 'rbconfig'
|
5
|
+
WINDOWS_PLATFORM = Config::CONFIG['host_os'] =~ /mswin|mingw/
|
6
|
+
|
7
|
+
if !WINDOWS_PLATFORM
|
5
8
|
stty_save = `stty -g`.chomp
|
6
9
|
trap('INT') { system('stty', stty_save); Scrappy::App.quit }
|
7
10
|
end
|
@@ -31,8 +34,8 @@ module Scrappy
|
|
31
34
|
OptionParser.new do |opts|
|
32
35
|
opts.on('-v', '--version') { output_version; exit 0 }
|
33
36
|
opts.on('-h', '--help') { output_help; exit 0 }
|
34
|
-
opts.on('-g
|
35
|
-
opts.on('-p
|
37
|
+
opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
|
38
|
+
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
36
39
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
37
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
38
41
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
@@ -47,15 +50,19 @@ module Scrappy
|
|
47
50
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
48
51
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
49
52
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
53
|
+
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
54
|
+
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
50
55
|
end.parse!(args)
|
51
56
|
@file = args.shift
|
52
57
|
end
|
53
58
|
|
54
59
|
def run
|
55
60
|
onload
|
56
|
-
if Options.
|
61
|
+
if Options.uri
|
57
62
|
Options.quiet = true
|
58
|
-
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.
|
63
|
+
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
|
+
elsif Options.observe
|
65
|
+
Agent.create.observe(Options.observe)
|
59
66
|
elsif Options.proxy
|
60
67
|
puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
|
61
68
|
require 'scrappy/server/proxy'
|
@@ -105,10 +112,12 @@ Options
|
|
105
112
|
-D, --dump Dumps RDF data to disk
|
106
113
|
-u, --debug Shows debugging traces
|
107
114
|
-i, --interactive Runs interactive shell
|
115
|
+
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
108
116
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
109
117
|
-S, --proxy-server Runs web proxy
|
110
118
|
-P, --port PORT Selects port number (default is 3434)
|
111
119
|
-V, --visual Uses visual agent (slow)
|
120
|
+
-t, --time DAYS Returns repository data from the last given minutes
|
112
121
|
-r, --reference Outputs referenceable data
|
113
122
|
-R, --reference-all Outputs all HTML referenceable data
|
114
123
|
-w, --window Shows browser window (requires -v)
|
@@ -127,15 +136,23 @@ Copyright
|
|
127
136
|
|
128
137
|
def onload
|
129
138
|
# Check local or global knowledge base
|
130
|
-
home =
|
139
|
+
home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
|
140
|
+
|
141
|
+
data_dirname = "kb"
|
142
|
+
cache_dirname = "cache"
|
143
|
+
cache_filename = "scrappy-#{Scrappy::VERSION}.kb"
|
144
|
+
config_filename = "config.yml"
|
131
145
|
|
132
|
-
if File.exists?(
|
133
|
-
data_folder
|
134
|
-
|
146
|
+
if File.exists?(File.join(home, data_dirname))
|
147
|
+
data_folder = File.join home, data_dirname
|
148
|
+
cache_folder = File.join home, cache_dirname
|
135
149
|
else
|
136
|
-
data_folder
|
137
|
-
|
150
|
+
data_folder = File.join Scrappy::Root, data_dirname
|
151
|
+
cache_folder = Dir.tmpdir
|
138
152
|
end
|
153
|
+
Dir.mkdir cache_folder if Dir[cache_folder].empty?
|
154
|
+
cache_file = File.join cache_folder, cache_filename
|
155
|
+
config_file = File.join home, config_filename
|
139
156
|
|
140
157
|
# Load knowledge base
|
141
158
|
Agent::Options.kb = if File.exists?(cache_file) and File.mtime(cache_file) >= Dir["#{data_folder}/*", data_folder].map{ |f| File.mtime(f) }.max
|
@@ -143,7 +160,7 @@ Copyright
|
|
143
160
|
open(cache_file) { |f| Marshal.load(f) }
|
144
161
|
else
|
145
162
|
# Load YARF files and cache kb
|
146
|
-
data = Dir[
|
163
|
+
data = Dir[File.join(data_folder, "*")].inject(RDF::Graph.new) do |kb, file|
|
147
164
|
extension = file.split('.').last.to_sym
|
148
165
|
graph = RDF::Parser.parse(extension, open(file).read)
|
149
166
|
kb.ns.merge! graph.ns
|
@@ -153,6 +170,17 @@ Copyright
|
|
153
170
|
open(cache_file, "w") { |f| Marshal.dump(data, f) }
|
154
171
|
data
|
155
172
|
end
|
173
|
+
|
174
|
+
# Looks for a configuration file. If it does not exist, Scrappy does not use Sesame
|
175
|
+
# It looks for it in the home .scrappy dir
|
176
|
+
if File.exist?(config_file)
|
177
|
+
config = YAML::load_file(config_file)["repository"]
|
178
|
+
# Convert the strings from the YAML file into symbols
|
179
|
+
repository_options = {}
|
180
|
+
config.each { |k,v| repository_options[k.to_sym] = v }
|
181
|
+
Agent::Options.repository = Repository.new repository_options
|
182
|
+
end
|
183
|
+
|
156
184
|
RDF::ID.ns.merge! Agent::Options.kb.ns
|
157
185
|
end
|
158
186
|
end
|
data/lib/scrappy.rb
CHANGED
@@ -10,6 +10,7 @@ require 'tmpdir'
|
|
10
10
|
require 'lightrdf'
|
11
11
|
|
12
12
|
require 'scrappy/support'
|
13
|
+
require 'scrappy/repository'
|
13
14
|
|
14
15
|
require 'scrappy/agent/extractor'
|
15
16
|
require 'scrappy/agent/map_reduce'
|
@@ -21,7 +22,7 @@ require 'scrappy/agent/agent'
|
|
21
22
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
22
23
|
|
23
24
|
module Scrappy
|
24
|
-
VERSION = '0.
|
25
|
+
VERSION = '0.2.0'
|
25
26
|
end
|
26
27
|
|
27
28
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -38,6 +38,7 @@ module Scrappy
|
|
38
38
|
Agent.pool[@id] = self
|
39
39
|
@kb = args[:kb] || Options.kb
|
40
40
|
@options = Options.clone
|
41
|
+
@repository = args[:repository] || Options.repository
|
41
42
|
end
|
42
43
|
|
43
44
|
def map args, queue=nil
|
@@ -52,51 +53,35 @@ module Scrappy
|
|
52
53
|
puts "Retrieving cached #{request[:uri]}...done!" if options.debug
|
53
54
|
|
54
55
|
cache[request][:response]
|
56
|
+
elsif @repository
|
57
|
+
# Extracts from the repository
|
58
|
+
request_from_repository(request)
|
55
59
|
else
|
56
60
|
# Perform the request
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
if options.debug
|
61
|
-
print "Opening #{request[:uri]}..."; $stdout.flush
|
62
|
-
end
|
63
|
-
|
64
|
-
if request[:method] == :get
|
65
|
-
self.uri = request[:uri]
|
66
|
-
else
|
67
|
-
raise Exception, 'POST requests not supported yet'
|
68
|
-
end
|
69
|
-
|
70
|
-
puts 'done!' if options.debug
|
71
|
-
|
72
|
-
response = if self.html_data?
|
73
|
-
add_visual_data! if options.referenceable # Adds tags including visual information
|
74
|
-
extraction = extract self.uri, html, options.referenceable # Extract data
|
75
|
-
Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
|
76
|
-
extraction
|
77
|
-
else
|
78
|
-
[]
|
79
|
-
end
|
61
|
+
request_uncached(request)
|
62
|
+
end
|
80
63
|
|
64
|
+
# If previous cache exists, do not cache it again
|
65
|
+
unless cache[request]
|
81
66
|
# Cache the request
|
82
|
-
cache[request] = { :time=>Time.now, :response=>
|
83
|
-
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>
|
84
|
-
|
85
|
-
response
|
67
|
+
cache[request] = { :time=>Time.now, :response=>triples }
|
68
|
+
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>triples } if self.uri
|
86
69
|
end
|
87
70
|
|
88
71
|
# Enqueue subresources
|
89
72
|
# Pages are enqueued without reducing depth
|
90
|
-
pages = triples.select { |s,p,o| p==
|
73
|
+
pages = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
|
91
74
|
|
92
75
|
# All other URIS are enqueued with depth reduced
|
93
76
|
uris = if depth != 0
|
94
|
-
(triples.map { |s, p, o| [s,o] }.flatten - [
|
77
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [ID(self.uri)] - pages).select{|n| n.is_a?(Symbol)}
|
95
78
|
else
|
96
79
|
[]
|
97
80
|
end
|
98
81
|
|
99
|
-
items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
|
82
|
+
items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
|
83
|
+
uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
|
84
|
+
uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
|
100
85
|
|
101
86
|
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
102
87
|
|
@@ -120,7 +105,7 @@ module Scrappy
|
|
120
105
|
|
121
106
|
puts 'done!'if options.debug
|
122
107
|
|
123
|
-
triples
|
108
|
+
triples.uniq
|
124
109
|
end
|
125
110
|
|
126
111
|
def request args={}
|
@@ -139,7 +124,7 @@ module Scrappy
|
|
139
124
|
print "Serializing..."; $stdout.flush
|
140
125
|
end
|
141
126
|
|
142
|
-
output = response.serialize request[:format],
|
127
|
+
output = response.serialize request[:format], options.format_header
|
143
128
|
|
144
129
|
puts 'done!'if options.debug
|
145
130
|
|
@@ -152,14 +137,106 @@ module Scrappy
|
|
152
137
|
:status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
|
153
138
|
end
|
154
139
|
|
140
|
+
# Method to observe several webs, and extract the data periodically
|
141
|
+
def observe uris
|
142
|
+
while true
|
143
|
+
time_init = Time.now.to_i
|
144
|
+
uris.each do |uri|
|
145
|
+
puts "Pinging #{uri}..."
|
146
|
+
request :uri=>uri
|
147
|
+
end
|
148
|
+
time = options.repository.time * 60 - (Time.now.to_i - time_init)
|
149
|
+
puts "Sleeping until #{Time.now + time}..."
|
150
|
+
sleep time
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
155
|
def complete_uri uri
|
156
156
|
uri = "#{uri}.com" if uri =~ /\A\w+\Z/
|
157
|
-
uri = "http://#{uri}"
|
157
|
+
uri = "http://#{uri}" unless uri =~ /\A\w*:/
|
158
158
|
uri
|
159
159
|
end
|
160
|
-
|
160
|
+
|
161
161
|
def clean triples
|
162
|
-
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
162
|
+
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
163
|
+
end
|
164
|
+
|
165
|
+
# Do the extraction using RDF repository
|
166
|
+
def request_from_repository request
|
167
|
+
triples = []
|
168
|
+
|
169
|
+
# Checks if there is any previous extraction within the time window (Options.time if set, otherwise the repository's default)
|
170
|
+
contexts = if Options.time
|
171
|
+
@repository.recent_contexts(request[:uri], Options.time)
|
172
|
+
else
|
173
|
+
@repository.recent_contexts(request[:uri])
|
174
|
+
end
|
175
|
+
|
176
|
+
if contexts.empty?
|
177
|
+
# Extracts data from the uri
|
178
|
+
triples = request_uncached request
|
179
|
+
|
180
|
+
if options.debug
|
181
|
+
print "Storing into repository #{request[:uri]}..."; $stdout.flush
|
182
|
+
end
|
183
|
+
|
184
|
+
# Checks if the extraction returned something
|
185
|
+
graph = if triples.empty?
|
186
|
+
# Creates a triple to indicate that nothing was extracted from the uri
|
187
|
+
# This is done because otherwise the context wouldn't be stored
|
188
|
+
RDF::Graph.new [ [ID(request[:uri]), ID("sc:extraction"), ID("sc:Empty")] ]
|
189
|
+
else
|
190
|
+
RDF::Graph.new triples.uniq
|
191
|
+
end
|
192
|
+
|
193
|
+
# Adds data to sesame
|
194
|
+
@repository.data = graph, "#{request[:uri]}:#{Time.now.to_i}"
|
195
|
+
@repository.data = graph, "#{self.uri}:#{Time.now.to_i}" if self.uri
|
196
|
+
|
197
|
+
puts 'done!' if options.debug
|
198
|
+
|
199
|
+
triples
|
200
|
+
else
|
201
|
+
# Data found in repository. Asking for it
|
202
|
+
triples = []
|
203
|
+
if options.debug
|
204
|
+
print "Retrieving from repository #{request[:uri]}..."; $stdout.flush
|
205
|
+
end
|
206
|
+
contexts.each do |context|
|
207
|
+
graph = @repository.data(context)
|
208
|
+
triples += graph.triples.select{|s,p,o| p!=ID("sc:extraction")}
|
209
|
+
end
|
210
|
+
puts 'done!' if options.debug
|
211
|
+
|
212
|
+
triples
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
# Extracts from the uri
|
217
|
+
def request_uncached request
|
218
|
+
sleep 0.001 * options.delay.to_f # Sleep if requested
|
219
|
+
|
220
|
+
if options.debug
|
221
|
+
print "Opening #{request[:uri]}..."; $stdout.flush
|
222
|
+
end
|
223
|
+
|
224
|
+
if request[:method] == :get
|
225
|
+
self.uri = request[:uri]
|
226
|
+
else
|
227
|
+
raise Exception, 'POST requests not supported yet'
|
228
|
+
end
|
229
|
+
|
230
|
+
puts 'done!' if options.debug
|
231
|
+
|
232
|
+
if self.html_data?
|
233
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
234
|
+
triples = extract(self.uri, html, options.referenceable) # Extract data
|
235
|
+
Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
|
236
|
+
triples
|
237
|
+
else
|
238
|
+
[]
|
239
|
+
end
|
163
240
|
end
|
164
241
|
end
|
165
242
|
end
|
@@ -6,7 +6,7 @@ module Scrappy
|
|
6
6
|
if options.debug
|
7
7
|
print "Extracting #{uri}..."; $stdout.flush
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
@selector_pool ||= {}
|
11
11
|
triples = []
|
12
12
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
@@ -27,7 +27,11 @@ module Scrappy
|
|
27
27
|
|
28
28
|
puts "done!" if options.debug
|
29
29
|
|
30
|
-
triples
|
30
|
+
triples.map do |s,p,o|
|
31
|
+
[ s.is_a?(RDF::Node) ? s.id : s,
|
32
|
+
p.is_a?(RDF::Node) ? p.id : p,
|
33
|
+
o.is_a?(RDF::Node) ? o.id : o ]
|
34
|
+
end
|
31
35
|
end
|
32
36
|
|
33
37
|
private
|
@@ -150,32 +154,32 @@ module Scrappy
|
|
150
154
|
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
|
151
155
|
|
152
156
|
content.search('*').each do |node|
|
157
|
+
next if node.text?
|
158
|
+
|
153
159
|
fragment = Node(node_hash(uri, node.path))
|
154
|
-
|
160
|
+
|
155
161
|
if referenceable == :dump or resources[fragment]
|
156
|
-
selector
|
162
|
+
selector = Node(nil)
|
157
163
|
presentation = Node(nil)
|
158
164
|
|
159
|
-
selector
|
160
|
-
selector
|
161
|
-
selector
|
162
|
-
selector
|
163
|
-
|
164
|
-
presentation
|
165
|
-
presentation
|
166
|
-
presentation
|
167
|
-
presentation
|
168
|
-
presentation
|
169
|
-
presentation
|
170
|
-
presentation
|
171
|
-
presentation
|
172
|
-
presentation
|
173
|
-
presentation
|
174
|
-
|
175
|
-
fragment
|
176
|
-
fragment
|
177
|
-
|
178
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples)
|
165
|
+
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
166
|
+
triples << [selector, ID('sc:path'), node.path.to_s]
|
167
|
+
triples << [selector, ID('sc:tag'), node.name.to_s]
|
168
|
+
triples << [selector, ID('sc:document'), uri]
|
169
|
+
|
170
|
+
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
171
|
+
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
172
|
+
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
173
|
+
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
174
|
+
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
175
|
+
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
176
|
+
triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
|
177
|
+
triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
|
178
|
+
triples << [presentation, ID('sc:text'), node.text.strip]
|
179
|
+
triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
|
180
|
+
|
181
|
+
triples << [fragment, ID('sc:selector'), selector]
|
182
|
+
triples << [fragment, ID('sc:presentation'), presentation]
|
179
183
|
end
|
180
184
|
end
|
181
185
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Scrappy
|
2
|
+
class Repository < RDF::Repository
|
3
|
+
# Processes the list of contexts, checks if there is any extraction
|
4
|
+
# from the last given number of seconds, and returns an array with them.
|
5
|
+
# If there is not any extraction, returns an empty array
|
6
|
+
def recent_contexts uri, seconds=@options[:time].to_i*60
|
7
|
+
return [] unless uri
|
8
|
+
contexts.select do |context|
|
9
|
+
date = context_date(context)
|
10
|
+
date and check_date(date, seconds) and context_uri(context) == uri
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def time
|
15
|
+
@options[:time]
|
16
|
+
end
|
17
|
+
|
18
|
+
protected
|
19
|
+
# Checks if the context date is within the indicated time
|
20
|
+
def check_date date, seconds
|
21
|
+
(Time.now.to_i - date) <= seconds
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns an integer with the date of a given context
|
25
|
+
def context_date context
|
26
|
+
$1.to_i if context =~ /:(\d+)\Z/
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns the URI of a context
|
30
|
+
def context_uri context
|
31
|
+
$1 if context =~ /\A(.*):(\d+)\Z/
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-09}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
@@ -32,8 +32,9 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.1"])
|
36
36
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
|
+
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
37
38
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
39
|
else
|
39
40
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
@@ -41,8 +42,9 @@ Gem::Specification.new do |s|
|
|
41
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
42
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.2.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
|
45
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
|
+
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
46
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
47
49
|
end
|
48
50
|
else
|
@@ -51,8 +53,9 @@ Gem::Specification.new do |s|
|
|
51
53
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
52
54
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
55
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.2.
|
56
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
|
55
57
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
58
|
+
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
56
59
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
57
60
|
end
|
58
61
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-09 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,8 +97,8 @@ dependencies:
|
|
97
97
|
segments:
|
98
98
|
- 0
|
99
99
|
- 2
|
100
|
-
-
|
101
|
-
version: 0.2.
|
100
|
+
- 1
|
101
|
+
version: 0.2.1
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|
@@ -116,9 +116,23 @@ dependencies:
|
|
116
116
|
type: :runtime
|
117
117
|
version_requirements: *id007
|
118
118
|
- !ruby/object:Gem::Dependency
|
119
|
-
name:
|
119
|
+
name: rest-client
|
120
120
|
prerelease: false
|
121
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - ">="
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
segments:
|
126
|
+
- 1
|
127
|
+
- 6
|
128
|
+
- 1
|
129
|
+
version: 1.6.1
|
130
|
+
type: :runtime
|
131
|
+
version_requirements: *id008
|
132
|
+
- !ruby/object:Gem::Dependency
|
133
|
+
name: haml
|
134
|
+
prerelease: false
|
135
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
122
136
|
requirements:
|
123
137
|
- - ">="
|
124
138
|
- !ruby/object:Gem::Version
|
@@ -128,7 +142,7 @@ dependencies:
|
|
128
142
|
- 24
|
129
143
|
version: 3.0.24
|
130
144
|
type: :runtime
|
131
|
-
version_requirements: *
|
145
|
+
version_requirements: *id009
|
132
146
|
description: RDF web scraper
|
133
147
|
email: joseignacio.fernandez@gmail.com
|
134
148
|
executables:
|
@@ -164,6 +178,7 @@ extra_rdoc_files:
|
|
164
178
|
- lib/scrappy/server/public/stylesheets/application.css
|
165
179
|
- lib/scrappy/server/views/home.haml
|
166
180
|
- lib/scrappy/server/views/help.haml
|
181
|
+
- lib/scrappy/repository.rb
|
167
182
|
- lib/scrappy/shell.rb
|
168
183
|
- lib/scrappy/support.rb
|
169
184
|
- lib/scrappy/webkit/webkit.rb
|
@@ -200,6 +215,7 @@ files:
|
|
200
215
|
- lib/scrappy/server/public/stylesheets/application.css
|
201
216
|
- lib/scrappy/server/views/home.haml
|
202
217
|
- lib/scrappy/server/views/help.haml
|
218
|
+
- lib/scrappy/repository.rb
|
203
219
|
- lib/scrappy/shell.rb
|
204
220
|
- lib/scrappy/support.rb
|
205
221
|
- lib/scrappy/webkit/webkit.rb
|