scrappy 0.1.24 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest +1 -0
- data/README.rdoc +42 -2
- data/Rakefile +1 -1
- data/bin/scrappy +40 -12
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +112 -35
- data/lib/scrappy/agent/extractor.rb +28 -24
- data/lib/scrappy/repository.rb +34 -0
- data/scrappy.gemspec +10 -7
- metadata +24 -8
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -30,6 +30,7 @@ lib/scrappy/server/public/images/logo_small.png
|
|
30
30
|
lib/scrappy/server/public/stylesheets/application.css
|
31
31
|
lib/scrappy/server/views/home.haml
|
32
32
|
lib/scrappy/server/views/help.haml
|
33
|
+
lib/scrappy/repository.rb
|
33
34
|
lib/scrappy/shell.rb
|
34
35
|
lib/scrappy/support.rb
|
35
36
|
lib/scrappy/webkit/webkit.rb
|
data/README.rdoc
CHANGED
@@ -138,8 +138,43 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
138
138
|
titles = output.find([], Node('dc:title'), nil)
|
139
139
|
titles.each { |title| puts title }
|
140
140
|
|
141
|
-
|
141
|
+
* RDF repository:
|
142
|
+
|
143
|
+
Sesame functionality has been included in Scrappy. You can configure
|
144
|
+
the repository options by editing the file config.yml placed in the folder .scrappy, in your home dir.
|
145
|
+
An example of this file can be found at the end of this README.
|
146
|
+
|
147
|
+
You can get the data for a certain period of time by using the time (-t, --time) option:
|
148
|
+
|
149
|
+
$ scrappy -g example.org -t 3
|
150
|
+
|
151
|
+
This would get all the data stored in Sesame for example.org in the last 3 minutes.
|
152
|
+
|
153
|
+
* Sample config.yml
|
142
154
|
|
155
|
+
# This is a sample configuration file, with the options to communicate with Sesame using Scrappy
|
156
|
+
repository:
|
157
|
+
# The host where Sesame is. Do not add the trailing '/'
|
158
|
+
host: http://localhost
|
159
|
+
|
160
|
+
# The port for the connection
|
161
|
+
port: 8080
|
162
|
+
|
163
|
+
# The time to consider the data in the repository valid, in minutes
|
164
|
+
time: 15
|
165
|
+
|
166
|
+
# The name of the repository
|
167
|
+
repository: memory
|
168
|
+
|
169
|
+
# The format to communicate with the repository
|
170
|
+
format: ntriples
|
171
|
+
|
172
|
+
# You can use any of the following formats:
|
173
|
+
# rdfxml, ntriples, turtle, n3, trix, trig
|
174
|
+
|
175
|
+
|
176
|
+
== INSTALL:
|
177
|
+
|
143
178
|
Install it as any other gem:
|
144
179
|
|
145
180
|
$ gem install scrappy
|
@@ -153,10 +188,15 @@ Additionally, some extra libraries are needed for certain features:
|
|
153
188
|
|
154
189
|
* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
155
190
|
|
191
|
+
In order to use Sesame, you will need to install it. Further instructions can be found
|
192
|
+
in the openRDF website, more precisely, in http://www.openrdf.org/doc/sesame2/users/ch06.html .
|
193
|
+
|
156
194
|
== CONTRIBUTORS:
|
157
195
|
|
158
196
|
* José Ignacio Fernández
|
159
197
|
|
198
|
+
* Alberto Mardomingo
|
199
|
+
|
160
200
|
* Jacobo Blasco
|
161
201
|
|
162
202
|
== LICENSE:
|
@@ -182,4 +222,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
182
222
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
183
223
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
184
224
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
185
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
225
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.2.1'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
# encoding: UTF-8
|
3
3
|
|
4
|
-
|
4
|
+
require 'rbconfig'
|
5
|
+
WINDOWS_PLATFORM = Config::CONFIG['host_os'] =~ /mswin|mingw/
|
6
|
+
|
7
|
+
if !WINDOWS_PLATFORM
|
5
8
|
stty_save = `stty -g`.chomp
|
6
9
|
trap('INT') { system('stty', stty_save); Scrappy::App.quit }
|
7
10
|
end
|
@@ -31,8 +34,8 @@ module Scrappy
|
|
31
34
|
OptionParser.new do |opts|
|
32
35
|
opts.on('-v', '--version') { output_version; exit 0 }
|
33
36
|
opts.on('-h', '--help') { output_help; exit 0 }
|
34
|
-
opts.on('-g
|
35
|
-
opts.on('-p
|
37
|
+
opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
|
38
|
+
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
36
39
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
37
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
38
41
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
@@ -47,15 +50,19 @@ module Scrappy
|
|
47
50
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
48
51
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
49
52
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
53
|
+
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
54
|
+
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
50
55
|
end.parse!(args)
|
51
56
|
@file = args.shift
|
52
57
|
end
|
53
58
|
|
54
59
|
def run
|
55
60
|
onload
|
56
|
-
if Options.
|
61
|
+
if Options.uri
|
57
62
|
Options.quiet = true
|
58
|
-
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.
|
63
|
+
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
|
+
elsif Options.observe
|
65
|
+
Agent.create.observe(Options.observe)
|
59
66
|
elsif Options.proxy
|
60
67
|
puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
|
61
68
|
require 'scrappy/server/proxy'
|
@@ -105,10 +112,12 @@ Options
|
|
105
112
|
-D, --dump Dumps RDF data to disk
|
106
113
|
-u, --debug Shows debugging traces
|
107
114
|
-i, --interactive Runs interactive shell
|
115
|
+
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
108
116
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
109
117
|
-S, --proxy-server Runs web proxy
|
110
118
|
-P, --port PORT Selects port number (default is 3434)
|
111
119
|
-V, --visual Uses visual agent (slow)
|
120
|
+
-t, --time MINUTES   Returns repository data from the last given minutes
|
112
121
|
-r, --reference Outputs referenceable data
|
113
122
|
-R, --reference-all Outputs all HTML referenceable data
|
114
123
|
-w, --window Shows browser window (requires -v)
|
@@ -127,15 +136,23 @@ Copyright
|
|
127
136
|
|
128
137
|
def onload
|
129
138
|
# Check local or global knowledge base
|
130
|
-
home =
|
139
|
+
home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
|
140
|
+
|
141
|
+
data_dirname = "kb"
|
142
|
+
cache_dirname = "cache"
|
143
|
+
cache_filename = "scrappy-#{Scrappy::VERSION}.kb"
|
144
|
+
config_filename = "config.yml"
|
131
145
|
|
132
|
-
if File.exists?(
|
133
|
-
data_folder
|
134
|
-
|
146
|
+
if File.exists?(File.join(home, data_dirname))
|
147
|
+
data_folder = File.join home, data_dirname
|
148
|
+
cache_folder = File.join home, cache_dirname
|
135
149
|
else
|
136
|
-
data_folder
|
137
|
-
|
150
|
+
data_folder = File.join Scrappy::Root, data_dirname
|
151
|
+
cache_folder = Dir.tmpdir
|
138
152
|
end
|
153
|
+
Dir.mkdir cache_folder if Dir[cache_folder].empty?
|
154
|
+
cache_file = File.join cache_folder, cache_filename
|
155
|
+
config_file = File.join home, config_filename
|
139
156
|
|
140
157
|
# Load knowledge base
|
141
158
|
Agent::Options.kb = if File.exists?(cache_file) and File.mtime(cache_file) >= Dir["#{data_folder}/*", data_folder].map{ |f| File.mtime(f) }.max
|
@@ -143,7 +160,7 @@ Copyright
|
|
143
160
|
open(cache_file) { |f| Marshal.load(f) }
|
144
161
|
else
|
145
162
|
# Load YARF files and cache kb
|
146
|
-
data = Dir[
|
163
|
+
data = Dir[File.join(data_folder, "*")].inject(RDF::Graph.new) do |kb, file|
|
147
164
|
extension = file.split('.').last.to_sym
|
148
165
|
graph = RDF::Parser.parse(extension, open(file).read)
|
149
166
|
kb.ns.merge! graph.ns
|
@@ -153,6 +170,17 @@ Copyright
|
|
153
170
|
open(cache_file, "w") { |f| Marshal.dump(data, f) }
|
154
171
|
data
|
155
172
|
end
|
173
|
+
|
174
|
+
# Looks for a configuration file. If it does not exist, Scrappy does not use Sesame
|
175
|
+
# It looks for it in the home .scrappy dir
|
176
|
+
if File.exist?(config_file)
|
177
|
+
config = YAML::load_file(config_file)["repository"]
|
178
|
+
# Convert the strings from the YAML file into symbols
|
179
|
+
repository_options = {}
|
180
|
+
config.each { |k,v| repository_options[k.to_sym] = v }
|
181
|
+
Agent::Options.repository = Repository.new repository_options
|
182
|
+
end
|
183
|
+
|
156
184
|
RDF::ID.ns.merge! Agent::Options.kb.ns
|
157
185
|
end
|
158
186
|
end
|
data/lib/scrappy.rb
CHANGED
@@ -10,6 +10,7 @@ require 'tmpdir'
|
|
10
10
|
require 'lightrdf'
|
11
11
|
|
12
12
|
require 'scrappy/support'
|
13
|
+
require 'scrappy/repository'
|
13
14
|
|
14
15
|
require 'scrappy/agent/extractor'
|
15
16
|
require 'scrappy/agent/map_reduce'
|
@@ -21,7 +22,7 @@ require 'scrappy/agent/agent'
|
|
21
22
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
22
23
|
|
23
24
|
module Scrappy
|
24
|
-
VERSION = '0.
|
25
|
+
VERSION = '0.2.0'
|
25
26
|
end
|
26
27
|
|
27
28
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -38,6 +38,7 @@ module Scrappy
|
|
38
38
|
Agent.pool[@id] = self
|
39
39
|
@kb = args[:kb] || Options.kb
|
40
40
|
@options = Options.clone
|
41
|
+
@repository = args[:repository] || Options.repository
|
41
42
|
end
|
42
43
|
|
43
44
|
def map args, queue=nil
|
@@ -52,51 +53,35 @@ module Scrappy
|
|
52
53
|
puts "Retrieving cached #{request[:uri]}...done!" if options.debug
|
53
54
|
|
54
55
|
cache[request][:response]
|
56
|
+
elsif @repository
|
57
|
+
# Extracts from the repository
|
58
|
+
request_from_repository(request)
|
55
59
|
else
|
56
60
|
# Perform the request
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
if options.debug
|
61
|
-
print "Opening #{request[:uri]}..."; $stdout.flush
|
62
|
-
end
|
63
|
-
|
64
|
-
if request[:method] == :get
|
65
|
-
self.uri = request[:uri]
|
66
|
-
else
|
67
|
-
raise Exception, 'POST requests not supported yet'
|
68
|
-
end
|
69
|
-
|
70
|
-
puts 'done!' if options.debug
|
71
|
-
|
72
|
-
response = if self.html_data?
|
73
|
-
add_visual_data! if options.referenceable # Adds tags including visual information
|
74
|
-
extraction = extract self.uri, html, options.referenceable # Extract data
|
75
|
-
Dumper.dump self.uri, clean(extraction), options.format if options.dump # Dump results to disk
|
76
|
-
extraction
|
77
|
-
else
|
78
|
-
[]
|
79
|
-
end
|
61
|
+
request_uncached(request)
|
62
|
+
end
|
80
63
|
|
64
|
+
# If previous cache exists, do not cache it again
|
65
|
+
unless cache[request]
|
81
66
|
# Cache the request
|
82
|
-
cache[request] = { :time=>Time.now, :response=>
|
83
|
-
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>
|
84
|
-
|
85
|
-
response
|
67
|
+
cache[request] = { :time=>Time.now, :response=>triples }
|
68
|
+
cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>triples } if self.uri
|
86
69
|
end
|
87
70
|
|
88
71
|
# Enqueue subresources
|
89
72
|
# Pages are enqueued without reducing depth
|
90
|
-
pages = triples.select { |s,p,o| p==
|
73
|
+
pages = triples.select { |s,p,o| p==ID("rdf:type") and o==ID("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(Symbol)}
|
91
74
|
|
92
75
|
# All other URIS are enqueued with depth reduced
|
93
76
|
uris = if depth != 0
|
94
|
-
(triples.map { |s, p, o| [s,o] }.flatten - [
|
77
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [ID(self.uri)] - pages).select{|n| n.is_a?(Symbol)}
|
95
78
|
else
|
96
79
|
[]
|
97
80
|
end
|
98
81
|
|
99
|
-
items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
|
82
|
+
items = ( pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } +
|
83
|
+
uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} } ).
|
84
|
+
uniq.select{ |item| !RDF::ID.bnode?(item[:uri]) }
|
100
85
|
|
101
86
|
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
102
87
|
|
@@ -120,7 +105,7 @@ module Scrappy
|
|
120
105
|
|
121
106
|
puts 'done!'if options.debug
|
122
107
|
|
123
|
-
triples
|
108
|
+
triples.uniq
|
124
109
|
end
|
125
110
|
|
126
111
|
def request args={}
|
@@ -139,7 +124,7 @@ module Scrappy
|
|
139
124
|
print "Serializing..."; $stdout.flush
|
140
125
|
end
|
141
126
|
|
142
|
-
output = response.serialize request[:format],
|
127
|
+
output = response.serialize request[:format], options.format_header
|
143
128
|
|
144
129
|
puts 'done!'if options.debug
|
145
130
|
|
@@ -152,14 +137,106 @@ module Scrappy
|
|
152
137
|
:status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
|
153
138
|
end
|
154
139
|
|
140
|
+
# Method to observe several websites, and extract the data periodically
|
141
|
+
def observe uris
|
142
|
+
while true
|
143
|
+
time_init = Time.now.to_i
|
144
|
+
uris.each do |uri|
|
145
|
+
puts "Pinging #{uri}..."
|
146
|
+
request :uri=>uri
|
147
|
+
end
|
148
|
+
time = options.repository.time * 60 - (Time.now.to_i - time_init)
|
149
|
+
puts "Sleeping until #{Time.now + time}..."
|
150
|
+
sleep time
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
155
|
def complete_uri uri
|
156
156
|
uri = "#{uri}.com" if uri =~ /\A\w+\Z/
|
157
|
-
uri = "http://#{uri}"
|
157
|
+
uri = "http://#{uri}" unless uri =~ /\A\w*:/
|
158
158
|
uri
|
159
159
|
end
|
160
|
-
|
160
|
+
|
161
161
|
def clean triples
|
162
|
-
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
162
|
+
triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
163
|
+
end
|
164
|
+
|
165
|
+
# Do the extraction using RDF repository
|
166
|
+
def request_from_repository request
|
167
|
+
triples = []
|
168
|
+
|
169
|
+
# Checks if there is any previous extraction within the last 15 minutes
|
170
|
+
contexts = if Options.time
|
171
|
+
@repository.recent_contexts(request[:uri], Options.time)
|
172
|
+
else
|
173
|
+
@repository.recent_contexts(request[:uri])
|
174
|
+
end
|
175
|
+
|
176
|
+
if contexts.empty?
|
177
|
+
# Extracts data from the uri
|
178
|
+
triples = request_uncached request
|
179
|
+
|
180
|
+
if options.debug
|
181
|
+
print "Storing into repository #{request[:uri]}..."; $stdout.flush
|
182
|
+
end
|
183
|
+
|
184
|
+
# Checks if the extraction returned something
|
185
|
+
graph = if triples.empty?
|
186
|
+
# Creates a triple to indicate that nothing was extracted from the uri
|
187
|
+
# This is done because otherwise the context wouldn't be stored
|
188
|
+
RDF::Graph.new [ [ID(request[:uri]), ID("sc:extraction"), ID("sc:Empty")] ]
|
189
|
+
else
|
190
|
+
RDF::Graph.new triples.uniq
|
191
|
+
end
|
192
|
+
|
193
|
+
# Adds data to sesame
|
194
|
+
@repository.data = graph, "#{request[:uri]}:#{Time.now.to_i}"
|
195
|
+
@repository.data = graph, "#{self.uri}:#{Time.now.to_i}" if self.uri
|
196
|
+
|
197
|
+
puts 'done!' if options.debug
|
198
|
+
|
199
|
+
triples
|
200
|
+
else
|
201
|
+
# Data found in repository. Asking for it
|
202
|
+
triples = []
|
203
|
+
if options.debug
|
204
|
+
print "Retrieving from repository #{request[:uri]}..."; $stdout.flush
|
205
|
+
end
|
206
|
+
contexts.each do |context|
|
207
|
+
graph = @repository.data(context)
|
208
|
+
triples += graph.triples.select{|s,p,o| p!=ID("sc:extraction")}
|
209
|
+
end
|
210
|
+
puts 'done!' if options.debug
|
211
|
+
|
212
|
+
triples
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
# Extracts from the uri
|
217
|
+
def request_uncached request
|
218
|
+
sleep 0.001 * options.delay.to_f # Sleep if requested
|
219
|
+
|
220
|
+
if options.debug
|
221
|
+
print "Opening #{request[:uri]}..."; $stdout.flush
|
222
|
+
end
|
223
|
+
|
224
|
+
if request[:method] == :get
|
225
|
+
self.uri = request[:uri]
|
226
|
+
else
|
227
|
+
raise Exception, 'POST requests not supported yet'
|
228
|
+
end
|
229
|
+
|
230
|
+
puts 'done!' if options.debug
|
231
|
+
|
232
|
+
if self.html_data?
|
233
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
234
|
+
triples = extract(self.uri, html, options.referenceable) # Extract data
|
235
|
+
Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
|
236
|
+
triples
|
237
|
+
else
|
238
|
+
[]
|
239
|
+
end
|
163
240
|
end
|
164
241
|
end
|
165
242
|
end
|
@@ -6,7 +6,7 @@ module Scrappy
|
|
6
6
|
if options.debug
|
7
7
|
print "Extracting #{uri}..."; $stdout.flush
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
@selector_pool ||= {}
|
11
11
|
triples = []
|
12
12
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
@@ -27,7 +27,11 @@ module Scrappy
|
|
27
27
|
|
28
28
|
puts "done!" if options.debug
|
29
29
|
|
30
|
-
triples
|
30
|
+
triples.map do |s,p,o|
|
31
|
+
[ s.is_a?(RDF::Node) ? s.id : s,
|
32
|
+
p.is_a?(RDF::Node) ? p.id : p,
|
33
|
+
o.is_a?(RDF::Node) ? o.id : o ]
|
34
|
+
end
|
31
35
|
end
|
32
36
|
|
33
37
|
private
|
@@ -150,32 +154,32 @@ module Scrappy
|
|
150
154
|
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
|
151
155
|
|
152
156
|
content.search('*').each do |node|
|
157
|
+
next if node.text?
|
158
|
+
|
153
159
|
fragment = Node(node_hash(uri, node.path))
|
154
|
-
|
160
|
+
|
155
161
|
if referenceable == :dump or resources[fragment]
|
156
|
-
selector
|
162
|
+
selector = Node(nil)
|
157
163
|
presentation = Node(nil)
|
158
164
|
|
159
|
-
selector
|
160
|
-
selector
|
161
|
-
selector
|
162
|
-
selector
|
163
|
-
|
164
|
-
presentation
|
165
|
-
presentation
|
166
|
-
presentation
|
167
|
-
presentation
|
168
|
-
presentation
|
169
|
-
presentation
|
170
|
-
presentation
|
171
|
-
presentation
|
172
|
-
presentation
|
173
|
-
presentation
|
174
|
-
|
175
|
-
fragment
|
176
|
-
fragment
|
177
|
-
|
178
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples)
|
165
|
+
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
166
|
+
triples << [selector, ID('sc:path'), node.path.to_s]
|
167
|
+
triples << [selector, ID('sc:tag'), node.name.to_s]
|
168
|
+
triples << [selector, ID('sc:document'), uri]
|
169
|
+
|
170
|
+
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
171
|
+
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
172
|
+
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
173
|
+
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
174
|
+
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
175
|
+
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
176
|
+
triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
|
177
|
+
triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
|
178
|
+
triples << [presentation, ID('sc:text'), node.text.strip]
|
179
|
+
triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
|
180
|
+
|
181
|
+
triples << [fragment, ID('sc:selector'), selector]
|
182
|
+
triples << [fragment, ID('sc:presentation'), presentation]
|
179
183
|
end
|
180
184
|
end
|
181
185
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Scrappy
|
2
|
+
class Repository < RDF::Repository
|
3
|
+
# Processes the list of context, checks if there is any extraction
|
4
|
+
# from the last X minutes, and returns an array with them.
|
5
|
+
# If there is not any extraction, returns an empty array
|
6
|
+
def recent_contexts uri, seconds=@options[:time].to_i*60
|
7
|
+
return [] unless uri
|
8
|
+
contexts.select do |context|
|
9
|
+
date = context_date(context)
|
10
|
+
date and check_date(date, seconds) and context_uri(context) == uri
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def time
|
15
|
+
@options[:time]
|
16
|
+
end
|
17
|
+
|
18
|
+
protected
|
19
|
+
# Checks if the context date is within the indicated time
|
20
|
+
def check_date date, seconds
|
21
|
+
(Time.now.to_i - date) <= seconds
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns an integer with the date of a given context
|
25
|
+
def context_date context
|
26
|
+
$1.to_i if context =~ /:(\d+)\Z/
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns the URI of a context
|
30
|
+
def context_uri context
|
31
|
+
$1 if context =~ /\A(.*):(\d+)\Z/
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.2.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-03-
|
9
|
+
s.date = %q{2011-03-09}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/repository.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
@@ -32,8 +32,9 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
33
33
|
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
-
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.
|
35
|
+
s.add_runtime_dependency(%q<lightrdf>, [">= 0.2.1"])
|
36
36
|
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
|
+
s.add_runtime_dependency(%q<rest-client>, [">= 1.6.1"])
|
37
38
|
s.add_runtime_dependency(%q<haml>, [">= 3.0.24"])
|
38
39
|
else
|
39
40
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
@@ -41,8 +42,9 @@ Gem::Specification.new do |s|
|
|
41
42
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
42
43
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
43
44
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
44
|
-
s.add_dependency(%q<lightrdf>, [">= 0.2.
|
45
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
|
45
46
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
47
|
+
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
46
48
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
47
49
|
end
|
48
50
|
else
|
@@ -51,8 +53,9 @@ Gem::Specification.new do |s|
|
|
51
53
|
s.add_dependency(%q<thin>, [">= 1.2.7"])
|
52
54
|
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
53
55
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
54
|
-
s.add_dependency(%q<lightrdf>, [">= 0.2.
|
56
|
+
s.add_dependency(%q<lightrdf>, [">= 0.2.1"])
|
55
57
|
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
58
|
+
s.add_dependency(%q<rest-client>, [">= 1.6.1"])
|
56
59
|
s.add_dependency(%q<haml>, [">= 3.0.24"])
|
57
60
|
end
|
58
61
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-09 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,8 +97,8 @@ dependencies:
|
|
97
97
|
segments:
|
98
98
|
- 0
|
99
99
|
- 2
|
100
|
-
-
|
101
|
-
version: 0.2.
|
100
|
+
- 1
|
101
|
+
version: 0.2.1
|
102
102
|
type: :runtime
|
103
103
|
version_requirements: *id006
|
104
104
|
- !ruby/object:Gem::Dependency
|
@@ -116,9 +116,23 @@ dependencies:
|
|
116
116
|
type: :runtime
|
117
117
|
version_requirements: *id007
|
118
118
|
- !ruby/object:Gem::Dependency
|
119
|
-
name:
|
119
|
+
name: rest-client
|
120
120
|
prerelease: false
|
121
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - ">="
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
segments:
|
126
|
+
- 1
|
127
|
+
- 6
|
128
|
+
- 1
|
129
|
+
version: 1.6.1
|
130
|
+
type: :runtime
|
131
|
+
version_requirements: *id008
|
132
|
+
- !ruby/object:Gem::Dependency
|
133
|
+
name: haml
|
134
|
+
prerelease: false
|
135
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
122
136
|
requirements:
|
123
137
|
- - ">="
|
124
138
|
- !ruby/object:Gem::Version
|
@@ -128,7 +142,7 @@ dependencies:
|
|
128
142
|
- 24
|
129
143
|
version: 3.0.24
|
130
144
|
type: :runtime
|
131
|
-
version_requirements: *
|
145
|
+
version_requirements: *id009
|
132
146
|
description: RDF web scraper
|
133
147
|
email: joseignacio.fernandez@gmail.com
|
134
148
|
executables:
|
@@ -164,6 +178,7 @@ extra_rdoc_files:
|
|
164
178
|
- lib/scrappy/server/public/stylesheets/application.css
|
165
179
|
- lib/scrappy/server/views/home.haml
|
166
180
|
- lib/scrappy/server/views/help.haml
|
181
|
+
- lib/scrappy/repository.rb
|
167
182
|
- lib/scrappy/shell.rb
|
168
183
|
- lib/scrappy/support.rb
|
169
184
|
- lib/scrappy/webkit/webkit.rb
|
@@ -200,6 +215,7 @@ files:
|
|
200
215
|
- lib/scrappy/server/public/stylesheets/application.css
|
201
216
|
- lib/scrappy/server/views/home.haml
|
202
217
|
- lib/scrappy/server/views/help.haml
|
218
|
+
- lib/scrappy/repository.rb
|
203
219
|
- lib/scrappy/shell.rb
|
204
220
|
- lib/scrappy/support.rb
|
205
221
|
- lib/scrappy/webkit/webkit.rb
|