scrappy 0.3.0 → 0.3.1
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -9,32 +9,39 @@ lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
 lib/scrappy/agent/cache.rb
 lib/scrappy/agent/dumper.rb
-lib/scrappy/agent/extractor.rb
-lib/scrappy/agent/formats.rb
 lib/scrappy/agent/map_reduce.rb
+lib/scrappy/extractor/extractor.rb
+lib/scrappy/extractor/formats.rb
+lib/scrappy/extractor/fragment.rb
+lib/scrappy/extractor/selector.rb
+lib/scrappy/extractor/selectors/base_uri.rb
+lib/scrappy/extractor/selectors/css.rb
+lib/scrappy/extractor/selectors/new_uri.rb
+lib/scrappy/extractor/selectors/root.rb
+lib/scrappy/extractor/selectors/section.rb
+lib/scrappy/extractor/selectors/slice.rb
+lib/scrappy/extractor/selectors/uri.rb
+lib/scrappy/extractor/selectors/uri_pattern.rb
+lib/scrappy/extractor/selectors/visual.rb
+lib/scrappy/extractor/selectors/xpath.rb
 lib/scrappy/repository.rb
-lib/scrappy/selectors/base_uri.rb
-lib/scrappy/selectors/css.rb
-lib/scrappy/selectors/new_uri.rb
-lib/scrappy/selectors/root.rb
-lib/scrappy/selectors/section.rb
-lib/scrappy/selectors/slice.rb
-lib/scrappy/selectors/uri.rb
-lib/scrappy/selectors/uri_pattern.rb
-lib/scrappy/selectors/xpath.rb
 lib/scrappy/server/admin.rb
 lib/scrappy/server/errors.rb
 lib/scrappy/server/helpers.rb
 lib/scrappy/server/server.rb
 lib/scrappy/support.rb
+lib/scrappy/trainer/trainer.rb
 public/favicon.ico
 public/images/logo.png
 public/images/logo_tiny.png
-public/javascripts/scrappy.js
+public/javascripts/annotator.js
+public/javascripts/remote.js
 public/stylesheets/application.css
 test/test_helper.rb
 test/test_scrappy.rb
+views/extractors.haml
 views/help.haml
 views/home.haml
-views/kb.haml
-views/layout.haml
+views/layout.haml
+views/patterns.haml
+views/samples.haml
data/README.rdoc
CHANGED
@@ -4,7 +4,7 @@
 
 == DESCRIPTION:
 
-
+Scrappy is a tool that allows extracting information from web pages and producing RDF data.
 It uses the scraping ontology to define the mappings between HTML contents and RDF data.
 
 An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
@@ -48,13 +48,13 @@ RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
 
 == SYNOPSIS:
 
-A knowledge base of mappings can be defined by storing RDF files inside ~/.scrappy/
+A knowledge base of mappings can be defined by storing RDF files inside ~/.scrappy/extractors folder.
 Then, the command-line tool can be used to get RDF data from web sites. You can get help on this
 tool by typing:
 
   $ scrappy --help
 
-
+Scrappy offers many different interfaces to get RDF data from a web page:
 
 * Command-line interface:
 
@@ -78,7 +78,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
 
 * Ruby interface:
 
-You can use
+You can use Scrappy in a Ruby program by requiring the gem:
 
   require 'rubygems'
   require 'scrappy'
@@ -143,11 +143,7 @@ Install it as any other gem:
 The gem also requires raptor library (in Debian systems: sudo aptitude install raptor-utils), which is used
 for outputting different RDF serialization formats.
 
-
-
-* Visual parsing requires rbwebkitgtk: http://github.com/danlucraft/rbwebkitgtk
-
-* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
+PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
 
 In order to use Sesame, you will need to install it. Further instructions can be found
 in the openRDF website, more precisely, in http://www.openrdf.org/doc/sesame2/users/ch06.html .
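The rewritten SYNOPSIS above mentions the Ruby interface; as a minimal sketch of what a session against this version could look like (the proxy call mirrors the one bin/scrappy itself makes further below, and the URI is just the README's example site):

  require 'rubygems'
  require 'scrappy'

  # Extract RDF from a page using the knowledge base in ~/.scrappy/extractors
  agent = Scrappy::Agent.new
  puts agent.proxy(:http_method => :get, :uri => 'http://www.elmundo.es').output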
data/Rakefile
CHANGED
@@ -9,9 +9,8 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
   p.url = "http://github.com/josei/scrappy"
   p.author = "Jose Ignacio"
   p.email = "joseignacio.fernandez@gmail.com"
-  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
   p.ignore_pattern = ["pkg/*"]
-  p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.
+  p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.0'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
 end
 
 Rake::RDocTask.new(:rdoc) do |rdoc|
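With the rbwebkitgtk install message dropped and the dependency list expanded, a plain install now pulls in lightrdf >= 0.3.0, i18n, rest-client, haml and rack-flash automatically:

  $ gem install scrappy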
data/bin/scrappy
CHANGED
@@ -18,14 +18,10 @@ module Scrappy
 
   SESSION_TOKEN = rand(100000000)
   Options = OpenStruct.new
+  Kb = OpenStruct.new
 
   class App
-    def self.quit
-      puts "\"#{Quotes.sort_by{rand}.first}\"" unless Options.quiet
-      exit
-    end
-
-    def initialize
+    def self.run
       Options.port = 3434
       Agent::Options.workers = 10
       Agent::Options.depth = -1
@@ -51,28 +47,22 @@ module Scrappy
         opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
       end.parse!(args)
       @file = args.shift
-    end
 
-    def run
-      onload
       if Options.uri
         Options.quiet = true
         puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
       elsif Options.observe
         Agent.new.observe(Options.observe)
-      elsif Options.admin
-
+      elsif Options.admin or Options.server
+        if Options.admin
+          puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
+        else
+          puts "Launching Scrappy Web Server..."
+        end
         require 'scrappy/server/server'
         Thin::Logging.silent = true
-        Scrappy::Server.register Scrappy::Admin
         Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
                              :base_uri=>Options.base_uri
-      elsif Options.server
-        puts "Launching Scrappy Web Server..."
-        require 'scrappy/server/server'
-        Thin::Logging.silent = true
-        Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
-                             :base_uri => Options.base_uri
       else
         output_version
         puts 'To get help use: scrappy -h'
@@ -81,8 +71,75 @@ module Scrappy
       Scrappy::App.quit
     end
 
+    def self.quit
+      puts "\"#{Quotes.sort_by{rand}.first}\"" unless Options.quiet
+      exit
+    end
+
+    def self.data_folder
+      @data_folder
+    end
+    def self.cache_folder
+      @cache_folder
+    end
+    def self.samples_file
+      @samples_file
+    end
+    def self.samples
+      @samples ||= []
+    end
+
+    def self.add_sample sample
+      self.samples.unshift sample
+      save_samples
+      sample
+    end
+    def self.delete_sample id
+      @samples.delete @samples[id]
+      save_samples
+    end
+    def self.save_samples
+      open(@samples_file, "w") { |f| Marshal.dump(@samples, f) }
+    end
+    def self.editable_kb?
+      @editable_kb
+    end
+    def self.add_pattern graph
+      new_patterns = Scrappy::Kb.patterns.merge graph
+      open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
+      onload
+    end
+    def self.delete_pattern uri
+      graph = Scrappy::Kb.patterns
+      fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
+                  select { |fragment| fragment.sc::type.include?(Node(uri)) }
+      fragments.each { |fragment| graph.triples -= fragment.all_triples }
+      open(@patterns_file, "w") { |f| f.write graph.serialize(:yarf) }
+      onload
+    end
+    def self.add_extractor graph
+      open(File.join(@extractors_folder,"extractor_#{Dir[File.join(@extractors_folder,'*')].size}.yarf"), "w") { |f| f.write graph.serialize(:yarf) }
+      onload
+    end
+    def self.delete_extractor uri
+      Dir[File.join(@extractors_folder, '*')].each do |file|
+        format = file.split('.').last.to_sym
+        next if format==:ignore
+        graph = RDF::Parser.parse format, open(file).read
+        uri_selectors = (graph.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
+                         graph.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).
+                        flatten.select do |uri_selector|
+                          uri_selector.rdf::value.include?(uri)
+                        end
+        fragments = uri_selectors.map { |uri_selector| graph.find(nil, Node('sc:selector'), uri_selector) }.flatten
+        fragments.each { |fragment| graph.triples -= fragment.all_triples }
+        open(file, "w") { |f| f.write graph.serialize(format) } if fragments.any?
+      end
+      onload
+    end
+
     protected
-    def output_help
+    def self.output_help
       output_version
       puts """Synopsis
   Scrappy is a tool to scrape semantic data out of the unstructured web
@@ -123,59 +180,92 @@ Copyright
     http://www.opensource.org/licenses/mit-license.php"""
     end
 
-    def output_version
+    def self.output_version
       puts "Scrappy v#{Scrappy::VERSION}"
     end
 
-    def onload
+    def self.define_paths
       # Check local or global knowledge base
-      home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
+      @home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
 
-
-      cache_dirname
-      cache_filename
-      config_filename
+      extractors_dirname = "extractors"
+      cache_dirname = "cache"
+      cache_filename = "scrappy-#{Scrappy::VERSION}.kb"
+      config_filename = "config.yml"
+      samples_filename = "samples"
+      patterns_filename = "patterns.yarf"
 
-      if File.exists?(File.join(home,
-
-
+      if File.exists?(File.join(@home, extractors_dirname))
+        @editable_kb = true
+        @extractors_folder = File.join @home, extractors_dirname
+        @cache_folder = File.join @home, cache_dirname
       else
-
-
+        @editable_kb = false
+        @extractors_folder = File.join Scrappy::Root, extractors_dirname
+        @cache_folder = Dir.tmpdir
       end
-
-
-
-
+      @cache_file = File.join @cache_folder, cache_filename
+      @config_file = File.join @home, config_filename
+      @samples_file = File.join @home, samples_filename
+      @patterns_file = File.join @home, patterns_filename
+      Dir.mkdir @home if Dir[@home].empty?
+      Dir.mkdir cache_folder if Dir[cache_folder].empty?
+    end
+
+    def self.load_files_from folder
+      kb = RDF::Graph.new
+      kb.triples = ( Dir[File.join(folder, "*")].inject([]) do |triples, file|
+        extension = file.split('.').last.to_sym
+        triples + if extension==:ignore or File.directory?(file)
+          []
+        else
+          graph = RDF::Parser.parse(extension, open(file).read)
+          kb.ns.merge! graph.ns
+          graph.triples
+        end
+      end )
+      kb
+    end
+
+    def self.onload
+      define_paths
+
       # Load knowledge base
-      Agent::Options.kb
+      Agent::Options.kb ||= RDF::Graph.new
+
+      Kb.extractors, Kb.patterns = if File.exists?(@cache_file) and File.mtime(@cache_file) >= Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
        # Just load kb from cache
-        open(cache_file) { |f| Marshal.load(f) }
+        open(@cache_file) { |f| Marshal.load(f) }
      else
        # Load YARF files and cache kb
-
-
-
-
-        kb.merge!(extension==:ignore ? RDF::Graph.new : graph)
-        kb
-      end
-      open(cache_file, "w") { |f| Marshal.dump(data, f) }
-      data
+        extractors = load_files_from(@extractors_folder)
+        patterns = File.exists?(@patterns_file) ? RDF::Parser.parse(:yarf, open(@patterns_file).read) : RDF::Graph.new
+        open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns], f) }
+        [extractors, patterns]
      end
 
-      #
+      # Sets new kb
+      Agent::Options.kb.replace Kb.extractors
+      Agent::Options.kb.ns = Kb.extractors.ns
+      # Adds defined prefixes to namespace
+      RDF::ID.ns.merge! Agent::Options.kb.ns
+
+      # Looks for a configuration file. If it does not exist, Scrappy does not use Sesame
       # It looks for it in the home .scrappy dir
-      if File.exist?(config_file)
-        config = YAML::load_file(config_file)["repository"]
+      if File.exist?(@config_file)
+        config = YAML::load_file(@config_file)["repository"]
        # Convert the strings from the YAML file into symbols
        repository_options = {}
        config.each { |k,v| repository_options[k.to_sym] = v }
        Agent::Options.repository = Repository.new repository_options
      end
 
-
+      if File.exist?(@samples_file)
+        @samples = open(@samples_file) { |f| Marshal.load(f) }
+      end
     end
+
+    self.onload
   end
 
   Quotes = """Knowledge talks, wisdom listens
@@ -259,4 +349,4 @@ The reward of a thing well done is to have done it
 Don’t argue with idiots. They will bring you down to their level and beat you with experience""".split("\n")
 end
 
-Scrappy::App.
+Scrappy::App.run
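A note on the new onload: it rebuilds the knowledge base only when the Marshal cache is older than the newest extractor or pattern file. A self-contained sketch of that invalidation pattern, with a hypothetical load_with_cache helper and nothing assumed beyond the Ruby standard library:

  # Reuse a Marshal dump while it is newer than every source file,
  # the same check App.onload applies to @cache_file (sources must be non-empty)
  def load_with_cache cache_file, sources
    if File.exists?(cache_file) and File.mtime(cache_file) >= sources.map { |f| File.mtime(f) }.max
      open(cache_file) { |f| Marshal.load(f) }             # fresh cache: cheap load
    else
      data = yield                                         # stale or missing: rebuild...
      open(cache_file, "w") { |f| Marshal.dump(data, f) }  # ...and refresh the dump
      data
    end
  end

  # kb = load_with_cache('scrappy.kb', Dir['extractors/*']) { parse_all_extractors }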
data/lib/scrappy.rb
CHANGED
@@ -9,22 +9,19 @@ require 'active_support'
 require 'tmpdir'
 require 'lightrdf'
 
+Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
+
 require 'scrappy/support'
 require 'scrappy/repository'
 
-require 'scrappy/agent/extractor'
+require 'scrappy/extractor/extractor'
+require 'scrappy/trainer/trainer'
 require 'scrappy/agent/map_reduce'
 require 'scrappy/agent/cache'
 require 'scrappy/agent/dumper'
-require 'scrappy/agent/formats'
 require 'scrappy/agent/blind_agent'
 require 'scrappy/agent/agent'
 
-Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
-
 module Scrappy
-  VERSION = '0.3.0'
-end
-
-# Require selectors
-Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
+  VERSION = '0.3.1'
+end
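Moving the Namespace declaration above the requires means every file loaded after it can already abbreviate ontology terms. A minimal sketch of what the lightrdf declaration enables, assuming only the lightrdf gem:

  require 'rubygems'
  require 'lightrdf'

  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

  # 'sc:...' terms now resolve against the scraping ontology,
  # which is why the extractor can write Node('sc:Fragment') everywhere
  fragment_class = Node('sc:Fragment')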
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -2,6 +2,7 @@ module Scrappy
   class Agent
     include MonitorMixin
     include Extractor
+    include Trainer
     include MapReduce
     include Cached
     include BlindAgent
@@ -151,8 +152,7 @@ module Scrappy
     end
 
     def clean triples
-      triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
-      select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+      triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
     end
 
     # Do the extraction using RDF repository
@@ -223,7 +223,7 @@ module Scrappy
       puts 'done!' if options.debug
 
       if self.html_data?
-        triples = extract(self.uri, html, options.referenceable) # Extract data
+        triples = extract(self.uri, html, self.kb, options.referenceable) # Extract data
         Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
         triples
       else
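extract now takes the knowledge base as an explicit argument instead of reading shared state, so callers can extract against any graph. Roughly, and assuming the agent exposes the kb it passes internally (html stands for an already fetched page):

  agent = Scrappy::Agent.new
  # New signature in 0.3.1: extract(uri, html, kb, referenceable=nil)
  triples = agent.extract('http://www.elmundo.es', html, agent.kb, :dump)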
data/lib/scrappy/extractor/extractor.rb
ADDED
@@ -0,0 +1,108 @@
+require 'digest/md5'
+require 'scrappy/extractor/fragment'
+require 'scrappy/extractor/formats'
+require 'scrappy/extractor/selector'
+# Require selectors
+Dir["#{File.expand_path(File.dirname(__FILE__))}/selectors/*.rb"].each { |f| require f }
+
+module Scrappy
+  module Extractor
+    def extract uri, html, kb, referenceable=nil
+      synchronize do
+        if options.debug
+          print "Extracting #{uri}..."; $stdout.flush
+        end
+
+        # Restart stateful selectors
+        kb = RDF::Graph.new(kb.triples)
+
+        # Parse document
+        content = Nokogiri::HTML(html, nil, 'utf-8')
+
+        # Extract each fragment
+        options = { :doc => { :uri=>uri, :content=>content }, :referenceable=>referenceable }
+        triples = []
+        fragments_for(kb, uri).each do |fragment|
+          kb.node(fragment).extract(options).each do |node|
+            triples += node.graph.triples
+          end
+        end
+
+        # Add references to sources if requested
+        triples += add_referenceable_data uri, content, triples, referenceable if referenceable
+
+        puts "done!" if self.options.debug
+
+        triples
+      end
+    end
+
+    def fragments_for kb, uri
+      uri_selectors = ( kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
+                        kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
+                      flatten.select do |uri_selector|
+                        !kb.node(uri_selector).filter(:uri=>uri).empty?
+                      end
+
+      visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
+
+      selectors = uri_selectors + visual_selectors
+
+      selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
+                flatten.
+                select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
+    end
+
+    private
+    def add_referenceable_data uri, content, given_triples, referenceable
+      triples = []
+      resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
+
+      fragment = Node(Extractor.node_hash(uri, '/'))
+      selector = Node(nil)
+      presentation = Node(nil)
+
+      selector.rdf::type = Node('sc:UnivocalSelector')
+      selector.sc::path = '/'
+      selector.sc::document = uri
+
+      fragment.sc::selector = selector
+
+      triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
+
+      content.search('*').each do |node|
+        next if node.text?
+
+        fragment = Extractor.node_hash(uri, node.path)
+
+        if referenceable == :dump or resources[fragment]
+          selector = ID(nil)
+          presentation = ID(nil)
+
+          triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
+          triples << [selector, ID('sc:path'), node.path.to_s]
+          triples << [selector, ID('sc:tag'), node.name.to_s]
+          triples << [selector, ID('sc:document'), uri]
+
+          triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
+          triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
+          triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
+          triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
+          triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
+          triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
+          triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
+          triples << [presentation, ID('sc:text'), node.text.strip]
+
+          triples << [fragment, ID('sc:selector'), selector]
+          triples << [fragment, ID('sc:presentation'), presentation]
+        end
+      end
+      triples
+    end
+
+    def self.node_hash uri, path
+      digest = Digest::MD5.hexdigest("#{uri} #{path}")
+      :"_:bnode#{digest}"
+    end
+  end
+end
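Fragment identity in the new extractor comes from node_hash, an MD5 digest over the document URI and the element's XPath, so a given node always maps to the same blank-node ID across runs. For instance, with illustrative arguments:

  require 'digest/md5'

  # Deterministic blank node: :"_:bnode<md5 of 'http://www.elmundo.es /html/body'>"
  Scrappy::Extractor.node_hash('http://www.elmundo.es', '/html/body')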