scrappy 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -9,32 +9,39 @@ lib/scrappy/agent/agent.rb
|
|
9
9
|
lib/scrappy/agent/blind_agent.rb
|
10
10
|
lib/scrappy/agent/cache.rb
|
11
11
|
lib/scrappy/agent/dumper.rb
|
12
|
-
lib/scrappy/agent/extractor.rb
|
13
|
-
lib/scrappy/agent/formats.rb
|
14
12
|
lib/scrappy/agent/map_reduce.rb
|
13
|
+
lib/scrappy/extractor/extractor.rb
|
14
|
+
lib/scrappy/extractor/formats.rb
|
15
|
+
lib/scrappy/extractor/fragment.rb
|
16
|
+
lib/scrappy/extractor/selector.rb
|
17
|
+
lib/scrappy/extractor/selectors/base_uri.rb
|
18
|
+
lib/scrappy/extractor/selectors/css.rb
|
19
|
+
lib/scrappy/extractor/selectors/new_uri.rb
|
20
|
+
lib/scrappy/extractor/selectors/root.rb
|
21
|
+
lib/scrappy/extractor/selectors/section.rb
|
22
|
+
lib/scrappy/extractor/selectors/slice.rb
|
23
|
+
lib/scrappy/extractor/selectors/uri.rb
|
24
|
+
lib/scrappy/extractor/selectors/uri_pattern.rb
|
25
|
+
lib/scrappy/extractor/selectors/visual.rb
|
26
|
+
lib/scrappy/extractor/selectors/xpath.rb
|
15
27
|
lib/scrappy/repository.rb
|
16
|
-
lib/scrappy/selectors/base_uri.rb
|
17
|
-
lib/scrappy/selectors/css.rb
|
18
|
-
lib/scrappy/selectors/new_uri.rb
|
19
|
-
lib/scrappy/selectors/root.rb
|
20
|
-
lib/scrappy/selectors/section.rb
|
21
|
-
lib/scrappy/selectors/slice.rb
|
22
|
-
lib/scrappy/selectors/uri.rb
|
23
|
-
lib/scrappy/selectors/uri_pattern.rb
|
24
|
-
lib/scrappy/selectors/xpath.rb
|
25
28
|
lib/scrappy/server/admin.rb
|
26
29
|
lib/scrappy/server/errors.rb
|
27
30
|
lib/scrappy/server/helpers.rb
|
28
31
|
lib/scrappy/server/server.rb
|
29
32
|
lib/scrappy/support.rb
|
33
|
+
lib/scrappy/trainer/trainer.rb
|
30
34
|
public/favicon.ico
|
31
35
|
public/images/logo.png
|
32
36
|
public/images/logo_tiny.png
|
33
|
-
public/javascripts/
|
37
|
+
public/javascripts/annotator.js
|
38
|
+
public/javascripts/remote.js
|
34
39
|
public/stylesheets/application.css
|
35
40
|
test/test_helper.rb
|
36
41
|
test/test_scrappy.rb
|
42
|
+
views/extractors.haml
|
37
43
|
views/help.haml
|
38
44
|
views/home.haml
|
39
|
-
views/
|
40
|
-
views/
|
45
|
+
views/layout.haml
|
46
|
+
views/patterns.haml
|
47
|
+
views/samples.haml
|
data/README.rdoc
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
|
7
|
+
Scrappy is a tool that allows extracting information from web pages and producing RDF data.
|
8
8
|
It uses the scraping ontology to define the mappings between HTML contents and RDF data.
|
9
9
|
|
10
10
|
An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
|
@@ -48,13 +48,13 @@ RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
|
|
48
48
|
|
49
49
|
== SYNOPSIS:
|
50
50
|
|
51
|
-
A knowledge base of mappings can be defined by storing RDF files inside ~/.scrappy/
|
51
|
+
A knowledge base of mappings can be defined by storing RDF files inside ~/.scrappy/extractors folder.
|
52
52
|
Then, the command-line tool can be used to get RDF data from web sites. You can get help on this
|
53
53
|
tool by typing:
|
54
54
|
|
55
55
|
$ scrappy --help
|
56
56
|
|
57
|
-
|
57
|
+
Scrappy offers many different interfaces to get RDF data from a web page:
|
58
58
|
|
59
59
|
* Command-line interface:
|
60
60
|
|
@@ -78,7 +78,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
78
78
|
|
79
79
|
* Ruby interface:
|
80
80
|
|
81
|
-
You can use
|
81
|
+
You can use Scrappy in a Ruby program by requiring the gem:
|
82
82
|
|
83
83
|
require 'rubygems'
|
84
84
|
require 'scrappy'
|
@@ -143,11 +143,7 @@ Install it as any other gem:
|
|
143
143
|
The gem also requires raptor library (in Debian systems: sudo aptitude install raptor-utils), which is used
|
144
144
|
for outputting different RDF serialization formats.
|
145
145
|
|
146
|
-
|
147
|
-
|
148
|
-
* Visual parsing requires rbwebkitgtk: http://github.com/danlucraft/rbwebkitgtk
|
149
|
-
|
150
|
-
* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
146
|
+
PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
151
147
|
|
152
148
|
In order to use Sesame, you will need to install it. Further instructions can be found
|
153
149
|
in the openRDF website, more precisely, in http://www.openrdf.org/doc/sesame2/users/ch06.html .
|
data/Rakefile
CHANGED
@@ -9,9 +9,8 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
9
9
|
p.url = "http://github.com/josei/scrappy"
|
10
10
|
p.author = "Jose Ignacio"
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
|
-
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
12
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.
|
13
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.0'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
|
15
14
|
end
|
16
15
|
|
17
16
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -18,14 +18,10 @@ module Scrappy
|
|
18
18
|
|
19
19
|
SESSION_TOKEN = rand(100000000)
|
20
20
|
Options = OpenStruct.new
|
21
|
+
Kb = OpenStruct.new
|
21
22
|
|
22
23
|
class App
|
23
|
-
def self.
|
24
|
-
puts "\"#{Quotes.sort_by{rand}.first}\"" unless Options.quiet
|
25
|
-
exit
|
26
|
-
end
|
27
|
-
|
28
|
-
def initialize
|
24
|
+
def self.run
|
29
25
|
Options.port = 3434
|
30
26
|
Agent::Options.workers = 10
|
31
27
|
Agent::Options.depth = -1
|
@@ -51,28 +47,22 @@ module Scrappy
|
|
51
47
|
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
52
48
|
end.parse!(args)
|
53
49
|
@file = args.shift
|
54
|
-
end
|
55
50
|
|
56
|
-
def run
|
57
|
-
onload
|
58
51
|
if Options.uri
|
59
52
|
Options.quiet = true
|
60
53
|
puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
|
61
54
|
elsif Options.observe
|
62
55
|
Agent.new.observe(Options.observe)
|
63
|
-
elsif Options.admin
|
64
|
-
|
56
|
+
elsif Options.admin or Options.server
|
57
|
+
if Options.admin
|
58
|
+
puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
|
59
|
+
else
|
60
|
+
puts "Launching Scrappy Web Server..."
|
61
|
+
end
|
65
62
|
require 'scrappy/server/server'
|
66
63
|
Thin::Logging.silent = true
|
67
|
-
Scrappy::Server.register Scrappy::Admin
|
68
64
|
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
69
65
|
:base_uri=>Options.base_uri
|
70
|
-
elsif Options.server
|
71
|
-
puts "Launching Scrappy Web Server..."
|
72
|
-
require 'scrappy/server/server'
|
73
|
-
Thin::Logging.silent = true
|
74
|
-
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
75
|
-
:base_uri => Options.base_uri
|
76
66
|
else
|
77
67
|
output_version
|
78
68
|
puts 'To get help use: scrappy -h'
|
@@ -81,8 +71,75 @@ module Scrappy
|
|
81
71
|
Scrappy::App.quit
|
82
72
|
end
|
83
73
|
|
74
|
+
def self.quit
|
75
|
+
puts "\"#{Quotes.sort_by{rand}.first}\"" unless Options.quiet
|
76
|
+
exit
|
77
|
+
end
|
78
|
+
|
79
|
+
def self.data_folder
|
80
|
+
@data_folder
|
81
|
+
end
|
82
|
+
def self.cache_folder
|
83
|
+
@cache_folder
|
84
|
+
end
|
85
|
+
def self.samples_file
|
86
|
+
@samples_file
|
87
|
+
end
|
88
|
+
def self.samples
|
89
|
+
@samples ||= []
|
90
|
+
end
|
91
|
+
|
92
|
+
def self.add_sample sample
|
93
|
+
self.samples.unshift sample
|
94
|
+
save_samples
|
95
|
+
sample
|
96
|
+
end
|
97
|
+
def self.delete_sample id
|
98
|
+
@samples.delete @samples[id]
|
99
|
+
save_samples
|
100
|
+
end
|
101
|
+
def self.save_samples
|
102
|
+
open(@samples_file, "w") { |f| Marshal.dump(@samples, f) }
|
103
|
+
end
|
104
|
+
def self.editable_kb?
|
105
|
+
@editable_kb
|
106
|
+
end
|
107
|
+
def self.add_pattern graph
|
108
|
+
new_patterns = Scrappy::Kb.patterns.merge graph
|
109
|
+
open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
|
110
|
+
onload
|
111
|
+
end
|
112
|
+
def self.delete_pattern uri
|
113
|
+
graph = Scrappy::Kb.patterns
|
114
|
+
fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
|
115
|
+
select { |fragment| fragment.sc::type.include?(Node(uri)) }
|
116
|
+
fragments.each { |fragment| graph.triples -= fragment.all_triples }
|
117
|
+
open(@patterns_file, "w") { |f| f.write graph.serialize(:yarf) }
|
118
|
+
onload
|
119
|
+
end
|
120
|
+
def self.add_extractor graph
|
121
|
+
open(File.join(@extractors_folder,"extractor_#{Dir[File.join(@extractors_folder,'*')].size}.yarf"), "w") { |f| f.write graph.serialize(:yarf) }
|
122
|
+
onload
|
123
|
+
end
|
124
|
+
def self.delete_extractor uri
|
125
|
+
Dir[File.join(@extractors_folder, '*')].each do |file|
|
126
|
+
format = file.split('.').last.to_sym
|
127
|
+
next if format==:ignore
|
128
|
+
graph = RDF::Parser.parse format, open(file).read
|
129
|
+
uri_selectors = (graph.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
|
130
|
+
graph.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).
|
131
|
+
flatten.select do |uri_selector|
|
132
|
+
uri_selector.rdf::value.include?(uri)
|
133
|
+
end
|
134
|
+
fragments = uri_selectors.map { |uri_selector| graph.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
135
|
+
fragments.each { |fragment| graph.triples -= fragment.all_triples }
|
136
|
+
open(file, "w") { |f| f.write graph.serialize(format) } if fragments.any?
|
137
|
+
end
|
138
|
+
onload
|
139
|
+
end
|
140
|
+
|
84
141
|
protected
|
85
|
-
def output_help
|
142
|
+
def self.output_help
|
86
143
|
output_version
|
87
144
|
puts """Synopsis
|
88
145
|
Scrappy is a tool to scrape semantic data out of the unstructured web
|
@@ -123,59 +180,92 @@ Copyright
|
|
123
180
|
http://www.opensource.org/licenses/mit-license.php"""
|
124
181
|
end
|
125
182
|
|
126
|
-
def output_version
|
183
|
+
def self.output_version
|
127
184
|
puts "Scrappy v#{Scrappy::VERSION}"
|
128
185
|
end
|
129
186
|
|
130
|
-
def
|
187
|
+
def self.define_paths
|
131
188
|
# Check local or global knowledge base
|
132
|
-
home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
|
189
|
+
@home = WINDOWS_PLATFORM ? "#{ENV['HOME']}/scrappy" : "#{ENV['HOME']}/.scrappy"
|
133
190
|
|
134
|
-
|
135
|
-
cache_dirname
|
136
|
-
cache_filename
|
137
|
-
config_filename
|
191
|
+
extractors_dirname = "extractors"
|
192
|
+
cache_dirname = "cache"
|
193
|
+
cache_filename = "scrappy-#{Scrappy::VERSION}.kb"
|
194
|
+
config_filename = "config.yml"
|
195
|
+
samples_filename = "samples"
|
196
|
+
patterns_filename = "patterns.yarf"
|
138
197
|
|
139
|
-
if File.exists?(File.join(home,
|
140
|
-
|
141
|
-
|
198
|
+
if File.exists?(File.join(@home, extractors_dirname))
|
199
|
+
@editable_kb = true
|
200
|
+
@extractors_folder = File.join @home, extractors_dirname
|
201
|
+
@cache_folder = File.join @home, cache_dirname
|
142
202
|
else
|
143
|
-
|
144
|
-
|
203
|
+
@editable_kb = false
|
204
|
+
@extractors_folder = File.join Scrappy::Root, extractors_dirname
|
205
|
+
@cache_folder = Dir.tmpdir
|
145
206
|
end
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
207
|
+
@cache_file = File.join @cache_folder, cache_filename
|
208
|
+
@config_file = File.join @home, config_filename
|
209
|
+
@samples_file = File.join @home, samples_filename
|
210
|
+
@patterns_file = File.join @home, patterns_filename
|
211
|
+
Dir.mkdir @home if Dir[@home].empty?
|
212
|
+
Dir.mkdir cache_folder if Dir[cache_folder].empty?
|
213
|
+
end
|
214
|
+
|
215
|
+
def self.load_files_from folder
|
216
|
+
kb = RDF::Graph.new
|
217
|
+
kb.triples = ( Dir[File.join(folder, "*")].inject([]) do |triples, file|
|
218
|
+
extension = file.split('.').last.to_sym
|
219
|
+
triples + if extension==:ignore or File.directory?(file)
|
220
|
+
[]
|
221
|
+
else
|
222
|
+
graph = RDF::Parser.parse(extension, open(file).read)
|
223
|
+
kb.ns.merge! graph.ns
|
224
|
+
graph.triples
|
225
|
+
end
|
226
|
+
end )
|
227
|
+
kb
|
228
|
+
end
|
229
|
+
|
230
|
+
def self.onload
|
231
|
+
define_paths
|
232
|
+
|
150
233
|
# Load knowledge base
|
151
|
-
Agent::Options.kb
|
234
|
+
Agent::Options.kb ||= RDF::Graph.new
|
235
|
+
|
236
|
+
Kb.extractors, Kb.patterns = if File.exists?(@cache_file) and File.mtime(@cache_file) >= Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
|
152
237
|
# Just load kb from cache
|
153
|
-
open(cache_file) { |f| Marshal.load(f) }
|
238
|
+
open(@cache_file) { |f| Marshal.load(f) }
|
154
239
|
else
|
155
240
|
# Load YARF files and cache kb
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
kb.merge!(extension==:ignore ? RDF::Graph.new : graph)
|
161
|
-
kb
|
162
|
-
end
|
163
|
-
open(cache_file, "w") { |f| Marshal.dump(data, f) }
|
164
|
-
data
|
241
|
+
extractors = load_files_from(@extractors_folder)
|
242
|
+
patterns = File.exists?(@patterns_file) ? RDF::Parser.parse(:yarf, open(@patterns_file).read) : RDF::Graph.new
|
243
|
+
open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns], f) }
|
244
|
+
[extractors, patterns]
|
165
245
|
end
|
166
246
|
|
167
|
-
#
|
247
|
+
# Sets new kb
|
248
|
+
Agent::Options.kb.replace Kb.extractors
|
249
|
+
Agent::Options.kb.ns = Kb.extractors.ns
|
250
|
+
# Adds defined prefixes to namespace
|
251
|
+
RDF::ID.ns.merge! Agent::Options.kb.ns
|
252
|
+
|
253
|
+
# Looks for a configuration file. If it does not exist, Scrappy does not use Sesame
|
168
254
|
# It looks for it in the home .scrappy dir
|
169
|
-
if File.exist?(config_file)
|
170
|
-
config = YAML::load_file(config_file)["repository"]
|
255
|
+
if File.exist?(@config_file)
|
256
|
+
config = YAML::load_file(@config_file)["repository"]
|
171
257
|
# Convert the strings from the YAML file into symbols
|
172
258
|
repository_options = {}
|
173
259
|
config.each { |k,v| repository_options[k.to_sym] = v }
|
174
260
|
Agent::Options.repository = Repository.new repository_options
|
175
261
|
end
|
176
262
|
|
177
|
-
|
263
|
+
if File.exist?(@samples_file)
|
264
|
+
@samples = open(@samples_file) { |f| Marshal.load(f) }
|
265
|
+
end
|
178
266
|
end
|
267
|
+
|
268
|
+
self.onload
|
179
269
|
end
|
180
270
|
|
181
271
|
Quotes = """Knowledge talks, wisdom listens
|
@@ -259,4 +349,4 @@ The reward of a thing well done is to have done it
|
|
259
349
|
Don’t argue with idiots. They will bring you down to their level and beat you with experience""".split("\n")
|
260
350
|
end
|
261
351
|
|
262
|
-
Scrappy::App.
|
352
|
+
Scrappy::App.run
|
data/lib/scrappy.rb
CHANGED
@@ -9,22 +9,19 @@ require 'active_support'
|
|
9
9
|
require 'tmpdir'
|
10
10
|
require 'lightrdf'
|
11
11
|
|
12
|
+
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
13
|
+
|
12
14
|
require 'scrappy/support'
|
13
15
|
require 'scrappy/repository'
|
14
16
|
|
15
|
-
require 'scrappy/
|
17
|
+
require 'scrappy/extractor/extractor'
|
18
|
+
require 'scrappy/trainer/trainer'
|
16
19
|
require 'scrappy/agent/map_reduce'
|
17
20
|
require 'scrappy/agent/cache'
|
18
21
|
require 'scrappy/agent/dumper'
|
19
|
-
require 'scrappy/agent/formats'
|
20
22
|
require 'scrappy/agent/blind_agent'
|
21
23
|
require 'scrappy/agent/agent'
|
22
24
|
|
23
|
-
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
24
|
-
|
25
25
|
module Scrappy
|
26
|
-
VERSION = '0.3.
|
27
|
-
end
|
28
|
-
|
29
|
-
# Require selectors
|
30
|
-
Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
|
26
|
+
VERSION = '0.3.1'
|
27
|
+
end
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -2,6 +2,7 @@ module Scrappy
|
|
2
2
|
class Agent
|
3
3
|
include MonitorMixin
|
4
4
|
include Extractor
|
5
|
+
include Trainer
|
5
6
|
include MapReduce
|
6
7
|
include Cached
|
7
8
|
include BlindAgent
|
@@ -151,8 +152,7 @@ module Scrappy
|
|
151
152
|
end
|
152
153
|
|
153
154
|
def clean triples
|
154
|
-
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
|
155
|
-
select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
155
|
+
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }
|
156
156
|
end
|
157
157
|
|
158
158
|
# Do the extraction using RDF repository
|
@@ -223,7 +223,7 @@ module Scrappy
|
|
223
223
|
puts 'done!' if options.debug
|
224
224
|
|
225
225
|
if self.html_data?
|
226
|
-
triples = extract(self.uri, html, options.referenceable) # Extract data
|
226
|
+
triples = extract(self.uri, html, self.kb, options.referenceable) # Extract data
|
227
227
|
Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
|
228
228
|
triples
|
229
229
|
else
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'scrappy/extractor/fragment'
|
3
|
+
require 'scrappy/extractor/formats'
|
4
|
+
require 'scrappy/extractor/selector'
|
5
|
+
# Require selectors
|
6
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/selectors/*.rb"].each { |f| require f }
|
7
|
+
|
8
|
+
module Scrappy
|
9
|
+
module Extractor
|
10
|
+
def extract uri, html, kb, referenceable=nil
|
11
|
+
synchronize do
|
12
|
+
if options.debug
|
13
|
+
print "Extracting #{uri}..."; $stdout.flush
|
14
|
+
end
|
15
|
+
|
16
|
+
# Restart stateful selectors
|
17
|
+
kb = RDF::Graph.new(kb.triples)
|
18
|
+
|
19
|
+
# Parse document
|
20
|
+
content = Nokogiri::HTML(html, nil, 'utf-8')
|
21
|
+
|
22
|
+
# Extract each fragment
|
23
|
+
options = { :doc => { :uri=>uri, :content=>content }, :referenceable=>referenceable }
|
24
|
+
triples = []
|
25
|
+
fragments_for(kb, uri).each do |fragment|
|
26
|
+
kb.node(fragment).extract(options).each do |node|
|
27
|
+
triples += node.graph.triples
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Add references to sources if requested
|
32
|
+
triples += add_referenceable_data uri, content, triples, referenceable if referenceable
|
33
|
+
|
34
|
+
puts "done!" if self.options.debug
|
35
|
+
|
36
|
+
triples
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def fragments_for kb, uri
|
41
|
+
uri_selectors = ( kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) +
|
42
|
+
kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')) ).
|
43
|
+
flatten.select do |uri_selector|
|
44
|
+
!kb.node(uri_selector).filter(:uri=>uri).empty?
|
45
|
+
end
|
46
|
+
|
47
|
+
visual_selectors = kb.find(nil, Node('rdf:type'), Node('sc:VisualSelector'))
|
48
|
+
|
49
|
+
selectors = uri_selectors + visual_selectors
|
50
|
+
|
51
|
+
selectors.map { |selector| kb.find(nil, Node('sc:selector'), selector) }.
|
52
|
+
flatten.
|
53
|
+
select { |selector| selector.rdf::type.include?(Node('sc:Fragment')) }
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def add_referenceable_data uri, content, given_triples, referenceable
|
58
|
+
triples = []
|
59
|
+
resources = {}; given_triples.each { |s,p,o| resources[s] = resources[o] = true }
|
60
|
+
|
61
|
+
fragment = Node(Extractor.node_hash(uri, '/'))
|
62
|
+
selector = Node(nil)
|
63
|
+
presentation = Node(nil)
|
64
|
+
|
65
|
+
selector.rdf::type = Node('sc:UnivocalSelector')
|
66
|
+
selector.sc::path = '/'
|
67
|
+
selector.sc::document = uri
|
68
|
+
|
69
|
+
fragment.sc::selector = selector
|
70
|
+
|
71
|
+
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources[fragment.id]
|
72
|
+
|
73
|
+
content.search('*').each do |node|
|
74
|
+
next if node.text?
|
75
|
+
|
76
|
+
fragment = Extractor.node_hash(uri, node.path)
|
77
|
+
|
78
|
+
if referenceable == :dump or resources[fragment]
|
79
|
+
selector = ID(nil)
|
80
|
+
presentation = ID(nil)
|
81
|
+
|
82
|
+
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
83
|
+
triples << [selector, ID('sc:path'), node.path.to_s]
|
84
|
+
triples << [selector, ID('sc:tag'), node.name.to_s]
|
85
|
+
triples << [selector, ID('sc:document'), uri]
|
86
|
+
|
87
|
+
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
88
|
+
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
89
|
+
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
90
|
+
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
91
|
+
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
92
|
+
triples << [presentation, ID('sc:font_family'), node[:vfont]] if node[:vfont]
|
93
|
+
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
94
|
+
triples << [presentation, ID('sc:text'), node.text.strip]
|
95
|
+
|
96
|
+
triples << [fragment, ID('sc:selector'), selector]
|
97
|
+
triples << [fragment, ID('sc:presentation'), presentation]
|
98
|
+
end
|
99
|
+
end
|
100
|
+
triples
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.node_hash uri, path
|
104
|
+
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
105
|
+
:"_:bnode#{digest}"
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|