scrappy 0.3.5 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Rakefile +1 -1
- data/bin/scrappy +75 -17
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/extractor.rb +11 -8
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +6 -2
- data/lib/scrappy/extractor/selectors/uri_pattern.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +66 -52
- data/lib/scrappy/learning/optimizer.rb +355 -107
- data/lib/scrappy/learning/trainer.rb +112 -40
- data/lib/scrappy/server/admin.rb +180 -17
- data/lib/scrappy/support.rb +0 -24
- data/public/javascripts/annotator.js +1 -1
- data/public/stylesheets/application.css +33 -0
- data/scrappy.gemspec +5 -5
- data/views/help.haml +1 -2
- data/views/layout.haml +1 -0
- data/views/patterns.haml +10 -5
- data/views/samples.haml +46 -22
- metadata +6 -6
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
10
10
|
p.author = "Jose Ignacio"
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.ignore_pattern = ["pkg/*"]
|
13
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.
|
13
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.9'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
|
14
14
|
end
|
15
15
|
|
16
16
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -33,7 +33,7 @@ module Scrappy
|
|
33
33
|
opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
|
34
34
|
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
35
35
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
36
|
-
opts.on('-u', '--debug')
|
36
|
+
opts.on('-u [KEY]', '--debug [KEY]') { |key| Agent::Options.debug = true; Agent::Options.debug_key = key.downcase if key }
|
37
37
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
38
38
|
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
39
39
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
@@ -71,7 +71,7 @@ module Scrappy
|
|
71
71
|
end
|
72
72
|
|
73
73
|
def self.quit
|
74
|
-
puts "\"#{Quotes
|
74
|
+
puts "\"#{Quotes[rand(Quotes.length)]}\"" unless Options.quiet
|
75
75
|
exit
|
76
76
|
end
|
77
77
|
|
@@ -103,26 +103,61 @@ module Scrappy
|
|
103
103
|
def self.editable_kb?
|
104
104
|
@editable_kb
|
105
105
|
end
|
106
|
-
def self.
|
106
|
+
def self.add_patterns graph
|
107
107
|
new_patterns = Scrappy::Kb.patterns.merge graph
|
108
108
|
save_patterns new_patterns
|
109
109
|
onload
|
110
110
|
end
|
111
111
|
def self.save_patterns new_patterns
|
112
|
-
|
112
|
+
fragments = case new_patterns
|
113
|
+
when Array then
|
114
|
+
new_patterns
|
115
|
+
when RDF::Graph then
|
116
|
+
( new_patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) - new_patterns.find([], Node('sc:subfragment'), nil) ).
|
117
|
+
map { |f| RDF::Graph.new(f.all_triples) }
|
118
|
+
end
|
119
|
+
|
120
|
+
content = RDF::ID.ns.map{|k,v| "#{k}: #{v}\n"} * ''
|
121
|
+
fragments.each { |f| content += f.serialize(:yarf, false) }
|
122
|
+
open(@patterns_file, "w") { |f| f.write content }
|
123
|
+
end
|
124
|
+
def self.delete_patterns
|
125
|
+
graph = Scrappy::Kb.patterns
|
126
|
+
graph.triples = []
|
127
|
+
content = graph.serialize(:yarf)
|
128
|
+
open(@patterns_file, "w") { |f| f.write content }
|
129
|
+
onload
|
113
130
|
end
|
114
|
-
def self.delete_pattern
|
115
|
-
graph
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
open(@patterns_file, "w") { |f| f.write
|
131
|
+
def self.delete_pattern id
|
132
|
+
graph = Scrappy::Kb.patterns
|
133
|
+
fragment = graph[id]
|
134
|
+
graph.triples -= fragment.all_triples
|
135
|
+
content = graph.serialize(:yarf)
|
136
|
+
open(@patterns_file, "w") { |f| f.write content }
|
120
137
|
onload
|
121
138
|
end
|
122
139
|
def self.add_extractor graph
|
123
140
|
open(File.join(@extractors_folder,"extractor_#{Dir[File.join(@extractors_folder,'*')].size}.yarf"), "w") { |f| f.write graph.serialize(:yarf) }
|
124
141
|
onload
|
125
142
|
end
|
143
|
+
def self.replace_extractor graph, samples
|
144
|
+
kb = Scrappy::Kb.extractors
|
145
|
+
|
146
|
+
all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
147
|
+
all_fragments.each do |fragment|
|
148
|
+
fragment.sc::selector.each do |selector|
|
149
|
+
next unless ( selector.rdf::type.include?(Node('sc:UriSelector')) or
|
150
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) )
|
151
|
+
|
152
|
+
samples.each do |sample|
|
153
|
+
selector.rdf::value.each do |uri|
|
154
|
+
delete_extractor(uri) if !kb.node(selector).filter(:uri=>sample[:uri]).empty?
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
add_extractor graph
|
160
|
+
end
|
126
161
|
def self.delete_extractor uri
|
127
162
|
Dir[File.join(@extractors_folder, '*')].each do |file|
|
128
163
|
format = file.split('.').last.to_sym
|
@@ -133,9 +168,12 @@ module Scrappy
|
|
133
168
|
flatten.select do |uri_selector|
|
134
169
|
uri_selector.rdf::value.include?(uri)
|
135
170
|
end
|
171
|
+
next if uri_selectors.empty?
|
136
172
|
fragments = uri_selectors.map { |uri_selector| graph.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
137
173
|
fragments.each { |fragment| graph.triples -= fragment.all_triples }
|
138
|
-
|
174
|
+
text = graph.serialize(format)
|
175
|
+
open(file, "w") { |f| f.write text } if fragments.any?
|
176
|
+
File.delete(file) if text==""
|
139
177
|
end
|
140
178
|
onload
|
141
179
|
end
|
@@ -165,7 +203,7 @@ Options
|
|
165
203
|
-l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
|
166
204
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
167
205
|
-D, --dump Dumps RDF data to disk
|
168
|
-
-u, --debug
|
206
|
+
-u, --debug [KEYWORD] Shows debugging traces. Use optional keyword to filter selectors' output
|
169
207
|
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
170
208
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
171
209
|
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
@@ -234,15 +272,15 @@ Copyright
|
|
234
272
|
# Load knowledge base
|
235
273
|
Agent::Options.kb ||= RDF::Graph.new
|
236
274
|
|
237
|
-
Kb.extractors, Kb.patterns = if File.exists?(@cache_file) and File.mtime(@cache_file)
|
275
|
+
Kb.extractors, Kb.patterns, RDF::ID::count = if File.exists?(@cache_file) and File.mtime(@cache_file) > Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
|
238
276
|
# Just load kb from cache
|
239
277
|
open(@cache_file) { |f| Marshal.load(f) }
|
240
278
|
else
|
241
279
|
# Load YARF files and cache kb
|
242
280
|
extractors = load_files_from(@extractors_folder)
|
243
281
|
patterns = File.exists?(@patterns_file) ? RDF::Parser.parse(:yarf, open(@patterns_file).read) : RDF::Graph.new
|
244
|
-
open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns], f) }
|
245
|
-
[extractors, patterns]
|
282
|
+
open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns, RDF::ID::count], f) }
|
283
|
+
[extractors, patterns, RDF::ID::count]
|
246
284
|
end
|
247
285
|
|
248
286
|
# Sets new kb
|
@@ -315,7 +353,7 @@ Dogs have owners, cats have staff
|
|
315
353
|
I put all my genius into my life; I put only my talent into my works
|
316
354
|
It is better to be beautiful than to be good, but it is better to be good than to be ugly
|
317
355
|
All human beings, by nature, desire to know
|
318
|
-
All life is an experiment
|
356
|
+
All life is an experiment. The more experiments you make the better
|
319
357
|
An investment in knowledge always pays the best interest
|
320
358
|
An optimist is a person who sees a green light everywhere. The pessimist sees only the red light. But the truly wise person is color blind
|
321
359
|
Chance favors only those who court her
|
@@ -347,7 +385,27 @@ The man who does things makes many mistakes, but he never makes the biggest mist
|
|
347
385
|
The man who makes no mistakes does not usually make anything
|
348
386
|
The results you achieve will be in direct proportion to the effort you apply
|
349
387
|
The reward of a thing well done is to have done it
|
350
|
-
Don’t argue with idiots. They will bring you down to their level and beat you with experience
|
388
|
+
Don’t argue with idiots. They will bring you down to their level and beat you with experience
|
389
|
+
Choose a work you love, and you will never have to work a day in your life
|
390
|
+
The secret of creativity is knowing how to hide your sources
|
391
|
+
I never think of the future. It comes soon enough
|
392
|
+
If you want to go quick, go alone. If you want to go far, go together
|
393
|
+
The only thing that interferes with my learning is my education
|
394
|
+
Excesive literary production is a social offense
|
395
|
+
A man who dares to waste one hour of time has not discovered the value of life
|
396
|
+
Any idiot can face a crisis -- it's day to day living that wears you out
|
397
|
+
Every man dies. Not every man really lives
|
398
|
+
After two weeks of working on a project, you know whether it will work or not
|
399
|
+
All things are difficult before they are easy
|
400
|
+
Sport is hard work for which you do not get paid
|
401
|
+
Do not hire a man who does your work for money, but him who does it for love of it
|
402
|
+
Failure is success if we learn from it
|
403
|
+
Formal education will make you a living; self-education will make you a fortune
|
404
|
+
Lost time is never found again
|
405
|
+
Men talk of killing time, while time quietly kills them
|
406
|
+
Only entropy comes easy
|
407
|
+
Any man can make mistakes, but only an idiot persists in his error
|
408
|
+
Managing is getting paid for home runs someone else hits""".split("\n")
|
351
409
|
end
|
352
410
|
|
353
411
|
Scrappy::App.run
|
data/lib/scrappy.rb
CHANGED
@@ -21,19 +21,15 @@ module Scrappy
|
|
21
21
|
|
22
22
|
# Extract each fragment
|
23
23
|
options = { :doc => { :uri=>uri, :content=>content }, :referenceable=>referenceable }
|
24
|
-
|
25
|
-
fragments_for(kb, uri).each do |fragment|
|
26
|
-
kb.node(fragment).extract(options).each do |node|
|
27
|
-
triples += node.graph.triples
|
28
|
-
end
|
29
|
-
end
|
24
|
+
output = extract_graph(fragments_for(kb, uri), options)
|
30
25
|
|
31
26
|
puts "done!" if self.options.debug
|
32
27
|
|
33
|
-
triples
|
28
|
+
output.triples
|
34
29
|
end
|
35
30
|
end
|
36
31
|
|
32
|
+
# Returns a list of fragments that have mappings in a given URI
|
37
33
|
def fragments_for kb, uri
|
38
34
|
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
39
35
|
|
@@ -52,7 +48,14 @@ module Scrappy
|
|
52
48
|
|
53
49
|
visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
|
54
50
|
|
55
|
-
(uri_selectors + visual_selectors).map { |selector| fragments[selector] }
|
51
|
+
(uri_selectors + visual_selectors).map { |selector| fragments[selector].proxy }
|
52
|
+
end
|
53
|
+
|
54
|
+
# Extracts all mappings from a list of fragments and returns them in a single graph
|
55
|
+
def extract_graph fragments, options
|
56
|
+
output = RDF::Graph.new
|
57
|
+
fragments.each { |fragment| fragment.extract(options).each { |result| output << result } }
|
58
|
+
output
|
56
59
|
end
|
57
60
|
end
|
58
61
|
end
|
@@ -4,7 +4,9 @@ module Sc
|
|
4
4
|
include Scrappy::Formats
|
5
5
|
|
6
6
|
def select doc
|
7
|
-
if sc::debug.first=="true" and Scrappy::Agent::Options.debug
|
7
|
+
if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
|
8
|
+
(Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
|
9
|
+
|
8
10
|
puts '== DEBUG'
|
9
11
|
puts '== Selector:'
|
10
12
|
puts node.serialize(:yarf, false)
|
@@ -18,7 +20,9 @@ module Sc
|
|
18
20
|
# Filter method is defined in each subclass
|
19
21
|
results = filter doc
|
20
22
|
|
21
|
-
if sc::debug.first=="true" and Scrappy::Agent::Options.debug
|
23
|
+
if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
|
24
|
+
(Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
|
25
|
+
|
22
26
|
puts "== No results" if results.empty?
|
23
27
|
results.each_with_index do |result, i|
|
24
28
|
puts "== Result ##{i}:"
|
@@ -2,7 +2,7 @@ module Sc
|
|
2
2
|
class UriPatternSelector < Selector
|
3
3
|
def filter doc
|
4
4
|
# Check if the uri fits the pattern
|
5
|
-
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
5
|
+
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+').gsub('?', '\?')}\Z/ }
|
6
6
|
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
7
7
|
else
|
8
8
|
[]
|
@@ -1,60 +1,74 @@
|
|
1
1
|
module Sc
|
2
2
|
class VisualSelector < Selector
|
3
|
+
|
4
|
+
def initialize args={}
|
5
|
+
super
|
6
|
+
@cache = {}
|
7
|
+
end
|
8
|
+
|
3
9
|
def filter doc
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
27
|
-
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
28
|
-
|
29
|
-
!node.text? and
|
30
|
-
( !min_relative_x or relative_x >= min_relative_x) and
|
31
|
-
( !max_relative_x or relative_x <= max_relative_x) and
|
32
|
-
( !min_relative_y or relative_y >= min_relative_y) and
|
33
|
-
( !max_relative_y or relative_y <= max_relative_y) and
|
34
|
-
|
35
|
-
( !min_x or node['vx'].to_i >= min_x) and
|
36
|
-
( !max_x or node['vx'].to_i <= max_x) and
|
37
|
-
( !min_y or node['vy'].to_i >= min_y) and
|
38
|
-
( !max_y or node['vy'].to_i <= max_y) and
|
10
|
+
@cache[doc] ||= begin
|
11
|
+
# By initializing variables, we avoid getting data from a hash (slow)
|
12
|
+
min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
|
13
|
+
max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
|
14
|
+
min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
|
15
|
+
max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
|
16
|
+
min_x = (sc::min_x.first.to_i if sc::min_x.first)
|
17
|
+
max_x = (sc::max_x.first.to_i if sc::max_x.first)
|
18
|
+
min_y = (sc::min_y.first.to_i if sc::min_y.first)
|
19
|
+
max_y = (sc::max_y.first.to_i if sc::max_y.first)
|
20
|
+
min_width = (sc::min_width.first.to_i if sc::min_width.first)
|
21
|
+
max_width = (sc::max_width.first.to_i if sc::max_width.first)
|
22
|
+
min_height = (sc::min_height.first.to_i if sc::min_height.first)
|
23
|
+
max_height = (sc::max_height.first.to_i if sc::max_height.first)
|
24
|
+
min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
|
25
|
+
max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
|
26
|
+
min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
|
27
|
+
max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
|
28
|
+
font_family = sc::font_family.first
|
29
|
+
attributes = sc::attribute
|
30
|
+
formats = sc::format
|
31
|
+
tag = sc::tag
|
39
32
|
|
40
|
-
(
|
41
|
-
|
42
|
-
( !min_height or node['vh'].to_i >= min_height) and
|
43
|
-
( !max_height or node['vh'].to_i <= max_height) and
|
33
|
+
elements = doc[:content].search((tag - ["text"]).first || "*")
|
34
|
+
elements += Nokogiri::XML::NodeSet.new(doc[:content].document, [doc[:content]]) if tag.include?(doc[:content].name)
|
44
35
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
elements.select do |node|
|
37
|
+
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
38
|
+
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
39
|
+
|
40
|
+
!node.text? and
|
41
|
+
( (node['vfont'] and node.name!="a" and node.name!="img") or !tag.include?("text") ) and
|
42
|
+
( !min_relative_x or relative_x >= min_relative_x) and
|
43
|
+
( !max_relative_x or relative_x <= max_relative_x) and
|
44
|
+
( !min_relative_y or relative_y >= min_relative_y) and
|
45
|
+
( !max_relative_y or relative_y <= max_relative_y) and
|
46
|
+
|
47
|
+
( !min_x or node['vx'].to_i >= min_x) and
|
48
|
+
( !max_x or node['vx'].to_i <= max_x) and
|
49
|
+
( !min_y or node['vy'].to_i >= min_y) and
|
50
|
+
( !max_y or node['vy'].to_i <= max_y) and
|
51
|
+
|
52
|
+
( !min_width or node['vw'].to_i >= min_width) and
|
53
|
+
( !max_width or node['vw'].to_i <= max_width) and
|
54
|
+
( !min_height or node['vh'].to_i >= min_height) and
|
55
|
+
( !max_height or node['vh'].to_i <= max_height) and
|
56
|
+
|
57
|
+
( !min_font_size or node['vsize'].to_i >= min_font_size) and
|
58
|
+
( !max_font_size or node['vsize'].to_i <= max_font_size) and
|
59
|
+
( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
|
60
|
+
( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
|
61
|
+
( !font_family or node['vfont'] == font_family)
|
62
|
+
end.map do |content|
|
63
|
+
if attributes.first
|
64
|
+
# Select node's attribute if given
|
65
|
+
attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
|
66
|
+
else
|
67
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
|
68
|
+
end
|
69
|
+
end.flatten
|
70
|
+
end
|
58
71
|
end
|
72
|
+
|
59
73
|
end
|
60
74
|
end
|
@@ -1,151 +1,399 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
1
3
|
module Scrappy
|
2
4
|
module Optimizer
|
3
5
|
# Iterates through a knowledge base and tries to merge and generalize
|
4
6
|
# selectors whenever the output of the resulting kb is the same
|
5
|
-
def
|
7
|
+
def optimize_extractors kb, samples
|
6
8
|
# Build an array of fragments
|
7
|
-
|
8
|
-
fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
|
9
|
+
all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
9
10
|
|
10
|
-
|
11
|
-
|
11
|
+
root_superfragments = all_fragments.select do |fragment|
|
12
|
+
fragment.sc::selector.any? do |selector|
|
13
|
+
( selector.rdf::type.include?(Node('sc:UriSelector')) or
|
14
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) ) and
|
15
|
+
samples.any? { |sample| !kb.node(selector).filter(:uri=>sample[:uri]).empty? }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
root_fragments = root_superfragments.map { |f| f.sc::subfragment }.flatten
|
12
19
|
|
13
|
-
# Optimize the
|
14
|
-
fragments =
|
20
|
+
# Optimize the fragments
|
21
|
+
fragments = optimize_all root_fragments, samples, :extractors
|
15
22
|
|
16
|
-
graph
|
17
|
-
|
23
|
+
# Build a graph by adding all fragments to a common URI-selected superfragment
|
24
|
+
superfragment = Node(nil)
|
25
|
+
identifier = Node(nil)
|
26
|
+
selector = uri_selector_for(samples.map { |sample| sample[:uri] })
|
27
|
+
identifier.rdf::type = Node('sc:BaseUriSelector')
|
28
|
+
superfragment.rdf::type = Node('sc:Fragment')
|
29
|
+
superfragment.sc::selector = selector
|
30
|
+
superfragment.sc::identifier = identifier
|
31
|
+
superfragment.graph << selector
|
32
|
+
superfragment.graph << identifier
|
33
|
+
|
34
|
+
triples = fragments.inject([]) do |triples, fragment|
|
35
|
+
triples << [superfragment.id, ID('sc:subfragment'), fragment.id]
|
36
|
+
triples += fragment.all_triples
|
37
|
+
end
|
38
|
+
triples += superfragment.all_triples
|
39
|
+
|
40
|
+
RDF::Graph.new(triples)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Iterates through a knowledge base and tries to merge and generalize
|
44
|
+
# selectors whenever the output of the resulting kb is the same
|
45
|
+
def optimize_patterns kb, samples
|
46
|
+
# Build an array of fragments
|
47
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
48
|
+
|
49
|
+
# Optimize the fragments
|
50
|
+
fragments = optimize_all root_fragments, samples, :patterns
|
18
51
|
|
19
|
-
graph
|
52
|
+
# Build a graph
|
53
|
+
RDF::Graph.new(fragments.inject([]) { |triples, fragment| triples += fragment.all_triples })
|
20
54
|
end
|
21
55
|
|
22
56
|
protected
|
23
|
-
#
|
24
|
-
def
|
25
|
-
#
|
57
|
+
# Optimizes a set of fragments
|
58
|
+
def optimize_all root_fragments, samples, kb_type
|
59
|
+
# Parse the documents
|
60
|
+
docs = samples.map do |sample|
|
61
|
+
output = kb_type==:patterns ? samples[:output] : extract(sample[:uri], sample[:html], Scrappy::Kb.extractors)
|
62
|
+
content = Nokogiri::HTML(sample[:html], nil, 'utf-8')
|
63
|
+
{ :uri=>sample[:uri], :content=>content, :output=>output }
|
64
|
+
end
|
65
|
+
|
66
|
+
# Fragment cloning to use a new common pool for caching intermediate results
|
67
|
+
fragments = []
|
68
|
+
pool = {}
|
69
|
+
root_fragments.each do |f|
|
70
|
+
fragment = Node(f.id, RDF::Graph.new(f.all_triples))
|
71
|
+
fragment.graph.pool = pool
|
72
|
+
fragments << fragment
|
73
|
+
end
|
74
|
+
|
75
|
+
# Iterates until no changes are made
|
26
76
|
@tried = []
|
27
|
-
|
77
|
+
@distances = {}
|
78
|
+
new_fragments = fragments
|
79
|
+
score = 0.0
|
80
|
+
i = 0
|
81
|
+
last_save = 0
|
28
82
|
begin
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
# Tries to perform one optimization of two fragments out of a set of fragments
|
36
|
-
def optimize_once fragments, options
|
37
|
-
docs = options[:docs]
|
38
|
-
fragments.each do |fragment1|
|
39
|
-
fragments.each do |fragment2|
|
40
|
-
next if fragment1 == fragment2
|
41
|
-
# Won't get gain if the fragment does not produce the same kind of RDF resource
|
42
|
-
next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
43
|
-
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
44
|
-
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
45
|
-
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
46
|
-
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
47
|
-
|
48
|
-
next if @tried.include?([fragment1, fragment2])
|
49
|
-
next if @tried.include?([fragment2, fragment1])
|
83
|
+
new_score = score(new_fragments, docs, kb_type)
|
84
|
+
|
85
|
+
if new_score >= score # Improvement after optimization?
|
86
|
+
puts 'Successful optimization' if i > 0
|
87
|
+
score = new_score
|
88
|
+
fragments = new_fragments
|
50
89
|
|
51
|
-
|
90
|
+
# Save to disk
|
91
|
+
if (Time.now - last_save).to_i > 60 and i > 0 and kb_type == :patterns
|
92
|
+
print "Saving..."; $stdout.flush
|
93
|
+
Scrappy::App.save_patterns fragments
|
94
|
+
puts "done!"
|
52
95
|
|
53
|
-
|
54
|
-
old_mappings = []
|
55
|
-
docs.each do |doc|
|
56
|
-
old_mappings += fragment1.all_mappings(:doc=>doc)
|
57
|
-
old_mappings += fragment2.all_mappings(:doc=>doc)
|
96
|
+
last_save = Time.now
|
58
97
|
end
|
59
|
-
|
98
|
+
else
|
99
|
+
puts 'Unsuccessful optimization, rolling back...'
|
100
|
+
end
|
101
|
+
puts
|
102
|
+
puts "Fragments: #{fragments.size}, score: #{score}"
|
103
|
+
puts "Trying optimization #{i+=1}..."
|
104
|
+
new_fragments = optimize fragments
|
105
|
+
end while new_fragments
|
106
|
+
puts 'Optimization finished'
|
107
|
+
|
108
|
+
fragments
|
109
|
+
end
|
110
|
+
|
111
|
+
# Tries to perform one optimization in a set of fragments
|
112
|
+
def optimize fragments
|
113
|
+
fragments.each_with_index do |fragment1, index|
|
114
|
+
fragments[0...index].sort_by { |fragment2| distance(fragment1, fragment2) }.each do |fragment2|
|
115
|
+
next if @tried.include?([fragment1, fragment2]) or @tried.include?([fragment2, fragment1])
|
60
116
|
|
61
|
-
|
62
|
-
new_fragment = mix(fragment1, fragment2, options)
|
117
|
+
new_fragment = group fragment1, fragment2
|
63
118
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
# Optimize subfragments
|
70
|
-
subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
|
71
|
-
subfragments.each { |subfragment| new_fragment.graph << subfragment }
|
72
|
-
new_fragment.sc::subfragment = subfragments.map &:node
|
73
|
-
|
74
|
-
# End if the new fragment returns the same results
|
75
|
-
if true
|
76
|
-
return fragments - [fragment1] - [fragment2] + [new_fragment]
|
77
|
-
end
|
119
|
+
@tried << [fragment1, fragment2]
|
120
|
+
|
121
|
+
# End by including the new fragment in the list and returning it
|
122
|
+
return fragments - [fragment1] - [fragment2] + [new_fragment] if new_fragment
|
78
123
|
end
|
79
124
|
end
|
80
|
-
|
125
|
+
return
|
81
126
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
127
|
+
|
128
|
+
# Groups two fragments into one
|
129
|
+
def group fragment1, fragment2, siblings=true
|
130
|
+
return unless signature(fragment1) == signature(fragment2)
|
85
131
|
|
86
|
-
|
87
|
-
new_fragment
|
88
|
-
new_fragment.
|
132
|
+
new_fragment = Node(nil)
|
133
|
+
new_fragment.rdf::type = Node("sc:Fragment")
|
134
|
+
new_fragment.graph.pool = fragment1.graph.pool
|
89
135
|
new_fragment.sc::type = fragment1.sc::type
|
90
136
|
new_fragment.sc::relation = fragment1.sc::relation
|
91
137
|
new_fragment.sc::superclass = fragment1.sc::superclass
|
92
138
|
new_fragment.sc::sameas = fragment1.sc::sameas
|
93
139
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
else
|
101
|
-
new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
|
102
|
-
new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
|
140
|
+
if fragment1.sc::min_cardinality.first and fragment2.sc::min_cardinality.first
|
141
|
+
if siblings
|
142
|
+
new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
|
143
|
+
else
|
144
|
+
new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
|
145
|
+
end
|
103
146
|
end
|
104
|
-
|
147
|
+
if fragment1.sc::max_cardinality.first and fragment2.sc::max_cardinality.first
|
148
|
+
if siblings
|
149
|
+
new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
|
150
|
+
else
|
151
|
+
new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
105
155
|
# sc:selector
|
106
|
-
|
107
|
-
|
108
|
-
new_fragment.sc::selector =
|
156
|
+
new_selector = merge(*(fragment1.sc::selector + fragment2.sc::selector))
|
157
|
+
return unless new_selector
|
158
|
+
new_fragment.sc::selector = new_selector
|
159
|
+
new_fragment.graph << new_selector
|
109
160
|
|
110
161
|
# sc:identifier
|
111
162
|
if fragment1.sc::identifier.first
|
112
|
-
|
113
|
-
|
114
|
-
new_fragment.sc::identifier =
|
163
|
+
new_identifier = merge(*(fragment1.sc::identifier + fragment2.sc::identifier))
|
164
|
+
return unless new_identifier
|
165
|
+
new_fragment.sc::identifier = new_identifier
|
166
|
+
new_fragment.graph << new_identifier
|
115
167
|
end
|
168
|
+
|
169
|
+
subfragments = mix(fragment1.sc::subfragment, fragment2.sc::subfragment)
|
170
|
+
return unless subfragments
|
171
|
+
|
172
|
+
subfragments.each { |f| return if !f; new_fragment.graph << f }
|
173
|
+
new_fragment.sc::subfragment = subfragments
|
174
|
+
|
175
|
+
puts " new fragment #{new_fragment} (#{short_name(new_fragment)}) out of #{fragment1} and #{fragment2}"
|
116
176
|
|
117
|
-
# All new nodes are expected to be inconsistent after performing
|
118
|
-
# subfragments' extractions. Otherwise, if new nodes are consistent, it means
|
119
|
-
# the output from the mixed fragment is different from the separate fragments
|
120
|
-
# and therefore the generalization has failed, so no mixed fragment is returned
|
121
177
|
new_fragment
|
122
178
|
end
|
123
179
|
|
124
|
-
#
|
125
|
-
def
|
180
|
+
# Mixes and aligns two sets of fragments
|
181
|
+
def mix fragments1, fragments2
|
182
|
+
return unless fragments1.size == fragments2.size
|
183
|
+
|
184
|
+
# Build new fragments
|
185
|
+
used_fragments = []
|
186
|
+
fragments1.map do |fragment1|
|
187
|
+
fragment2 = fragments2.select { |fragment2| signature(fragment1) == signature(fragment2) }.first
|
188
|
+
return unless fragment2
|
189
|
+
return if used_fragments.include?(fragment2)
|
190
|
+
|
191
|
+
used_fragments << fragment2
|
192
|
+
|
193
|
+
group fragment1, fragment2, false
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
def signature fragment
|
198
|
+
[ fragment.sc::type.map(&:to_sym).to_set,
|
199
|
+
fragment.sc::relation.map(&:to_sym).to_set,
|
200
|
+
fragment.sc::superclass.map(&:to_sym).to_set,
|
201
|
+
fragment.sc::sameas.map(&:to_sym).to_set,
|
202
|
+
fragment.sc::identifier.first.nil?,
|
203
|
+
fragment.sc::subfragment.map { |sf| signature(sf) }.to_set ]
|
204
|
+
end
|
205
|
+
|
206
|
+
# Merges a set of selectors, returning a new more general one
|
207
|
+
def merge *selectors
|
126
208
|
selector = Node(nil)
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
209
|
+
if selectors.first.rdf::type.first == Node('sc:XPathSelector')
|
210
|
+
selector.rdf::type = Node('sc:XPathSelector')
|
211
|
+
selector.sc::attribute = selectors.first.sc::attribute
|
212
|
+
selector.sc::text = selectors.map { |s| s.sc::text }.flatten
|
213
|
+
|
214
|
+
xpaths = selectors.map { |s| s.rdf::value }.flatten.map { |s| xpath_for(s) }
|
215
|
+
selector.rdf::value = if selectors.map { |s| s.rdf::value }.uniq.size == 1
|
216
|
+
# All in common
|
217
|
+
selectors.first.rdf::value
|
218
|
+
elsif xpaths.map(&:size).uniq.size == 1
|
219
|
+
# Possible siblings
|
220
|
+
new_xpath = []
|
221
|
+
(0...xpaths.first.size).each do |i|
|
222
|
+
terms = xpaths.map { |xp| xp[i] }
|
223
|
+
tags = terms.map { |term| term[:tag] }.uniq
|
224
|
+
indexes = terms.map { |term| term[:index] }.uniq
|
225
|
+
conditions = terms.map { |term| term[:conditions] }
|
226
|
+
|
227
|
+
tag = tags.size > 1 ? '*' : tags.first
|
228
|
+
index = indexes.first if indexes.size == 1
|
229
|
+
conditions = conditions.inject { |acc, n| acc & n }
|
230
|
+
|
231
|
+
new_xpath << {:tag => tag, :conditions => conditions, :index => index}
|
232
|
+
end
|
233
|
+
xpath_expression_for(new_xpath)
|
234
|
+
else
|
235
|
+
# Nothing in common
|
236
|
+
return
|
237
|
+
nil
|
238
|
+
end
|
239
|
+
elsif selectors.first.rdf::type.first == Node('sc:VisualSelector')
|
240
|
+
selector.rdf::type = Node('sc:VisualSelector')
|
241
|
+
selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
|
242
|
+
selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
|
243
|
+
selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
|
244
|
+
selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
|
245
|
+
selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
|
246
|
+
selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
|
247
|
+
selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
|
248
|
+
selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
|
249
|
+
selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
|
250
|
+
selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
|
251
|
+
selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
|
252
|
+
selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
|
253
|
+
selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
|
254
|
+
selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
|
255
|
+
selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
|
256
|
+
selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
|
257
|
+
selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family.sort }.uniq.size == 1
|
258
|
+
selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag.sort }.uniq.size == 1
|
259
|
+
selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute.sort }.uniq.size == 1
|
260
|
+
end
|
147
261
|
|
148
262
|
selector
|
149
263
|
end
|
264
|
+
|
265
|
+
# Returns the dissimilarity between two fragments, memoized in @distances
# under both orderings of the two fragment ids.
#
# Fragments whose signatures differ are infinitely far apart (that result is
# not cached). Otherwise the distance is the sum of:
# * the selector distance between the first selector of each fragment,
# * the selector distance between their first identifiers, when fragment1
#   has one,
# * the recursive distance of every pair of subfragments matched by
#   signature,
# * a fixed 500.0 penalty for every subfragment left without a partner on
#   either side.
def distance fragment1, fragment2
  cached = @distances[[fragment1.id, fragment2.id]]
  return cached if cached
  return 1/0.0 if signature(fragment1) != signature(fragment2)

  # Distance between the fragments' own selectors
  base = selector_distance(fragment1.sc::selector.first, fragment2.sc::selector.first)
  identifier1 = fragment1.sc::identifier.first
  base += selector_distance(identifier1, fragment2.sc::identifier.first) if identifier1

  # Pair each subfragment of fragment1 with the first still-unmatched
  # subfragment of fragment2 that has the same signature.
  # NOTE(review): matched entries are deleted from the array returned by
  # fragment2.sc::subfragment - confirm that this is a copy and not the
  # fragment's own storage.
  pending = fragment2.sc::subfragment
  children_total = 0.0
  fragment1.sc::subfragment.each do |child1|
    child2 = pending.find { |candidate| signature(child1) == signature(candidate) }
    pending.delete child2

    children_total += child2.nil? ? 500.0 : distance(child1, child2)
  end

  # Whatever is still pending had no partner in fragment1
  total = base + children_total + pending.size*500.0
  @distances[[fragment1.id, fragment2.id]] = total
  @distances[[fragment2.id, fragment1.id]] = total
end
|
286
|
+
|
287
|
+
# Returns a Float measuring how different two selectors are.
#
# For each numeric sc: property the first value of each selector is converted
# with to_i (so a missing value counts as 0) and the absolute difference is
# added to the total. Font size differences are weighted by 100. A differing
# sc:font_family value list adds 100 and a differing sc:tag value list adds
# 500.
def selector_distance selector1, selector2
  a = selector1.sc
  b = selector2.sc
  # Absolute difference between the first values of two property lists
  gap = lambda { |x, y| (x.first.to_i - y.first.to_i).abs }

  total = 0.0

  # Relative position
  total += gap.call(a.min_relative_x, b.min_relative_x)
  total += gap.call(a.max_relative_x, b.max_relative_x)
  total += gap.call(a.min_relative_y, b.min_relative_y)
  total += gap.call(a.max_relative_y, b.max_relative_y)

  # Position
  total += gap.call(a.min_x, b.min_x)
  total += gap.call(a.max_x, b.max_x)
  total += gap.call(a.min_y, b.min_y)
  total += gap.call(a.max_y, b.max_y)

  # Dimensions
  total += gap.call(a.min_width,  b.min_width)
  total += gap.call(a.max_width,  b.max_width)
  total += gap.call(a.min_height, b.min_height)
  total += gap.call(a.max_height, b.max_height)

  # Font: size differences weigh 100 times more than weight differences
  total += gap.call(a.min_font_size,   b.min_font_size) * 100
  total += gap.call(a.max_font_size,   b.max_font_size) * 100
  total += gap.call(a.min_font_weight, b.min_font_weight)
  total += gap.call(a.max_font_weight, b.max_font_weight)

  # Categorical properties add a fixed penalty when their value lists differ
  total += 100 if a.font_family != b.font_family
  total += 500 if a.tag != b.tag

  total
end
|
309
|
+
|
310
|
+
# Returns the mean doc_score of the given fragments over all docs, as a Float.
#
# fragments:: fragments to evaluate; when nil the score is 0.0
# docs::      collection of documents, each passed to doc_score
# kb_type::   forwarded untouched to doc_score
#
# An empty docs collection also scores 0.0. Without that guard the average
# would be 0.0/0.0 (NaN), which cannot be meaningfully compared with other
# scores.
def score fragments, docs, kb_type
  return 0.0 unless fragments
  return 0.0 if docs.empty?

  total = docs.inject(0.0) { |sum, doc| doc_score(fragments, doc, kb_type) + sum }
  total / docs.size.to_f
end
|
314
|
+
|
315
|
+
# Scores the given fragments against a single document.
#
# The proxies of the fragments are run through extract_graph for the document
# and the resulting triples are compared, via metrics (with debug output
# enabled), against the expected triples stored under doc[:output].
#
# Returns the F-score when kb_type is :patterns, and the recall otherwise.
def doc_score fragments, doc, kb_type
  # Hack to reduce symbol creation: put the RDF::ID counter back to the value
  # it had before the extraction.
  saved_count = RDF::ID.count
  proxies     = fragments.map { |fragment| fragment.proxy }
  extracted   = extract_graph(proxies, :doc=>doc).triples
  RDF::ID.count = saved_count

  _precision, recall, fscore = metrics(doc[:output], extracted, true)

  return fscore if kb_type == :patterns
  recall
end
|
325
|
+
|
326
|
+
# Compares an extraction with the expected result.
#
# correct::    collection of expected items (triples)
# extraction:: collection of items actually extracted
# debug::      when true, prints the wrong and missing triples and the
#              resulting F-score
#
# Returns [ precision, recall, fscore ] as Floats. Precision is 1.0 when
# nothing was extracted and recall is 1.0 when nothing was expected. When
# both precision and recall are 0 the F-score is reported as 0.0; the plain
# formula would give 0.0/0.0 (NaN) there.
def metrics correct, extraction, debug=false
  # Number of expected items that were actually extracted
  right = correct.size - (correct - extraction).size

  if debug
    puts " Wrong triples: \n" + RDF::Graph.new(extraction - correct).to_ntriples
    puts " Missing triples: \n" + RDF::Graph.new(correct - extraction).to_ntriples
  end

  precision = extraction.size != 0 ? right/extraction.size.to_f : 1.0
  recall    = correct.size != 0 ? right/correct.size.to_f : 1.0

  # Calculate fscore (harmonic mean), guarding the zero denominator
  denominator = precision + recall
  fscore = denominator > 0 ? 2.0*(recall*precision)/denominator : 0.0

  puts " Fscore: #{fscore}" if debug

  [ precision, recall, fscore ]
end
|
344
|
+
|
345
|
+
private
|
346
|
+
# Builds a short human-readable label for a fragment: the compressed forms
# (RDF::ID.compress) of its first sc:type and first sc:relation, skipping
# whichever is missing, joined by ", ".
def short_name fragment
  ids = [ fragment.sc::type.first, fragment.sc::relation.first ].compact
  ids.map { |id| RDF::ID.compress(id) }.join(", ")
end
|
351
|
+
|
352
|
+
# Builds a selector node that covers all the given URIs.
#
# uris:: non-empty array of URI strings
#
# When every URI is the same, the result is an sc:UriSelector whose rdf:value
# is that URI. Otherwise it is an sc:UriPatternSelector whose rdf:value is the
# longest prefix shared by all URIs (and strictly shorter than each of them)
# followed by "*". URIs without any common prefix yield the pattern "*".
#
# Raises ArgumentError when uris is empty.
def uri_selector_for uris
  raise ArgumentError, "no URIs given" if uris.empty?

  selector = Node(nil)
  if uris.uniq.size == 1
    selector.rdf::type  = Node('sc:UriSelector')
    selector.rdf::value = uris.first
  else
    selector.rdf::type  = Node('sc:UriPatternSelector')
    selector.rdf::value = common_uri_prefix(uris) + "*"
  end
  selector
end

# Returns the longest string that every URI starts with and that is strictly
# shorter than every URI, or "" when there is no such non-empty prefix.
def common_uri_prefix uris
  reference  = uris.first
  min_length = uris.map { |uri| uri.length }.min

  # A prefix as long as the shortest URI can never be strictly shorter than
  # it, so candidates start one character below that length.
  (min_length - 1).downto(1) do |length|
    candidate = reference[0, length]
    return candidate if uris.all? { |uri| uri.start_with?(candidate) }
  end
  ""
end
|
370
|
+
|
371
|
+
# Parses an xpath expression into an array with one hash per step:
#   { :tag=>String, :conditions=>Array of Strings, :index=>Integer or nil }
#
# A step looks like tag, tag[index], tag[cond1 and cond2] or
# tag[cond1 and cond2][index]. A leading "/" is dropped, so "/a/b" and "a/b"
# parse to the same steps.
#
# A purely numeric first bracket (as in "div[2]") is the positional index and
# not a condition; it is returned under :index with empty :conditions.
#
# This is a plain string split: expressions containing "//", or "/" or "["
# inside a condition, are not supported.
def xpath_for expression
  start = expression[0..0] == '/' ? 1 : 0
  # The slice is nil for the bare expression "/", hence the fallback
  terms = expression.split('/')[start..-1] || []

  terms.map do |term|
    tag, first, second = term.split('[')
    first = first.sub(/\]\z/, '') if first   # drop the closing bracket

    if first && first =~ /\A\d+\z/
      # It's the index in fact
      conditions = []
      index      = first
    else
      conditions = first ? first.split(" and ") : []
      index      = second
    end

    { :tag=>tag, :conditions=>conditions, :index=>(index.to_i if index) }
  end
end
|
389
|
+
|
390
|
+
# Serializes an array of xpath steps (hashes with :tag, :conditions and
# :index) back into an xpath expression string.
#
# Each step is rendered as its tag, followed by "[c1 and c2 ...]" when it has
# conditions and by "[index]" when it has an index. Steps are joined with "/"
# and the whole expression is prefixed with "/" unless the first step's tag
# is ".".
def xpath_expression_for xpath
  prefix = xpath.first[:tag] == '.' ? "" : "/"

  steps = xpath.map do |step|
    predicate = step[:conditions].size > 0 ? "[" + step[:conditions].join(' and ') + "]" : ""
    position  = step[:index] ? "[" + step[:index].to_s + "]" : ""
    step[:tag] + predicate + position
  end

  prefix + steps.join('/')
end
|
150
398
|
end
|
151
399
|
end
|