scrappy 0.3.5 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Rakefile +1 -1
- data/bin/scrappy +75 -17
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/extractor.rb +11 -8
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +6 -2
- data/lib/scrappy/extractor/selectors/uri_pattern.rb +1 -1
- data/lib/scrappy/extractor/selectors/visual.rb +66 -52
- data/lib/scrappy/learning/optimizer.rb +355 -107
- data/lib/scrappy/learning/trainer.rb +112 -40
- data/lib/scrappy/server/admin.rb +180 -17
- data/lib/scrappy/support.rb +0 -24
- data/public/javascripts/annotator.js +1 -1
- data/public/stylesheets/application.css +33 -0
- data/scrappy.gemspec +5 -5
- data/views/help.haml +1 -2
- data/views/layout.haml +1 -0
- data/views/patterns.haml +10 -5
- data/views/samples.haml +46 -22
- metadata +6 -6
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
10
10
|
p.author = "Jose Ignacio"
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.ignore_pattern = ["pkg/*"]
|
13
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.
|
13
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.9'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
|
14
14
|
end
|
15
15
|
|
16
16
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -33,7 +33,7 @@ module Scrappy
|
|
33
33
|
opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
|
34
34
|
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
35
35
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
36
|
-
opts.on('-u', '--debug')
|
36
|
+
opts.on('-u [KEY]', '--debug [KEY]') { |key| Agent::Options.debug = true; Agent::Options.debug_key = key.downcase if key }
|
37
37
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
38
38
|
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
39
39
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
@@ -71,7 +71,7 @@ module Scrappy
|
|
71
71
|
end
|
72
72
|
|
73
73
|
def self.quit
|
74
|
-
puts "\"#{Quotes
|
74
|
+
puts "\"#{Quotes[rand(Quotes.length)]}\"" unless Options.quiet
|
75
75
|
exit
|
76
76
|
end
|
77
77
|
|
@@ -103,26 +103,61 @@ module Scrappy
|
|
103
103
|
def self.editable_kb?
|
104
104
|
@editable_kb
|
105
105
|
end
|
106
|
-
def self.
|
106
|
+
def self.add_patterns graph
|
107
107
|
new_patterns = Scrappy::Kb.patterns.merge graph
|
108
108
|
save_patterns new_patterns
|
109
109
|
onload
|
110
110
|
end
|
111
111
|
def self.save_patterns new_patterns
|
112
|
-
|
112
|
+
fragments = case new_patterns
|
113
|
+
when Array then
|
114
|
+
new_patterns
|
115
|
+
when RDF::Graph then
|
116
|
+
( new_patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) - new_patterns.find([], Node('sc:subfragment'), nil) ).
|
117
|
+
map { |f| RDF::Graph.new(f.all_triples) }
|
118
|
+
end
|
119
|
+
|
120
|
+
content = RDF::ID.ns.map{|k,v| "#{k}: #{v}\n"} * ''
|
121
|
+
fragments.each { |f| content += f.serialize(:yarf, false) }
|
122
|
+
open(@patterns_file, "w") { |f| f.write content }
|
123
|
+
end
|
124
|
+
def self.delete_patterns
|
125
|
+
graph = Scrappy::Kb.patterns
|
126
|
+
graph.triples = []
|
127
|
+
content = graph.serialize(:yarf)
|
128
|
+
open(@patterns_file, "w") { |f| f.write content }
|
129
|
+
onload
|
113
130
|
end
|
114
|
-
def self.delete_pattern
|
115
|
-
graph
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
open(@patterns_file, "w") { |f| f.write
|
131
|
+
def self.delete_pattern id
|
132
|
+
graph = Scrappy::Kb.patterns
|
133
|
+
fragment = graph[id]
|
134
|
+
graph.triples -= fragment.all_triples
|
135
|
+
content = graph.serialize(:yarf)
|
136
|
+
open(@patterns_file, "w") { |f| f.write content }
|
120
137
|
onload
|
121
138
|
end
|
122
139
|
def self.add_extractor graph
|
123
140
|
open(File.join(@extractors_folder,"extractor_#{Dir[File.join(@extractors_folder,'*')].size}.yarf"), "w") { |f| f.write graph.serialize(:yarf) }
|
124
141
|
onload
|
125
142
|
end
|
143
|
+
def self.replace_extractor graph, samples
|
144
|
+
kb = Scrappy::Kb.extractors
|
145
|
+
|
146
|
+
all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
147
|
+
all_fragments.each do |fragment|
|
148
|
+
fragment.sc::selector.each do |selector|
|
149
|
+
next unless ( selector.rdf::type.include?(Node('sc:UriSelector')) or
|
150
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) )
|
151
|
+
|
152
|
+
samples.each do |sample|
|
153
|
+
selector.rdf::value.each do |uri|
|
154
|
+
delete_extractor(uri) if !kb.node(selector).filter(:uri=>sample[:uri]).empty?
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
add_extractor graph
|
160
|
+
end
|
126
161
|
def self.delete_extractor uri
|
127
162
|
Dir[File.join(@extractors_folder, '*')].each do |file|
|
128
163
|
format = file.split('.').last.to_sym
|
@@ -133,9 +168,12 @@ module Scrappy
|
|
133
168
|
flatten.select do |uri_selector|
|
134
169
|
uri_selector.rdf::value.include?(uri)
|
135
170
|
end
|
171
|
+
next if uri_selectors.empty?
|
136
172
|
fragments = uri_selectors.map { |uri_selector| graph.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
137
173
|
fragments.each { |fragment| graph.triples -= fragment.all_triples }
|
138
|
-
|
174
|
+
text = graph.serialize(format)
|
175
|
+
open(file, "w") { |f| f.write text } if fragments.any?
|
176
|
+
File.delete(file) if text==""
|
139
177
|
end
|
140
178
|
onload
|
141
179
|
end
|
@@ -165,7 +203,7 @@ Options
|
|
165
203
|
-l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
|
166
204
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
167
205
|
-D, --dump Dumps RDF data to disk
|
168
|
-
-u, --debug
|
206
|
+
-u, --debug [KEYWORD] Shows debugging traces. Use optional keyword to filter selectors' output
|
169
207
|
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
170
208
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
171
209
|
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
@@ -234,15 +272,15 @@ Copyright
|
|
234
272
|
# Load knowledge base
|
235
273
|
Agent::Options.kb ||= RDF::Graph.new
|
236
274
|
|
237
|
-
Kb.extractors, Kb.patterns = if File.exists?(@cache_file) and File.mtime(@cache_file)
|
275
|
+
Kb.extractors, Kb.patterns, RDF::ID::count = if File.exists?(@cache_file) and File.mtime(@cache_file) > Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
|
238
276
|
# Just load kb from cache
|
239
277
|
open(@cache_file) { |f| Marshal.load(f) }
|
240
278
|
else
|
241
279
|
# Load YARF files and cache kb
|
242
280
|
extractors = load_files_from(@extractors_folder)
|
243
281
|
patterns = File.exists?(@patterns_file) ? RDF::Parser.parse(:yarf, open(@patterns_file).read) : RDF::Graph.new
|
244
|
-
open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns], f) }
|
245
|
-
[extractors, patterns]
|
282
|
+
open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns, RDF::ID::count], f) }
|
283
|
+
[extractors, patterns, RDF::ID::count]
|
246
284
|
end
|
247
285
|
|
248
286
|
# Sets new kb
|
@@ -315,7 +353,7 @@ Dogs have owners, cats have staff
|
|
315
353
|
I put all my genius into my life; I put only my talent into my works
|
316
354
|
It is better to be beautiful than to be good, but it is better to be good than to be ugly
|
317
355
|
All human beings, by nature, desire to know
|
318
|
-
All life is an experiment
|
356
|
+
All life is an experiment. The more experiments you make the better
|
319
357
|
An investment in knowledge always pays the best interest
|
320
358
|
An optimist is a person who sees a green light everywhere. The pessimist sees only the red light. But the truly wise person is color blind
|
321
359
|
Chance favors only those who court her
|
@@ -347,7 +385,27 @@ The man who does things makes many mistakes, but he never makes the biggest mist
|
|
347
385
|
The man who makes no mistakes does not usually make anything
|
348
386
|
The results you achieve will be in direct proportion to the effort you apply
|
349
387
|
The reward of a thing well done is to have done it
|
350
|
-
Don’t argue with idiots. They will bring you down to their level and beat you with experience
|
388
|
+
Don’t argue with idiots. They will bring you down to their level and beat you with experience
|
389
|
+
Choose a work you love, and you will never have to work a day in your life
|
390
|
+
The secret of creativity is knowing how to hide your sources
|
391
|
+
I never think of the future. It comes soon enough
|
392
|
+
If you want to go quick, go alone. If you want to go far, go together
|
393
|
+
The only thing that interferes with my learning is my education
|
394
|
+
Excesive literary production is a social offense
|
395
|
+
A man who dares to waste one hour of time has not discovered the value of life
|
396
|
+
Any idiot can face a crisis -- it's day to day living that wears you out
|
397
|
+
Every man dies. Not every man really lives
|
398
|
+
After two weeks of working on a project, you know whether it will work or not
|
399
|
+
All things are difficult before they are easy
|
400
|
+
Sport is hard work for which you do not get paid
|
401
|
+
Do not hire a man who does your work for money, but him who does it for love of it
|
402
|
+
Failure is success if we learn from it
|
403
|
+
Formal education will make you a living; self-education will make you a fortune
|
404
|
+
Lost time is never found again
|
405
|
+
Men talk of killing time, while time quietly kills them
|
406
|
+
Only entropy comes easy
|
407
|
+
Any man can make mistakes, but only an idiot persists in his error
|
408
|
+
Managing is getting paid for home runs someone else hits""".split("\n")
|
351
409
|
end
|
352
410
|
|
353
411
|
Scrappy::App.run
|
data/lib/scrappy.rb
CHANGED
@@ -21,19 +21,15 @@ module Scrappy
|
|
21
21
|
|
22
22
|
# Extract each fragment
|
23
23
|
options = { :doc => { :uri=>uri, :content=>content }, :referenceable=>referenceable }
|
24
|
-
|
25
|
-
fragments_for(kb, uri).each do |fragment|
|
26
|
-
kb.node(fragment).extract(options).each do |node|
|
27
|
-
triples += node.graph.triples
|
28
|
-
end
|
29
|
-
end
|
24
|
+
output = extract_graph(fragments_for(kb, uri), options)
|
30
25
|
|
31
26
|
puts "done!" if self.options.debug
|
32
27
|
|
33
|
-
triples
|
28
|
+
output.triples
|
34
29
|
end
|
35
30
|
end
|
36
31
|
|
32
|
+
# Returns a list of fragments that have mappings in a given URI
|
37
33
|
def fragments_for kb, uri
|
38
34
|
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
39
35
|
|
@@ -52,7 +48,14 @@ module Scrappy
|
|
52
48
|
|
53
49
|
visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
|
54
50
|
|
55
|
-
(uri_selectors + visual_selectors).map { |selector| fragments[selector] }
|
51
|
+
(uri_selectors + visual_selectors).map { |selector| fragments[selector].proxy }
|
52
|
+
end
|
53
|
+
|
54
|
+
# Extracts all mappings from a fragment and returns a graph
|
55
|
+
def extract_graph fragments, options
|
56
|
+
output = RDF::Graph.new
|
57
|
+
fragments.each { |fragment| fragment.extract(options).each { |result| output << result } }
|
58
|
+
output
|
56
59
|
end
|
57
60
|
end
|
58
61
|
end
|
@@ -4,7 +4,9 @@ module Sc
|
|
4
4
|
include Scrappy::Formats
|
5
5
|
|
6
6
|
def select doc
|
7
|
-
if sc::debug.first=="true" and Scrappy::Agent::Options.debug
|
7
|
+
if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
|
8
|
+
(Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
|
9
|
+
|
8
10
|
puts '== DEBUG'
|
9
11
|
puts '== Selector:'
|
10
12
|
puts node.serialize(:yarf, false)
|
@@ -18,7 +20,9 @@ module Sc
|
|
18
20
|
# Filter method is defined in each subclass
|
19
21
|
results = filter doc
|
20
22
|
|
21
|
-
if sc::debug.first=="true" and Scrappy::Agent::Options.debug
|
23
|
+
if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
|
24
|
+
(Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
|
25
|
+
|
22
26
|
puts "== No results" if results.empty?
|
23
27
|
results.each_with_index do |result, i|
|
24
28
|
puts "== Result ##{i}:"
|
@@ -2,7 +2,7 @@ module Sc
|
|
2
2
|
class UriPatternSelector < Selector
|
3
3
|
def filter doc
|
4
4
|
# Check if the uri fits the pattern
|
5
|
-
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
5
|
+
if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+').gsub('?', '\?')}\Z/ }
|
6
6
|
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
7
7
|
else
|
8
8
|
[]
|
@@ -1,60 +1,74 @@
|
|
1
1
|
module Sc
|
2
2
|
class VisualSelector < Selector
|
3
|
+
|
4
|
+
def initialize args={}
|
5
|
+
super
|
6
|
+
@cache = {}
|
7
|
+
end
|
8
|
+
|
3
9
|
def filter doc
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
27
|
-
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
28
|
-
|
29
|
-
!node.text? and
|
30
|
-
( !min_relative_x or relative_x >= min_relative_x) and
|
31
|
-
( !max_relative_x or relative_x <= max_relative_x) and
|
32
|
-
( !min_relative_y or relative_y >= min_relative_y) and
|
33
|
-
( !max_relative_y or relative_y <= max_relative_y) and
|
34
|
-
|
35
|
-
( !min_x or node['vx'].to_i >= min_x) and
|
36
|
-
( !max_x or node['vx'].to_i <= max_x) and
|
37
|
-
( !min_y or node['vy'].to_i >= min_y) and
|
38
|
-
( !max_y or node['vy'].to_i <= max_y) and
|
10
|
+
@cache[doc] ||= begin
|
11
|
+
# By initializing variables, we avoid getting data from a hash (slow)
|
12
|
+
min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
|
13
|
+
max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
|
14
|
+
min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
|
15
|
+
max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
|
16
|
+
min_x = (sc::min_x.first.to_i if sc::min_x.first)
|
17
|
+
max_x = (sc::max_x.first.to_i if sc::max_x.first)
|
18
|
+
min_y = (sc::min_y.first.to_i if sc::min_y.first)
|
19
|
+
max_y = (sc::max_y.first.to_i if sc::max_y.first)
|
20
|
+
min_width = (sc::min_width.first.to_i if sc::min_width.first)
|
21
|
+
max_width = (sc::max_width.first.to_i if sc::max_width.first)
|
22
|
+
min_height = (sc::min_height.first.to_i if sc::min_height.first)
|
23
|
+
max_height = (sc::max_height.first.to_i if sc::max_height.first)
|
24
|
+
min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
|
25
|
+
max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
|
26
|
+
min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
|
27
|
+
max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
|
28
|
+
font_family = sc::font_family.first
|
29
|
+
attributes = sc::attribute
|
30
|
+
formats = sc::format
|
31
|
+
tag = sc::tag
|
39
32
|
|
40
|
-
(
|
41
|
-
|
42
|
-
( !min_height or node['vh'].to_i >= min_height) and
|
43
|
-
( !max_height or node['vh'].to_i <= max_height) and
|
33
|
+
elements = doc[:content].search((tag - ["text"]).first || "*")
|
34
|
+
elements += Nokogiri::XML::NodeSet.new(doc[:content].document, [doc[:content]]) if tag.include?(doc[:content].name)
|
44
35
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
elements.select do |node|
|
37
|
+
relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
|
38
|
+
relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
|
39
|
+
|
40
|
+
!node.text? and
|
41
|
+
( (node['vfont'] and node.name!="a" and node.name!="img") or !tag.include?("text") ) and
|
42
|
+
( !min_relative_x or relative_x >= min_relative_x) and
|
43
|
+
( !max_relative_x or relative_x <= max_relative_x) and
|
44
|
+
( !min_relative_y or relative_y >= min_relative_y) and
|
45
|
+
( !max_relative_y or relative_y <= max_relative_y) and
|
46
|
+
|
47
|
+
( !min_x or node['vx'].to_i >= min_x) and
|
48
|
+
( !max_x or node['vx'].to_i <= max_x) and
|
49
|
+
( !min_y or node['vy'].to_i >= min_y) and
|
50
|
+
( !max_y or node['vy'].to_i <= max_y) and
|
51
|
+
|
52
|
+
( !min_width or node['vw'].to_i >= min_width) and
|
53
|
+
( !max_width or node['vw'].to_i <= max_width) and
|
54
|
+
( !min_height or node['vh'].to_i >= min_height) and
|
55
|
+
( !max_height or node['vh'].to_i <= max_height) and
|
56
|
+
|
57
|
+
( !min_font_size or node['vsize'].to_i >= min_font_size) and
|
58
|
+
( !max_font_size or node['vsize'].to_i <= max_font_size) and
|
59
|
+
( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
|
60
|
+
( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
|
61
|
+
( !font_family or node['vfont'] == font_family)
|
62
|
+
end.map do |content|
|
63
|
+
if attributes.first
|
64
|
+
# Select node's attribute if given
|
65
|
+
attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
|
66
|
+
else
|
67
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
|
68
|
+
end
|
69
|
+
end.flatten
|
70
|
+
end
|
58
71
|
end
|
72
|
+
|
59
73
|
end
|
60
74
|
end
|
@@ -1,151 +1,399 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
1
3
|
module Scrappy
|
2
4
|
module Optimizer
|
3
5
|
# Iterates through a knowledge base and tries to merge and generalize
|
4
6
|
# selectors whenever the output of the resulting kb is the same
|
5
|
-
def
|
7
|
+
def optimize_extractors kb, samples
|
6
8
|
# Build an array of fragments
|
7
|
-
|
8
|
-
fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
|
9
|
+
all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
9
10
|
|
10
|
-
|
11
|
-
|
11
|
+
root_superfragments = all_fragments.select do |fragment|
|
12
|
+
fragment.sc::selector.any? do |selector|
|
13
|
+
( selector.rdf::type.include?(Node('sc:UriSelector')) or
|
14
|
+
selector.rdf::type.include?(Node('sc:UriPatternSelector')) ) and
|
15
|
+
samples.any? { |sample| !kb.node(selector).filter(:uri=>sample[:uri]).empty? }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
root_fragments = root_superfragments.map { |f| f.sc::subfragment }.flatten
|
12
19
|
|
13
|
-
# Optimize the
|
14
|
-
fragments =
|
20
|
+
# Optimize the fragments
|
21
|
+
fragments = optimize_all root_fragments, samples, :extractors
|
15
22
|
|
16
|
-
graph
|
17
|
-
|
23
|
+
# Build a graph by adding all fragments to a common URI-selected superfragment
|
24
|
+
superfragment = Node(nil)
|
25
|
+
identifier = Node(nil)
|
26
|
+
selector = uri_selector_for(samples.map { |sample| sample[:uri] })
|
27
|
+
identifier.rdf::type = Node('sc:BaseUriSelector')
|
28
|
+
superfragment.rdf::type = Node('sc:Fragment')
|
29
|
+
superfragment.sc::selector = selector
|
30
|
+
superfragment.sc::identifier = identifier
|
31
|
+
superfragment.graph << selector
|
32
|
+
superfragment.graph << identifier
|
33
|
+
|
34
|
+
triples = fragments.inject([]) do |triples, fragment|
|
35
|
+
triples << [superfragment.id, ID('sc:subfragment'), fragment.id]
|
36
|
+
triples += fragment.all_triples
|
37
|
+
end
|
38
|
+
triples += superfragment.all_triples
|
39
|
+
|
40
|
+
RDF::Graph.new(triples)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Iterates through a knowledge base and tries to merge and generalize
|
44
|
+
# selectors whenever the output of the resulting kb is the same
|
45
|
+
def optimize_patterns kb, samples
|
46
|
+
# Build an array of fragments
|
47
|
+
root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
|
48
|
+
|
49
|
+
# Optimize the fragments
|
50
|
+
fragments = optimize_all root_fragments, samples, :patterns
|
18
51
|
|
19
|
-
graph
|
52
|
+
# Build a graph
|
53
|
+
RDF::Graph.new(fragments.inject([]) { |triples, fragment| triples += fragment.all_triples })
|
20
54
|
end
|
21
55
|
|
22
56
|
protected
|
23
|
-
#
|
24
|
-
def
|
25
|
-
#
|
57
|
+
# Optimizes a set of fragments
|
58
|
+
def optimize_all root_fragments, samples, kb_type
|
59
|
+
# Parse the documents
|
60
|
+
docs = samples.map do |sample|
|
61
|
+
output = kb_type==:patterns ? samples[:output] : extract(sample[:uri], sample[:html], Scrappy::Kb.extractors)
|
62
|
+
content = Nokogiri::HTML(sample[:html], nil, 'utf-8')
|
63
|
+
{ :uri=>sample[:uri], :content=>content, :output=>output }
|
64
|
+
end
|
65
|
+
|
66
|
+
# Fragment cloning to use a new common pool for caching intermediate results
|
67
|
+
fragments = []
|
68
|
+
pool = {}
|
69
|
+
root_fragments.each do |f|
|
70
|
+
fragment = Node(f.id, RDF::Graph.new(f.all_triples))
|
71
|
+
fragment.graph.pool = pool
|
72
|
+
fragments << fragment
|
73
|
+
end
|
74
|
+
|
75
|
+
# Iterates until no changes are made
|
26
76
|
@tried = []
|
27
|
-
|
77
|
+
@distances = {}
|
78
|
+
new_fragments = fragments
|
79
|
+
score = 0.0
|
80
|
+
i = 0
|
81
|
+
last_save = 0
|
28
82
|
begin
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
# Tries to perform one optimization of two fragments out of a set of fragments
|
36
|
-
def optimize_once fragments, options
|
37
|
-
docs = options[:docs]
|
38
|
-
fragments.each do |fragment1|
|
39
|
-
fragments.each do |fragment2|
|
40
|
-
next if fragment1 == fragment2
|
41
|
-
# Won't get gain if the fragment does not produce the same kind of RDF resource
|
42
|
-
next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
|
43
|
-
!fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
|
44
|
-
!fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
|
45
|
-
!fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
|
46
|
-
fragment1.sc::identifier.size != fragment2.sc::identifier.size
|
47
|
-
|
48
|
-
next if @tried.include?([fragment1, fragment2])
|
49
|
-
next if @tried.include?([fragment2, fragment1])
|
83
|
+
new_score = score(new_fragments, docs, kb_type)
|
84
|
+
|
85
|
+
if new_score >= score # Improvement after optimization?
|
86
|
+
puts 'Successful optimization' if i > 0
|
87
|
+
score = new_score
|
88
|
+
fragments = new_fragments
|
50
89
|
|
51
|
-
|
90
|
+
# Save to disk
|
91
|
+
if (Time.now - last_save).to_i > 60 and i > 0 and kb_type == :patterns
|
92
|
+
print "Saving..."; $stdout.flush
|
93
|
+
Scrappy::App.save_patterns fragments
|
94
|
+
puts "done!"
|
52
95
|
|
53
|
-
|
54
|
-
old_mappings = []
|
55
|
-
docs.each do |doc|
|
56
|
-
old_mappings += fragment1.all_mappings(:doc=>doc)
|
57
|
-
old_mappings += fragment2.all_mappings(:doc=>doc)
|
96
|
+
last_save = Time.now
|
58
97
|
end
|
59
|
-
|
98
|
+
else
|
99
|
+
puts 'Unsuccessful optimization, rolling back...'
|
100
|
+
end
|
101
|
+
puts
|
102
|
+
puts "Fragments: #{fragments.size}, score: #{score}"
|
103
|
+
puts "Trying optimization #{i+=1}..."
|
104
|
+
new_fragments = optimize fragments
|
105
|
+
end while new_fragments
|
106
|
+
puts 'Optimization finished'
|
107
|
+
|
108
|
+
fragments
|
109
|
+
end
|
110
|
+
|
111
|
+
# Tries to perform one optimization in a set of fragments
|
112
|
+
def optimize fragments
|
113
|
+
fragments.each_with_index do |fragment1, index|
|
114
|
+
fragments[0...index].sort_by { |fragment2| distance(fragment1, fragment2) }.each do |fragment2|
|
115
|
+
next if @tried.include?([fragment1, fragment2]) or @tried.include?([fragment2, fragment1])
|
60
116
|
|
61
|
-
|
62
|
-
new_fragment = mix(fragment1, fragment2, options)
|
117
|
+
new_fragment = group fragment1, fragment2
|
63
118
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
# Optimize subfragments
|
70
|
-
subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
|
71
|
-
subfragments.each { |subfragment| new_fragment.graph << subfragment }
|
72
|
-
new_fragment.sc::subfragment = subfragments.map &:node
|
73
|
-
|
74
|
-
# End if the new fragment returns the same results
|
75
|
-
if true
|
76
|
-
return fragments - [fragment1] - [fragment2] + [new_fragment]
|
77
|
-
end
|
119
|
+
@tried << [fragment1, fragment2]
|
120
|
+
|
121
|
+
# End by including the new fragment in the list and returning it
|
122
|
+
return fragments - [fragment1] - [fragment2] + [new_fragment] if new_fragment
|
78
123
|
end
|
79
124
|
end
|
80
|
-
|
125
|
+
return
|
81
126
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
127
|
+
|
128
|
+
# Groups two fragments into one
|
129
|
+
def group fragment1, fragment2, siblings=true
|
130
|
+
return unless signature(fragment1) == signature(fragment2)
|
85
131
|
|
86
|
-
|
87
|
-
new_fragment
|
88
|
-
new_fragment.
|
132
|
+
new_fragment = Node(nil)
|
133
|
+
new_fragment.rdf::type = Node("sc:Fragment")
|
134
|
+
new_fragment.graph.pool = fragment1.graph.pool
|
89
135
|
new_fragment.sc::type = fragment1.sc::type
|
90
136
|
new_fragment.sc::relation = fragment1.sc::relation
|
91
137
|
new_fragment.sc::superclass = fragment1.sc::superclass
|
92
138
|
new_fragment.sc::sameas = fragment1.sc::sameas
|
93
139
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
else
|
101
|
-
new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
|
102
|
-
new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
|
140
|
+
if fragment1.sc::min_cardinality.first and fragment2.sc::min_cardinality.first
|
141
|
+
if siblings
|
142
|
+
new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
|
143
|
+
else
|
144
|
+
new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
|
145
|
+
end
|
103
146
|
end
|
104
|
-
|
147
|
+
if fragment1.sc::max_cardinality.first and fragment2.sc::max_cardinality.first
|
148
|
+
if siblings
|
149
|
+
new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
|
150
|
+
else
|
151
|
+
new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
105
155
|
# sc:selector
|
106
|
-
|
107
|
-
|
108
|
-
new_fragment.sc::selector =
|
156
|
+
new_selector = merge(*(fragment1.sc::selector + fragment2.sc::selector))
|
157
|
+
return unless new_selector
|
158
|
+
new_fragment.sc::selector = new_selector
|
159
|
+
new_fragment.graph << new_selector
|
109
160
|
|
110
161
|
# sc:identifier
|
111
162
|
if fragment1.sc::identifier.first
|
112
|
-
|
113
|
-
|
114
|
-
new_fragment.sc::identifier =
|
163
|
+
new_identifier = merge(*(fragment1.sc::identifier + fragment2.sc::identifier))
|
164
|
+
return unless new_identifier
|
165
|
+
new_fragment.sc::identifier = new_identifier
|
166
|
+
new_fragment.graph << new_identifier
|
115
167
|
end
|
168
|
+
|
169
|
+
subfragments = mix(fragment1.sc::subfragment, fragment2.sc::subfragment)
|
170
|
+
return unless subfragments
|
171
|
+
|
172
|
+
subfragments.each { |f| return if !f; new_fragment.graph << f }
|
173
|
+
new_fragment.sc::subfragment = subfragments
|
174
|
+
|
175
|
+
puts " new fragment #{new_fragment} (#{short_name(new_fragment)}) out of #{fragment1} and #{fragment2}"
|
116
176
|
|
117
|
-
# All new nodes are expected to be inconsistent after performing
|
118
|
-
# subfragments' extractions. Otherwise, if new nodes are consistent, it means
|
119
|
-
# the output from the mixed fragment is different from the separate fragments
|
120
|
-
# and therefore the generalization has failed, so no mixed fragment is returned
|
121
177
|
new_fragment
|
122
178
|
end
|
123
179
|
|
124
|
-
#
|
125
|
-
def
|
180
|
+
# Mixes and aligns two set of fragments
|
181
|
+
def mix fragments1, fragments2
|
182
|
+
return unless fragments1.size == fragments2.size
|
183
|
+
|
184
|
+
# Build new fragments
|
185
|
+
used_fragments = []
|
186
|
+
fragments1.map do |fragment1|
|
187
|
+
fragment2 = fragments2.select { |fragment2| signature(fragment1) == signature(fragment2) }.first
|
188
|
+
return unless fragment2
|
189
|
+
return if used_fragments.include?(fragment2)
|
190
|
+
|
191
|
+
used_fragments << fragment2
|
192
|
+
|
193
|
+
group fragment1, fragment2, false
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
def signature fragment
|
198
|
+
[ fragment.sc::type.map(&:to_sym).to_set,
|
199
|
+
fragment.sc::relation.map(&:to_sym).to_set,
|
200
|
+
fragment.sc::superclass.map(&:to_sym).to_set,
|
201
|
+
fragment.sc::sameas.map(&:to_sym).to_set,
|
202
|
+
fragment.sc::identifier.first.nil?,
|
203
|
+
fragment.sc::subfragment.map { |sf| signature(sf) }.to_set ]
|
204
|
+
end
|
205
|
+
|
206
|
+
# Merges a set of selectors, returning a new more general one
|
207
|
+
def merge *selectors
|
126
208
|
selector = Node(nil)
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
209
|
+
if selectors.first.rdf::type.first == Node('sc:XPathSelector')
|
210
|
+
selector.rdf::type = Node('sc:XPathSelector')
|
211
|
+
selector.sc::attribute = selectors.first.sc::attribute
|
212
|
+
selector.sc::text = selectors.map { |s| s.sc::text }.flatten
|
213
|
+
|
214
|
+
xpaths = selectors.map { |s| s.rdf::value }.flatten.map { |s| xpath_for(s) }
|
215
|
+
selector.rdf::value = if selectors.map { |s| s.rdf::value }.uniq.size == 1
|
216
|
+
# All in common
|
217
|
+
selectors.first.rdf::value
|
218
|
+
elsif xpaths.map(&:size).uniq.size == 1
|
219
|
+
# Possible siblings
|
220
|
+
new_xpath = []
|
221
|
+
(0...xpaths.first.size).each do |i|
|
222
|
+
terms = xpaths.map { |xp| xp[i] }
|
223
|
+
tags = terms.map { |term| term[:tag] }.uniq
|
224
|
+
indexes = terms.map { |term| term[:index] }.uniq
|
225
|
+
conditions = terms.map { |term| term[:conditions] }
|
226
|
+
|
227
|
+
tag = tags.size > 1 ? '*' : tags.first
|
228
|
+
index = indexes.first if indexes.size == 1
|
229
|
+
conditions = conditions.inject { |acc, n| acc & n }
|
230
|
+
|
231
|
+
new_xpath << {:tag => tag, :conditions => conditions, :index => index}
|
232
|
+
end
|
233
|
+
xpath_expression_for(new_xpath)
|
234
|
+
else
|
235
|
+
# Nothing in common
|
236
|
+
return
|
237
|
+
nil
|
238
|
+
end
|
239
|
+
elsif selectors.first.rdf::type.first == Node('sc:VisualSelector')
|
240
|
+
selector.rdf::type = Node('sc:VisualSelector')
|
241
|
+
selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
|
242
|
+
selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
|
243
|
+
selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
|
244
|
+
selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
|
245
|
+
selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
|
246
|
+
selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
|
247
|
+
selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
|
248
|
+
selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
|
249
|
+
selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
|
250
|
+
selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
|
251
|
+
selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
|
252
|
+
selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
|
253
|
+
selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
|
254
|
+
selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
|
255
|
+
selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
|
256
|
+
selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
|
257
|
+
selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family.sort }.uniq.size == 1
|
258
|
+
selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag.sort }.uniq.size == 1
|
259
|
+
selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute.sort }.uniq.size == 1
|
260
|
+
end
|
147
261
|
|
148
262
|
selector
|
149
263
|
end
|
264
|
+
|
265
|
+
# Distance between two fragments, memoised in @distances under both
# orderings of the two fragment ids.
#
# Fragments whose signatures differ are infinitely far apart (that result is
# not cached). Otherwise the distance is the selector distance, plus the
# identifier-selector distance when fragment1 has an identifier, plus the
# recursive distance of each signature-matched pair of subfragments, plus a
# flat 500.0 for every subfragment left without a partner on either side.
def distance fragment1, fragment2
  key    = [fragment1.id, fragment2.id]
  cached = @distances[key]
  return cached if cached
  return 1/0.0 if signature(fragment1) != signature(fragment2)

  # Distance between the fragments' own selectors
  total = selector_distance(fragment1.sc::selector.first, fragment2.sc::selector.first)
  if fragment1.sc::identifier.first
    total += selector_distance(fragment1.sc::identifier.first, fragment2.sc::identifier.first)
  end

  # Pair every subfragment of fragment1 with one of fragment2 that has the
  # same signature; a partner is removed from the pool once it is used.
  # NOTE(review): `delete` mutates whatever array sc::subfragment returned --
  # confirm it is a copy and not the node's own storage.
  remaining = fragment2.sc::subfragment
  paired = fragment1.sc::subfragment.map do |child1|
    child2 = remaining.find { |candidate| signature(child1) == signature(candidate) }
    remaining.delete child2

    if child2.nil?
      500.0
    else
      distance(child1, child2)
    end
  end

  # Whatever is still in the pool had no partner in fragment1
  total = total + paired.inject(0.0) { |acc, value| acc + value } + remaining.size*500.0
  @distances[key]         = total
  @distances[key.reverse] = total
end
|
286
|
+
|
287
|
+
# Distance between two visual selectors, returned as a Float.
#
# Sums the absolute differences of the first value of each geometric and
# font property (a missing value counts as 0 because nil.to_i is 0).
# Font-size differences are weighted by 100. A differing font family adds
# 100 and a differing tag adds 500.
def selector_distance selector1, selector2
  # Absolute gap between the first value found on each side
  gap = lambda { |values1, values2| (values1.first.to_i - values2.first.to_i).abs }

  gaps = [
    gap.call(selector1.sc::min_relative_x,  selector2.sc::min_relative_x),
    gap.call(selector1.sc::max_relative_x,  selector2.sc::max_relative_x),
    gap.call(selector1.sc::min_relative_y,  selector2.sc::min_relative_y),
    gap.call(selector1.sc::max_relative_y,  selector2.sc::max_relative_y),
    gap.call(selector1.sc::min_x,           selector2.sc::min_x),
    gap.call(selector1.sc::max_x,           selector2.sc::max_x),
    gap.call(selector1.sc::min_y,           selector2.sc::min_y),
    gap.call(selector1.sc::max_y,           selector2.sc::max_y),
    gap.call(selector1.sc::min_width,       selector2.sc::min_width),
    gap.call(selector1.sc::max_width,       selector2.sc::max_width),
    gap.call(selector1.sc::min_height,      selector2.sc::min_height),
    gap.call(selector1.sc::max_height,      selector2.sc::max_height),
    gap.call(selector1.sc::min_font_size,   selector2.sc::min_font_size) * 100,
    gap.call(selector1.sc::max_font_size,   selector2.sc::max_font_size) * 100,
    gap.call(selector1.sc::min_font_weight, selector2.sc::min_font_weight),
    gap.call(selector1.sc::max_font_weight, selector2.sc::max_font_weight)
  ]

  total = gaps.inject(0.0) { |acc, value| acc + value }
  total += 100 if selector1.sc::font_family != selector2.sc::font_family
  total += 500 if selector1.sc::tag != selector2.sc::tag
  total
end
|
309
|
+
|
310
|
+
# Average doc_score of the given fragments over all sample documents.
#
# Returns 0.0 when there are no fragments. Also returns 0.0 when docs is
# empty: without that guard the average would be 0.0 / 0.0, which is NaN,
# and NaN compares false against every other score.
def score fragments, docs, kb_type
  return 0.0 unless fragments
  return 0.0 if docs.empty?

  docs.inject(0.0) { |sum,doc| doc_score(fragments, doc, kb_type)+sum } / docs.size.to_f
end
|
314
|
+
|
315
|
+
# Scores the fragments against a single sample document.
#
# Runs the extraction on the document with the fragments' proxies, compares
# the extracted triples with doc[:output] through metrics (always in debug
# mode), and returns the F-score for the :patterns knowledge base type or
# the recall for any other type.
def doc_score fragments, doc, kb_type
  saved_count = RDF::ID.count
  proxies     = fragments.map(&:proxy)
  extracted   = extract_graph(proxies, :doc=>doc).triples
  RDF::ID.count = saved_count # Hack to reduce symbol creation

  expected = doc[:output]
  _precision, recall, fscore = metrics(expected, extracted, true)

  if kb_type == :patterns
    fscore
  else
    recall
  end
end
|
325
|
+
|
326
|
+
# Computes precision, recall and F-score of an extraction.
#
# correct    - the expected triples
# extraction - the triples actually extracted
# debug      - when true, prints the wrong and missing triples and the F-score
#
# Precision is 1.0 when nothing was extracted and recall is 1.0 when nothing
# was expected. The F-score is 0.0 when precision and recall are both 0
# (the harmonic mean would otherwise be 0.0 / 0.0, which is NaN).
#
# Returns [ precision, recall, fscore ].
def metrics correct, extraction, debug=false
  missing = correct - extraction
  right   = correct.size - missing.size

  if debug
    puts " Wrong triples: \n" + RDF::Graph.new(extraction - correct).to_ntriples
    puts " Missing triples: \n" + RDF::Graph.new(missing).to_ntriples
  end

  precision = extraction.size != 0 ? right/extraction.size.to_f : 1.0
  recall    = correct.size != 0 ? right/correct.size.to_f : 1.0

  # Calculate fscore, guarding the zero denominator
  fscore = (precision + recall) > 0 ? 2.0*(recall*precision)/(precision+recall) : 0.0

  puts " Fscore: #{fscore}" if debug

  [ precision, recall, fscore ]
end
|
344
|
+
|
345
|
+
private
|
346
|
+
# Builds a short label for a fragment: the first sc:type and the first
# sc:relation values (whichever are present), each passed through
# RDF::ID.compress and joined with ", ".
def short_name fragment
  ids = [ fragment.sc::type.first, fragment.sc::relation.first ].compact
  compressed = ids.map { |id| RDF::ID.compress(id) }
  compressed.join(", ")
end
|
351
|
+
|
352
|
+
# Builds a selector node that matches the given URIs.
#
# When all URIs are equal, the result is an sc:UriSelector whose value is
# that URI. Otherwise it is an sc:UriPatternSelector whose value is the
# longest prefix shared by all the URIs followed by "*". URIs with nothing
# in common yield the pattern "*".
def uri_selector_for uris
  selector = Node(nil)
  if uris.uniq.size == 1
    selector.rdf::type  = Node('sc:UriSelector')
    selector.rdf::value = uris.first
  else
    selector.rdf::type  = Node('sc:UriPatternSelector')
    selector.rdf::value = common_uri_prefix(uris) + "*"
  end
  selector
end

# Longest prefix shared by every URI that is strictly shorter than each of
# them, or "" when the URIs share no leading characters.
def common_uri_prefix uris
  reference = uris.first
  longest   = uris.map(&:length).min - 1

  # Walk from the longest candidate downwards so the first hit is the answer
  longest.downto(1) do |size|
    candidate = reference[0, size]
    return candidate if uris.all? { |uri| uri.start_with?(candidate) }
  end
  ""
end
|
370
|
+
|
371
|
+
# Parses an xpath expression into an array of terms, one per step.
#
# Each term is a hash with:
#   :tag        - the step name (nil for the empty step produced by "//")
#   :conditions - the " and "-separated conditions of the first predicate,
#                 or [] when there is none
#   :index      - the positional index as an Integer, or nil
#
# A step whose only predicate is a number (e.g. "div[1]") is an index, not
# a condition. The predicate is compared after its closing bracket has been
# removed; with the bracket still attached the numeric check can never
# succeed.
def xpath_for expression
  start = expression[0..0]=='/' ? 1 : 0
  # split gives [] for "/", and [][1..-1] is nil
  terms = expression.split('/')[start..-1] || []

  terms.map do |term|
    tag, first_predicate, second_predicate = term.split('[')
    first_predicate = first_predicate.chomp(']') if first_predicate

    if first_predicate && first_predicate.to_i.to_s == first_predicate
      # It's the index in fact
      conditions = []
      index      = first_predicate
    else
      conditions = first_predicate ? first_predicate.split(" and ") : []
      index      = second_predicate
    end

    # to_i ignores the trailing "]" still present on a second predicate
    index = index.to_i if index
    { :tag=>tag, :conditions=>conditions, :index=>index }
  end
end
|
389
|
+
|
390
|
+
# Serializes an array of xpath terms (hashes with :tag, :conditions and
# :index keys) back into an xpath expression.
#
# The expression starts with "/" unless the first step is ".". A nil :tag
# is written as an empty step so that a "//" expression can be serialized
# again, and a nil :conditions is treated as no conditions.
def xpath_expression_for xpath
  prefix = xpath.first[:tag] == '.' ? "" : "/"

  steps = xpath.map do |term|
    conditions = term[:conditions] || []

    step = term[:tag].to_s
    step += "[" + (conditions * ' and ') + "]" if conditions.size > 0
    step += "[" + term[:index].to_s + "]" if term[:index]
    step
  end

  prefix + steps * '/'
end
|
150
398
|
end
|
151
399
|
end
|