scrappy 0.3.5 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,6 +1,11 @@
1
+ === 0.4.0 2011-06-30
2
+
3
+ * Added machine learning of patterns that allow creating extractors automatically
4
+
1
5
  === 0.3.5 2011-03-29
2
6
 
3
7
  * Added nofollow support to NewUriSelectors
8
+ * Fix in queue management
4
9
 
5
10
  === 0.3.4 2011-03-25
6
11
 
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
10
10
  p.author = "Jose Ignacio"
11
11
  p.email = "joseignacio.fernandez@gmail.com"
12
12
  p.ignore_pattern = ["pkg/*"]
13
- p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.7'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
13
+ p.dependencies = [['activesupport','>= 2.3.5'], ['sinatra', '>= 1.1.2'], ['thin', '>= 1.2.7'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.3.9'], ['i18n', '>= 0.4.2'], ['rest-client', '>=1.6.1'], ['haml', '>= 3.0.24'], ['rack-flash', '>= 0.1.1']]
14
14
  end
15
15
 
16
16
  Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy CHANGED
@@ -33,7 +33,7 @@ module Scrappy
33
33
  opts.on('-g URI', '--get URI') { |uri| Options.uri = uri; Options.http_method=:get }
34
34
  opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
35
35
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
36
- opts.on('-u', '--debug') { Agent::Options.debug = true }
36
+ opts.on('-u [KEY]', '--debug [KEY]') { |key| Agent::Options.debug = true; Agent::Options.debug_key = key.downcase if key }
37
37
  opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
38
38
  opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
39
39
  opts.on('-P P', '--port P') { |p| Options.port = p }
@@ -71,7 +71,7 @@ module Scrappy
71
71
  end
72
72
 
73
73
  def self.quit
74
- puts "\"#{Quotes.sort_by{rand}.first}\"" unless Options.quiet
74
+ puts "\"#{Quotes[rand(Quotes.length)]}\"" unless Options.quiet
75
75
  exit
76
76
  end
77
77
 
@@ -103,26 +103,61 @@ module Scrappy
103
103
  def self.editable_kb?
104
104
  @editable_kb
105
105
  end
106
- def self.add_pattern graph
106
+ def self.add_patterns graph
107
107
  new_patterns = Scrappy::Kb.patterns.merge graph
108
108
  save_patterns new_patterns
109
109
  onload
110
110
  end
111
111
  def self.save_patterns new_patterns
112
- open(@patterns_file, "w") { |f| f.write new_patterns.serialize(:yarf) }
112
+ fragments = case new_patterns
113
+ when Array then
114
+ new_patterns
115
+ when RDF::Graph then
116
+ ( new_patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) - new_patterns.find([], Node('sc:subfragment'), nil) ).
117
+ map { |f| RDF::Graph.new(f.all_triples) }
118
+ end
119
+
120
+ content = RDF::ID.ns.map{|k,v| "#{k}: #{v}\n"} * ''
121
+ fragments.each { |f| content += f.serialize(:yarf, false) }
122
+ open(@patterns_file, "w") { |f| f.write content }
123
+ end
124
+ def self.delete_patterns
125
+ graph = Scrappy::Kb.patterns
126
+ graph.triples = []
127
+ content = graph.serialize(:yarf)
128
+ open(@patterns_file, "w") { |f| f.write content }
129
+ onload
113
130
  end
114
- def self.delete_pattern uri
115
- graph = Scrappy::Kb.patterns
116
- fragments = graph.find(nil, Node('rdf:type'), Node('sc:Fragment')).
117
- select { |fragment| fragment.sc::type.include?(Node(uri)) }
118
- fragments.each { |fragment| graph.triples -= fragment.all_triples }
119
- open(@patterns_file, "w") { |f| f.write graph.serialize(:yarf) }
131
+ def self.delete_pattern id
132
+ graph = Scrappy::Kb.patterns
133
+ fragment = graph[id]
134
+ graph.triples -= fragment.all_triples
135
+ content = graph.serialize(:yarf)
136
+ open(@patterns_file, "w") { |f| f.write content }
120
137
  onload
121
138
  end
122
139
  def self.add_extractor graph
123
140
  open(File.join(@extractors_folder,"extractor_#{Dir[File.join(@extractors_folder,'*')].size}.yarf"), "w") { |f| f.write graph.serialize(:yarf) }
124
141
  onload
125
142
  end
143
+ def self.replace_extractor graph, samples
144
+ kb = Scrappy::Kb.extractors
145
+
146
+ all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
147
+ all_fragments.each do |fragment|
148
+ fragment.sc::selector.each do |selector|
149
+ next unless ( selector.rdf::type.include?(Node('sc:UriSelector')) or
150
+ selector.rdf::type.include?(Node('sc:UriPatternSelector')) )
151
+
152
+ samples.each do |sample|
153
+ selector.rdf::value.each do |uri|
154
+ delete_extractor(uri) if !kb.node(selector).filter(:uri=>sample[:uri]).empty?
155
+ end
156
+ end
157
+ end
158
+ end
159
+ add_extractor graph
160
+ end
126
161
  def self.delete_extractor uri
127
162
  Dir[File.join(@extractors_folder, '*')].each do |file|
128
163
  format = file.split('.').last.to_sym
@@ -133,9 +168,12 @@ module Scrappy
133
168
  flatten.select do |uri_selector|
134
169
  uri_selector.rdf::value.include?(uri)
135
170
  end
171
+ next if uri_selectors.empty?
136
172
  fragments = uri_selectors.map { |uri_selector| graph.find(nil, Node('sc:selector'), uri_selector) }.flatten
137
173
  fragments.each { |fragment| graph.triples -= fragment.all_triples }
138
- open(file, "w") { |f| f.write graph.serialize(format) } if fragments.any?
174
+ text = graph.serialize(format)
175
+ open(file, "w") { |f| f.write text } if fragments.any?
176
+ File.delete(file) if text==""
139
177
  end
140
178
  onload
141
179
  end
@@ -165,7 +203,7 @@ Options
165
203
  -l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
166
204
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
167
205
  -D, --dump Dumps RDF data to disk
168
- -u, --debug Shows debugging traces
206
+ -u, --debug [KEYWORD] Shows debugging traces. Use optional keyword to filter selectors' output
169
207
  -o, --observe URLs Observes the specified URLs storing their data into the repository
170
208
  -s, --server [ROOT] Runs web server (optionally specify server's root url)
171
209
  -a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
@@ -234,15 +272,15 @@ Copyright
234
272
  # Load knowledge base
235
273
  Agent::Options.kb ||= RDF::Graph.new
236
274
 
237
- Kb.extractors, Kb.patterns = if File.exists?(@cache_file) and File.mtime(@cache_file) >= Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
275
+ Kb.extractors, Kb.patterns, RDF::ID::count = if File.exists?(@cache_file) and File.mtime(@cache_file) > Dir["#{@extractors_folder}/*",@extractors_folder,@patterns_file].map{ |f| File.mtime(f) }.max
238
276
  # Just load kb from cache
239
277
  open(@cache_file) { |f| Marshal.load(f) }
240
278
  else
241
279
  # Load YARF files and cache kb
242
280
  extractors = load_files_from(@extractors_folder)
243
281
  patterns = File.exists?(@patterns_file) ? RDF::Parser.parse(:yarf, open(@patterns_file).read) : RDF::Graph.new
244
- open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns], f) }
245
- [extractors, patterns]
282
+ open(@cache_file, "w") { |f| Marshal.dump([extractors, patterns, RDF::ID::count], f) }
283
+ [extractors, patterns, RDF::ID::count]
246
284
  end
247
285
 
248
286
  # Sets new kb
@@ -315,7 +353,7 @@ Dogs have owners, cats have staff
315
353
  I put all my genius into my life; I put only my talent into my works
316
354
  It is better to be beautiful than to be good, but it is better to be good than to be ugly
317
355
  All human beings, by nature, desire to know
318
- All life is an experiment
356
+ All life is an experiment. The more experiments you make the better
319
357
  An investment in knowledge always pays the best interest
320
358
  An optimist is a person who sees a green light everywhere. The pessimist sees only the red light. But the truly wise person is color blind
321
359
  Chance favors only those who court her
@@ -347,7 +385,27 @@ The man who does things makes many mistakes, but he never makes the biggest mist
347
385
  The man who makes no mistakes does not usually make anything
348
386
  The results you achieve will be in direct proportion to the effort you apply
349
387
  The reward of a thing well done is to have done it
350
- Don’t argue with idiots. They will bring you down to their level and beat you with experience""".split("\n")
388
+ Don’t argue with idiots. They will bring you down to their level and beat you with experience
389
+ Choose a work you love, and you will never have to work a day in your life
390
+ The secret of creativity is knowing how to hide your sources
391
+ I never think of the future. It comes soon enough
392
+ If you want to go quick, go alone. If you want to go far, go together
393
+ The only thing that interferes with my learning is my education
394
+ Excesive literary production is a social offense
395
+ A man who dares to waste one hour of time has not discovered the value of life
396
+ Any idiot can face a crisis -- it's day to day living that wears you out
397
+ Every man dies. Not every man really lives
398
+ After two weeks of working on a project, you know whether it will work or not
399
+ All things are difficult before they are easy
400
+ Sport is hard work for which you do not get paid
401
+ Do not hire a man who does your work for money, but him who does it for love of it
402
+ Failure is success if we learn from it
403
+ Formal education will make you a living; self-education will make you a fortune
404
+ Lost time is never found again
405
+ Men talk of killing time, while time quietly kills them
406
+ Only entropy comes easy
407
+ Any man can make mistakes, but only an idiot persists in his error
408
+ Managing is getting paid for home runs someone else hits""".split("\n")
351
409
  end
352
410
 
353
411
  Scrappy::App.run
data/lib/scrappy.rb CHANGED
@@ -24,5 +24,5 @@ require 'scrappy/agent/blind_agent'
24
24
  require 'scrappy/agent/agent'
25
25
 
26
26
  module Scrappy
27
- VERSION = '0.3.5'
27
+ VERSION = '0.4.0'
28
28
  end
@@ -21,19 +21,15 @@ module Scrappy
21
21
 
22
22
  # Extract each fragment
23
23
  options = { :doc => { :uri=>uri, :content=>content }, :referenceable=>referenceable }
24
- triples = []
25
- fragments_for(kb, uri).each do |fragment|
26
- kb.node(fragment).extract(options).each do |node|
27
- triples += node.graph.triples
28
- end
29
- end
24
+ output = extract_graph(fragments_for(kb, uri), options)
30
25
 
31
26
  puts "done!" if self.options.debug
32
27
 
33
- triples
28
+ output.triples
34
29
  end
35
30
  end
36
31
 
32
+ # Returns a list of fragments that have mappings in a given URI
37
33
  def fragments_for kb, uri
38
34
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
39
35
 
@@ -52,7 +48,14 @@ module Scrappy
52
48
 
53
49
  visual_selectors = selectors.select { |selector| selector.rdf::type.include?(Node('sc:VisualSelector')) }
54
50
 
55
- (uri_selectors + visual_selectors).map { |selector| fragments[selector] }
51
+ (uri_selectors + visual_selectors).map { |selector| fragments[selector].proxy }
52
+ end
53
+
54
+ # Extracts all mappings from a list of fragments and returns them merged into a single graph
55
+ def extract_graph fragments, options
56
+ output = RDF::Graph.new
57
+ fragments.each { |fragment| fragment.extract(options).each { |result| output << result } }
58
+ output
56
59
  end
57
60
  end
58
61
  end
@@ -1,7 +1,7 @@
1
1
  module Sc
2
2
  class Fragment
3
3
  include RDF::NodeProxy
4
-
4
+
5
5
  # Extracts data out of a document and returns an array of nodes
6
6
  def extract options={}
7
7
  all_mappings(options).map { |mapping| mapping[:node] }
@@ -4,7 +4,9 @@ module Sc
4
4
  include Scrappy::Formats
5
5
 
6
6
  def select doc
7
- if sc::debug.first=="true" and Scrappy::Agent::Options.debug
7
+ if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
8
+ (Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
9
+
8
10
  puts '== DEBUG'
9
11
  puts '== Selector:'
10
12
  puts node.serialize(:yarf, false)
@@ -18,7 +20,9 @@ module Sc
18
20
  # Filter method is defined in each subclass
19
21
  results = filter doc
20
22
 
21
- if sc::debug.first=="true" and Scrappy::Agent::Options.debug
23
+ if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
24
+ (Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
25
+
22
26
  puts "== No results" if results.empty?
23
27
  results.each_with_index do |result, i|
24
28
  puts "== Result ##{i}:"
@@ -2,7 +2,7 @@ module Sc
2
2
  class UriPatternSelector < Selector
3
3
  def filter doc
4
4
  # Check if the uri fits the pattern
5
- if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
5
+ if rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+').gsub('?', '\?')}\Z/ }
6
6
  [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
7
7
  else
8
8
  []
@@ -1,60 +1,74 @@
1
1
  module Sc
2
2
  class VisualSelector < Selector
3
+
4
+ def initialize args={}
5
+ super
6
+ @cache = {}
7
+ end
8
+
3
9
  def filter doc
4
- # By initializing variables, we avoid getting data from a hash (slow)
5
- min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
6
- max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
7
- min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
8
- max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
9
- min_x = (sc::min_x.first.to_i if sc::min_x.first)
10
- max_x = (sc::max_x.first.to_i if sc::max_x.first)
11
- min_y = (sc::min_y.first.to_i if sc::min_y.first)
12
- max_y = (sc::max_y.first.to_i if sc::max_y.first)
13
- min_width = (sc::min_width.first.to_i if sc::min_width.first)
14
- max_width = (sc::max_width.first.to_i if sc::max_width.first)
15
- min_height = (sc::min_height.first.to_i if sc::min_height.first)
16
- max_height = (sc::max_height.first.to_i if sc::max_height.first)
17
- min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
18
- max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
19
- min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
20
- max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
21
- font_family = sc::font_family.first
22
- attributes = sc::attribute
23
- formats = sc::format
24
-
25
- doc[:content].search(sc::tag.first || "*").select do |node|
26
- relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
27
- relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
28
-
29
- !node.text? and
30
- ( !min_relative_x or relative_x >= min_relative_x) and
31
- ( !max_relative_x or relative_x <= max_relative_x) and
32
- ( !min_relative_y or relative_y >= min_relative_y) and
33
- ( !max_relative_y or relative_y <= max_relative_y) and
34
-
35
- ( !min_x or node['vx'].to_i >= min_x) and
36
- ( !max_x or node['vx'].to_i <= max_x) and
37
- ( !min_y or node['vy'].to_i >= min_y) and
38
- ( !max_y or node['vy'].to_i <= max_y) and
10
+ @cache[doc] ||= begin
11
+ # By initializing variables, we avoid getting data from a hash (slow)
12
+ min_relative_x = (sc::min_relative_x.first.to_i if sc::min_relative_x.first)
13
+ max_relative_x = (sc::max_relative_x.first.to_i if sc::max_relative_x.first)
14
+ min_relative_y = (sc::min_relative_y.first.to_i if sc::min_relative_y.first)
15
+ max_relative_y = (sc::max_relative_y.first.to_i if sc::max_relative_y.first)
16
+ min_x = (sc::min_x.first.to_i if sc::min_x.first)
17
+ max_x = (sc::max_x.first.to_i if sc::max_x.first)
18
+ min_y = (sc::min_y.first.to_i if sc::min_y.first)
19
+ max_y = (sc::max_y.first.to_i if sc::max_y.first)
20
+ min_width = (sc::min_width.first.to_i if sc::min_width.first)
21
+ max_width = (sc::max_width.first.to_i if sc::max_width.first)
22
+ min_height = (sc::min_height.first.to_i if sc::min_height.first)
23
+ max_height = (sc::max_height.first.to_i if sc::max_height.first)
24
+ min_font_size = (sc::min_font_size.first.to_i if sc::min_font_size.first)
25
+ max_font_size = (sc::max_font_size.first.to_i if sc::max_font_size.first)
26
+ min_font_weight = (sc::min_font_weight.first.to_i if sc::min_font_weight.first)
27
+ max_font_weight = (sc::max_font_weight.first.to_i if sc::max_font_weight.first)
28
+ font_family = sc::font_family.first
29
+ attributes = sc::attribute
30
+ formats = sc::format
31
+ tag = sc::tag
39
32
 
40
- ( !min_width or node['vw'].to_i >= min_width) and
41
- ( !max_width or node['vw'].to_i <= max_width) and
42
- ( !min_height or node['vh'].to_i >= min_height) and
43
- ( !max_height or node['vh'].to_i <= max_height) and
33
+ elements = doc[:content].search((tag - ["text"]).first || "*")
34
+ elements += Nokogiri::XML::NodeSet.new(doc[:content].document, [doc[:content]]) if tag.include?(doc[:content].name)
44
35
 
45
- ( !min_font_size or node['vsize'].to_i >= min_font_size) and
46
- ( !max_font_size or node['vsize'].to_i <= max_font_size) and
47
- ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
48
- ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
49
- ( !font_family or node['vfont'] == font_family)
50
- end.map do |content|
51
- if attributes.first
52
- # Select node's attribute if given
53
- attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
54
- else
55
- [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
56
- end
57
- end.flatten
36
+ elements.select do |node|
37
+ relative_x = node['vx'].to_i - doc[:content]['vx'].to_i
38
+ relative_y = node['vy'].to_i - doc[:content]['vy'].to_i
39
+
40
+ !node.text? and
41
+ ( (node['vfont'] and node.name!="a" and node.name!="img") or !tag.include?("text") ) and
42
+ ( !min_relative_x or relative_x >= min_relative_x) and
43
+ ( !max_relative_x or relative_x <= max_relative_x) and
44
+ ( !min_relative_y or relative_y >= min_relative_y) and
45
+ ( !max_relative_y or relative_y <= max_relative_y) and
46
+
47
+ ( !min_x or node['vx'].to_i >= min_x) and
48
+ ( !max_x or node['vx'].to_i <= max_x) and
49
+ ( !min_y or node['vy'].to_i >= min_y) and
50
+ ( !max_y or node['vy'].to_i <= max_y) and
51
+
52
+ ( !min_width or node['vw'].to_i >= min_width) and
53
+ ( !max_width or node['vw'].to_i <= max_width) and
54
+ ( !min_height or node['vh'].to_i >= min_height) and
55
+ ( !max_height or node['vh'].to_i <= max_height) and
56
+
57
+ ( !min_font_size or node['vsize'].to_i >= min_font_size) and
58
+ ( !max_font_size or node['vsize'].to_i <= max_font_size) and
59
+ ( !min_font_weight or node['vweight'].to_i >= min_font_weight) and
60
+ ( !max_font_weight or node['vweight'].to_i <= max_font_weight) and
61
+ ( !font_family or node['vfont'] == font_family)
62
+ end.map do |content|
63
+ if attributes.first
64
+ # Select node's attribute if given
65
+ attributes.map { |attribute| { :uri=>doc[:uri], :content=>content, :value=>content[attribute], :attribute=>attribute } }
66
+ else
67
+ [ { :uri=>doc[:uri], :content=>content, :value=>format(content, formats, doc[:uri]) } ]
68
+ end
69
+ end.flatten
70
+ end
58
71
  end
72
+
59
73
  end
60
74
  end
@@ -1,151 +1,399 @@
1
+ require 'set'
2
+
1
3
  module Scrappy
2
4
  module Optimizer
3
5
  # Iterates through a knowledge base and tries to merge and generalize
4
6
  # selectors whenever the output of the resulting kb is the same
5
- def optimize_patterns kb, sample
7
+ def optimize_extractors kb, samples
6
8
  # Build an array of fragments
7
- root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
8
- fragments = []; root_fragments.each { |f| fragments << kb.node(Node(f.id, RDF::Graph.new(f.all_triples))) }
9
+ all_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
9
10
 
10
- # Parse the document
11
- doc = { :uri=>sample[:uri], :content=>Nokogiri::HTML(sample[:html], nil, 'utf-8') }
11
+ root_superfragments = all_fragments.select do |fragment|
12
+ fragment.sc::selector.any? do |selector|
13
+ ( selector.rdf::type.include?(Node('sc:UriSelector')) or
14
+ selector.rdf::type.include?(Node('sc:UriPatternSelector')) ) and
15
+ samples.any? { |sample| !kb.node(selector).filter(:uri=>sample[:uri]).empty? }
16
+ end
17
+ end
18
+ root_fragments = root_superfragments.map { |f| f.sc::subfragment }.flatten
12
19
 
13
- # Optimize the fragment
14
- fragments = optimize fragments, :docs=>[doc]
20
+ # Optimize the fragments
21
+ fragments = optimize_all root_fragments, samples, :extractors
15
22
 
16
- graph = RDF::Graph.new
17
- fragments.each { |fragment| graph << fragment }
23
+ # Build a graph by adding all fragments to a common URI-selected superfragment
24
+ superfragment = Node(nil)
25
+ identifier = Node(nil)
26
+ selector = uri_selector_for(samples.map { |sample| sample[:uri] })
27
+ identifier.rdf::type = Node('sc:BaseUriSelector')
28
+ superfragment.rdf::type = Node('sc:Fragment')
29
+ superfragment.sc::selector = selector
30
+ superfragment.sc::identifier = identifier
31
+ superfragment.graph << selector
32
+ superfragment.graph << identifier
33
+
34
+ triples = fragments.inject([]) do |triples, fragment|
35
+ triples << [superfragment.id, ID('sc:subfragment'), fragment.id]
36
+ triples += fragment.all_triples
37
+ end
38
+ triples += superfragment.all_triples
39
+
40
+ RDF::Graph.new(triples)
41
+ end
42
+
43
+ # Iterates through a knowledge base and tries to merge and generalize
44
+ # selectors whenever the output of the resulting kb is the same
45
+ def optimize_patterns kb, samples
46
+ # Build an array of fragments
47
+ root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) - kb.find([], Node('sc:subfragment'), nil)
48
+
49
+ # Optimize the fragments
50
+ fragments = optimize_all root_fragments, samples, :patterns
18
51
 
19
- graph
52
+ # Build a graph
53
+ RDF::Graph.new(fragments.inject([]) { |triples, fragment| triples += fragment.all_triples })
20
54
  end
21
55
 
22
56
  protected
23
- # Tries to optimize a set of fragments
24
- def optimize fragments, options
25
- # Tries to iterate until no changes are made
57
+ # Optimizes a set of fragments
58
+ def optimize_all root_fragments, samples, kb_type
59
+ # Parse the documents
60
+ docs = samples.map do |sample|
61
+ output = kb_type==:patterns ? samples[:output] : extract(sample[:uri], sample[:html], Scrappy::Kb.extractors)
62
+ content = Nokogiri::HTML(sample[:html], nil, 'utf-8')
63
+ { :uri=>sample[:uri], :content=>content, :output=>output }
64
+ end
65
+
66
+ # Fragment cloning to use a new common pool for caching intermediate results
67
+ fragments = []
68
+ pool = {}
69
+ root_fragments.each do |f|
70
+ fragment = Node(f.id, RDF::Graph.new(f.all_triples))
71
+ fragment.graph.pool = pool
72
+ fragments << fragment
73
+ end
74
+
75
+ # Iterates until no changes are made
26
76
  @tried = []
27
- new_fragments = fragments.map{ |f| f.proxy(Node('sc:Fragment')) }
77
+ @distances = {}
78
+ new_fragments = fragments
79
+ score = 0.0
80
+ i = 0
81
+ last_save = 0
28
82
  begin
29
- fragments = new_fragments
30
- new_fragments = optimize_once fragments, options
31
- end until fragments == new_fragments
32
- fragments
33
- end
34
-
35
- # Tries to perform one optimization of two fragments out of a set of fragments
36
- def optimize_once fragments, options
37
- docs = options[:docs]
38
- fragments.each do |fragment1|
39
- fragments.each do |fragment2|
40
- next if fragment1 == fragment2
41
- # Won't get gain if the fragment does not produce the same kind of RDF resource
42
- next if !fragment1.sc::type.equivalent?(fragment2.sc::type) or
43
- !fragment1.sc::relation.equivalent?(fragment2.sc::relation) or
44
- !fragment1.sc::superclass.equivalent?(fragment2.sc::superclass) or
45
- !fragment1.sc::sameas.equivalent?(fragment2.sc::sameas) or
46
- fragment1.sc::identifier.size != fragment2.sc::identifier.size
47
-
48
- next if @tried.include?([fragment1, fragment2])
49
- next if @tried.include?([fragment2, fragment1])
83
+ new_score = score(new_fragments, docs, kb_type)
84
+
85
+ if new_score >= score # Improvement after optimization?
86
+ puts 'Successful optimization' if i > 0
87
+ score = new_score
88
+ fragments = new_fragments
50
89
 
51
- @tried << [fragment1, fragment2]
90
+ # Save to disk
91
+ if (Time.now - last_save).to_i > 60 and i > 0 and kb_type == :patterns
92
+ print "Saving..."; $stdout.flush
93
+ Scrappy::App.save_patterns fragments
94
+ puts "done!"
52
95
 
53
- # Get mappings without mixing fragments
54
- old_mappings = []
55
- docs.each do |doc|
56
- old_mappings += fragment1.all_mappings(:doc=>doc)
57
- old_mappings += fragment2.all_mappings(:doc=>doc)
96
+ last_save = Time.now
58
97
  end
59
- old_docs = old_mappings.map { |mapping| mapping[:doc] }
98
+ else
99
+ puts 'Unsuccessful optimization, rolling back...'
100
+ end
101
+ puts
102
+ puts "Fragments: #{fragments.size}, score: #{score}"
103
+ puts "Trying optimization #{i+=1}..."
104
+ new_fragments = optimize fragments
105
+ end while new_fragments
106
+ puts 'Optimization finished'
107
+
108
+ fragments
109
+ end
110
+
111
+ # Tries to perform one optimization in a set of fragments
112
+ def optimize fragments
113
+ fragments.each_with_index do |fragment1, index|
114
+ fragments[0...index].sort_by { |fragment2| distance(fragment1, fragment2) }.each do |fragment2|
115
+ next if @tried.include?([fragment1, fragment2]) or @tried.include?([fragment2, fragment1])
60
116
 
61
- # Get mixed fragment
62
- new_fragment = mix(fragment1, fragment2, options)
117
+ new_fragment = group fragment1, fragment2
63
118
 
64
- # Get new mappings
65
- new_mappings = []
66
- docs.each { |doc| new_mappings += new_fragment.mappings(:doc=>doc) }
67
- new_docs = new_mappings.map { |mapping| mapping[:doc] }
68
-
69
- # Optimize subfragments
70
- subfragments = optimize(fragment1.sc::subfragment + fragment2.sc::subfragment, options.merge(:docs=>new_docs))
71
- subfragments.each { |subfragment| new_fragment.graph << subfragment }
72
- new_fragment.sc::subfragment = subfragments.map &:node
73
-
74
- # End if the new fragment returns the same results
75
- if true
76
- return fragments - [fragment1] - [fragment2] + [new_fragment]
77
- end
119
+ @tried << [fragment1, fragment2]
120
+
121
+ # End by including the new fragment in the list and returning it
122
+ return fragments - [fragment1] - [fragment2] + [new_fragment] if new_fragment
78
123
  end
79
124
  end
80
- fragments
125
+ return
81
126
  end
82
-
83
- def mix fragment1, fragment2, options
84
- docs = options[:docs]
127
+
128
+ # Groups two fragments into one
129
+ def group fragment1, fragment2, siblings=true
130
+ return unless signature(fragment1) == signature(fragment2)
85
131
 
86
- # Build new fragment
87
- new_fragment = Node(nil).proxy(Node('sc:Fragment'))
88
- new_fragment.rdf::type = Node('sc:Fragment')
132
+ new_fragment = Node(nil)
133
+ new_fragment.rdf::type = Node("sc:Fragment")
134
+ new_fragment.graph.pool = fragment1.graph.pool
89
135
  new_fragment.sc::type = fragment1.sc::type
90
136
  new_fragment.sc::relation = fragment1.sc::relation
91
137
  new_fragment.sc::superclass = fragment1.sc::superclass
92
138
  new_fragment.sc::sameas = fragment1.sc::sameas
93
139
 
94
- # If fragments share the same parent, cardinality has to increase
95
- # Otherwise, they might map the same subdocument, so cardinality
96
- # limits are made more general.
97
- if fragment1.sc::superfragment.first and fragment1.sc::superfragment == fragment2.sc::superfragment
98
- new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
99
- new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
100
- else
101
- new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
102
- new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
140
+ if fragment1.sc::min_cardinality.first and fragment2.sc::min_cardinality.first
141
+ if siblings
142
+ new_fragment.sc::min_cardinality = (fragment1.sc::min_cardinality.first.to_i + fragment2.sc::min_cardinality.first.to_i).to_s
143
+ else
144
+ new_fragment.sc::min_cardinality = [fragment1.sc::min_cardinality.first.to_i, + fragment2.sc::min_cardinality.first.to_i].min.to_s
145
+ end
103
146
  end
104
-
147
+ if fragment1.sc::max_cardinality.first and fragment2.sc::max_cardinality.first
148
+ if siblings
149
+ new_fragment.sc::max_cardinality = (fragment1.sc::max_cardinality.first.to_i + fragment2.sc::max_cardinality.first.to_i).to_s
150
+ else
151
+ new_fragment.sc::max_cardinality = [fragment1.sc::max_cardinality.first.to_i, + fragment2.sc::max_cardinality.first.to_i].max.to_s
152
+ end
153
+ end
154
+
105
155
  # sc:selector
106
- selector = generalize(fragment1.sc::selector + fragment2.sc::selector)
107
- new_fragment.graph << selector
108
- new_fragment.sc::selector = selector
156
+ new_selector = merge(*(fragment1.sc::selector + fragment2.sc::selector))
157
+ return unless new_selector
158
+ new_fragment.sc::selector = new_selector
159
+ new_fragment.graph << new_selector
109
160
 
110
161
  # sc:identifier
111
162
  if fragment1.sc::identifier.first
112
- selector = generalize(fragment1.sc::identifier + fragment2.sc::identifier)
113
- new_fragment.graph << selector
114
- new_fragment.sc::identifier = selector
163
+ new_identifier = merge(*(fragment1.sc::identifier + fragment2.sc::identifier))
164
+ return unless new_identifier
165
+ new_fragment.sc::identifier = new_identifier
166
+ new_fragment.graph << new_identifier
115
167
  end
168
+
169
+ subfragments = mix(fragment1.sc::subfragment, fragment2.sc::subfragment)
170
+ return unless subfragments
171
+
172
+ subfragments.each { |f| return if !f; new_fragment.graph << f }
173
+ new_fragment.sc::subfragment = subfragments
174
+
175
+ puts " new fragment #{new_fragment} (#{short_name(new_fragment)}) out of #{fragment1} and #{fragment2}"
116
176
 
117
- # All new nodes are expected to be inconsistent after performing
118
- # subfragments' extractions. Otherwise, if new nodes are consistent, it means
119
- # the output from the mixed fragment is different from the separate fragments
120
- # and therefore the generalization has failed, so no mixed fragment is returned
121
177
  new_fragment
122
178
  end
123
179
 
124
- # Generalize a set of selectors
125
- def generalize selectors
180
+ # Mixes and aligns two sets of fragments
181
+ def mix fragments1, fragments2
182
+ return unless fragments1.size == fragments2.size
183
+
184
+ # Build new fragments
185
+ used_fragments = []
186
+ fragments1.map do |fragment1|
187
+ fragment2 = fragments2.select { |fragment2| signature(fragment1) == signature(fragment2) }.first
188
+ return unless fragment2
189
+ return if used_fragments.include?(fragment2)
190
+
191
+ used_fragments << fragment2
192
+
193
+ group fragment1, fragment2, false
194
+ end
195
+ end
196
+
197
+ def signature fragment
198
+ [ fragment.sc::type.map(&:to_sym).to_set,
199
+ fragment.sc::relation.map(&:to_sym).to_set,
200
+ fragment.sc::superclass.map(&:to_sym).to_set,
201
+ fragment.sc::sameas.map(&:to_sym).to_set,
202
+ fragment.sc::identifier.first.nil?,
203
+ fragment.sc::subfragment.map { |sf| signature(sf) }.to_set ]
204
+ end
205
+
206
# Merges a set of selectors, returning a new more general one.
#
# The strategy is chosen from the rdf:type of the first selector:
#
# * sc:XPathSelector: keeps the first selector's sc:attribute, collects every
#   sc:text value and generalizes the XPath expressions. Identical values are
#   kept as they are; expressions with the same number of steps are merged
#   step by step (differing tags become '*', differing indexes are dropped
#   and only the conditions common to all are kept).
# * sc:VisualSelector: widens every min/max bound so that it covers all the
#   selectors, and keeps font family, tag and attribute only when all the
#   selectors agree on them.
#
# For any other type a blank selector node without properties is returned.
# Returns nil when no selectors are given or when the XPath expressions have
# nothing in common.
def merge *selectors
  return if selectors.empty?

  selector = Node(nil)
  kind = selectors.first.rdf::type.first
  if kind == Node('sc:XPathSelector')
    selector.rdf::type = Node('sc:XPathSelector')
    selector.sc::attribute = selectors.first.sc::attribute
    selector.sc::text = selectors.map { |s| s.sc::text }.flatten

    values = selectors.map { |s| s.rdf::value }
    xpaths = values.flatten.map { |s| xpath_for(s) }
    selector.rdf::value = if values.uniq.size == 1
      # All in common
      selectors.first.rdf::value
    elsif xpaths.map(&:size).uniq.size == 1
      # Possible siblings: same depth, so generalize each step separately
      new_xpath = (0...xpaths.first.size).map do |i|
        terms = xpaths.map { |xp| xp[i] }
        tags = terms.map { |term| term[:tag] }.uniq
        indexes = terms.map { |term| term[:index] }.uniq
        conditions = terms.map { |term| term[:conditions] }.inject { |acc, n| acc & n }

        { :tag => (tags.size > 1 ? '*' : tags.first),
          :conditions => conditions,
          :index => (indexes.size == 1 ? indexes.first : nil) }
      end
      xpath_expression_for(new_xpath)
    else
      # Nothing in common
      return
    end
  elsif kind == Node('sc:VisualSelector')
    selector.rdf::type = Node('sc:VisualSelector')
    selector.sc::min_relative_x = selectors.map { |s| s.sc::min_relative_x.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_relative_x = selectors.map { |s| s.sc::max_relative_x.map(&:to_i) }.flatten.max.to_s
    selector.sc::min_relative_y = selectors.map { |s| s.sc::min_relative_y.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_relative_y = selectors.map { |s| s.sc::max_relative_y.map(&:to_i) }.flatten.max.to_s
    selector.sc::min_x = selectors.map { |s| s.sc::min_x.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_x = selectors.map { |s| s.sc::max_x.map(&:to_i) }.flatten.max.to_s
    selector.sc::min_y = selectors.map { |s| s.sc::min_y.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_y = selectors.map { |s| s.sc::max_y.map(&:to_i) }.flatten.max.to_s
    selector.sc::min_width = selectors.map { |s| s.sc::min_width.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_width = selectors.map { |s| s.sc::max_width.map(&:to_i) }.flatten.max.to_s
    selector.sc::min_height = selectors.map { |s| s.sc::min_height.map(&:to_i) }.flatten.min.to_s
    selector.sc::max_height = selectors.map { |s| s.sc::max_height.map(&:to_i) }.flatten.max.to_s
    # Font bounds are only set when every selector provides them
    selector.sc::min_font_size = selectors.map { |s| s.sc::min_font_size.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_size.first }
    selector.sc::max_font_size = selectors.map { |s| s.sc::max_font_size.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_size.first }
    selector.sc::min_font_weight = selectors.map { |s| s.sc::min_font_weight.map(&:to_i) }.flatten.min.to_s if selectors.all? { |s| s.sc::min_font_weight.first }
    selector.sc::max_font_weight = selectors.map { |s| s.sc::max_font_weight.map(&:to_i) }.flatten.max.to_s if selectors.all? { |s| s.sc::max_font_weight.first }
    # These are only kept when all the selectors have the same values
    selector.sc::font_family = selectors.first.sc::font_family if selectors.map { |s| s.sc::font_family.sort }.uniq.size == 1
    selector.sc::tag = selectors.first.sc::tag if selectors.map { |s| s.sc::tag.sort }.uniq.size == 1
    selector.sc::attribute = selectors.first.sc::attribute if selectors.map { |s| s.sc::attribute.sort }.uniq.size == 1
  end

  selector
end
264
+
265
# Computes a distance between two fragments, memoized in @distances for
# both orderings of the pair.
#
# Fragments with different signatures are infinitely far apart. Otherwise the
# distance is the sum of:
# * the distance between their first selectors,
# * the distance between their first identifiers (when fragment1 has one),
# * the distances between subfragments paired by signature,
# * a penalty of 500.0 for every subfragment left unpaired on either side.
#
# NOTE(review): @distances is expected to be a Hash initialized elsewhere.
def distance fragment1, fragment2
  cached = @distances[[fragment1.id, fragment2.id]]
  return cached if cached
  return 1/0.0 if signature(fragment1) != signature(fragment2)

  # Calculate distances
  total = selector_distance(fragment1.sc::selector.first, fragment2.sc::selector.first)
  total += selector_distance(fragment1.sc::identifier.first, fragment2.sc::identifier.first) if fragment1.sc::identifier.first

  # Calculate subfragments' distances. Work on a copy: paired subfragments are
  # removed from the list and fragment2's own data must not be altered.
  unpaired = fragment2.sc::subfragment.dup
  subdistances = fragment1.sc::subfragment.map do |subfragment1|
    subfragment2 = unpaired.find { |f| signature(subfragment1) == signature(f) }
    unpaired.delete subfragment2

    subfragment2.nil? ? 500.0 : distance(subfragment1, subfragment2)
  end

  final_distance = total + subdistances.inject(0.0) { |sum, d| sum + d } + unpaired.size*500.0
  @distances[[fragment1.id, fragment2.id]] = final_distance
  @distances[[fragment2.id, fragment1.id]] = final_distance
end
286
+
287
# Computes a distance between two selectors from their visual properties.
#
# Adds up the absolute differences of the first value of every position and
# size bound, the font size bounds (weighted by 100) and the font weight
# bounds, plus 100 when the font families differ and 500 when the tags
# differ. Missing values count as 0. Returns a Float.
def selector_distance selector1, selector2
  gap = lambda { |values1, values2| (values1.first.to_i - values2.first.to_i).abs }

  total = 0.0

  # Position and size bounds
  total += gap.call(selector1.sc::min_relative_x, selector2.sc::min_relative_x)
  total += gap.call(selector1.sc::max_relative_x, selector2.sc::max_relative_x)
  total += gap.call(selector1.sc::min_relative_y, selector2.sc::min_relative_y)
  total += gap.call(selector1.sc::max_relative_y, selector2.sc::max_relative_y)
  total += gap.call(selector1.sc::min_x, selector2.sc::min_x)
  total += gap.call(selector1.sc::max_x, selector2.sc::max_x)
  total += gap.call(selector1.sc::min_y, selector2.sc::min_y)
  total += gap.call(selector1.sc::max_y, selector2.sc::max_y)
  total += gap.call(selector1.sc::min_width, selector2.sc::min_width)
  total += gap.call(selector1.sc::max_width, selector2.sc::max_width)
  total += gap.call(selector1.sc::min_height, selector2.sc::min_height)
  total += gap.call(selector1.sc::max_height, selector2.sc::max_height)

  # Font properties
  total += gap.call(selector1.sc::min_font_size, selector2.sc::min_font_size) * 100
  total += gap.call(selector1.sc::max_font_size, selector2.sc::max_font_size) * 100
  total += gap.call(selector1.sc::min_font_weight, selector2.sc::min_font_weight)
  total += gap.call(selector1.sc::max_font_weight, selector2.sc::max_font_weight)

  # Fixed penalties for categorical mismatches
  total += 100 unless selector1.sc::font_family == selector2.sc::font_family
  total += 500 unless selector1.sc::tag == selector2.sc::tag
  total
end
309
+
310
# Average score of a set of fragments over a collection of documents.
#
# Each document is scored with +doc_score+ and the mean is returned. Returns
# 0.0 when there are no fragments or no documents (dividing by an empty
# collection's size would otherwise produce NaN).
def score fragments, docs, kb_type
  return 0.0 unless fragments
  return 0.0 if docs.empty?

  total = docs.inject(0.0) { |sum, doc| doc_score(fragments, doc, kb_type) + sum }
  total / docs.size.to_f
end
314
+
315
# Scores a set of fragments against one document.
#
# Extracts triples from the document with the fragments' proxies and compares
# them with the expected output stored under doc[:output]. Returns the
# f-score when kb_type is :patterns and the recall otherwise.
def doc_score fragments, doc, kb_type
  # Hack to reduce symbol creation: restore the id counter after extracting
  saved_count = RDF::ID.count
  proxies = fragments.map(&:proxy)
  extracted = extract_graph(proxies, :doc=>doc).triples
  RDF::ID.count = saved_count

  expected = doc[:output]
  _precision, recall, fscore = metrics(expected, extracted, true)

  if kb_type == :patterns
    fscore
  else
    recall
  end
end
325
+
326
# Computes precision, recall and f-score of an extraction.
#
# correct    - collection of expected triples
# extraction - collection of extracted triples
# debug      - when true, prints the wrong and missing triples and the f-score
#
# Precision and recall default to 1.0 when their denominator is empty. The
# f-score is 0.0 when both precision and recall are zero (the harmonic mean
# would otherwise be 0.0/0.0, i.e. NaN).
#
# Returns [ precision, recall, fscore ].
def metrics correct, extraction, debug=false
  missing = correct - extraction
  right = correct.size - missing.size

  if debug
    puts " Wrong triples: \n" + RDF::Graph.new(extraction - correct).to_ntriples
    puts " Missing triples: \n" + RDF::Graph.new(missing).to_ntriples
  end

  precision = extraction.size != 0 ? right/extraction.size.to_f : 1.0
  recall = correct.size != 0 ? right/correct.size.to_f : 1.0

  # Calculate fscore
  fscore = (precision + recall) > 0 ? 2.0*(recall*precision)/(precision+recall) : 0.0

  puts " Fscore: #{fscore}" if debug

  [ precision, recall, fscore ]
end
344
+
345
+ private
346
# Short label for a fragment: the compressed ids of its first sc:type and
# first sc:relation (whichever are present), separated by ", ".
def short_name fragment
  ids = [ fragment.sc::type.first, fragment.sc::relation.first ].compact
  ids.map { |id| RDF::ID.compress(id) }.join(", ")
end
351
+
352
# Builds a selector that matches a set of URIs.
#
# When all the URIs are the same, an sc:UriSelector for that URI is returned.
# Otherwise an sc:UriPatternSelector is returned whose value is the longest
# prefix shared by all the URIs (always shorter than every URI) followed by
# "*". When the URIs share no prefix at all the pattern is just "*".
#
# NOTE(review): uris is expected to be non-empty; an empty list is not handled.
def uri_selector_for uris
  selector = Node(nil)
  if uris.uniq.size == 1
    selector.rdf::type = Node('sc:UriSelector')
    selector.rdf::value = uris.first
  else
    shortest = uris.map(&:length).min
    reference = uris.first

    # Try prefixes from the longest one that still leaves at least one
    # character for the wildcard down to a single character
    pattern = ""
    (shortest - 1).downto(1) do |length|
      candidate = reference[0, length]
      if uris.all? { |uri| uri[0, length] == candidate }
        pattern = candidate
        break
      end
    end

    selector.rdf::type = Node('sc:UriPatternSelector')
    selector.rdf::value = pattern + "*"
  end
  selector
end
370
+
371
# Parses an xpath expression into an array with one hash per step:
#
#   { :tag => String, :conditions => Array of Strings, :index => Integer or nil }
#
# A step looks like tag, tag[index], tag[cond and cond] or
# tag[cond and cond][index]. A purely numeric predicate is the index.
# A leading '/' is dropped; an expression without steps yields [].
#
# NOTE(review): steps are split on '/' and '[' only, so conditions that
# contain those characters are not supported.
def xpath_for expression
  start = expression[0..0]=='/' ? 1 : 0
  (expression.split('/')[start..-1] || []).map do |term|
    tag, predicate, position = term.split('[')
    if predicate && predicate =~ /\A\d+\]?\z/
      # It's the index in fact
      conditions = []
      index = predicate.to_i
    else
      # Drop the closing bracket before splitting the conditions
      conditions = predicate ? predicate.chop.split(" and ") : []
      index = position.to_i if position
    end
    { :tag=>tag, :conditions=>conditions, :index=>index }
  end
end
389
+
390
# Serializes an xpath expression from an array of steps, each a hash with
# :tag, :conditions and :index. Conditions are joined with ' and ' inside one
# predicate and the index, when present, is added as a second predicate.
# The result starts with '/' unless the first step's tag is '.'.
def xpath_expression_for xpath
  prefix = xpath.first[:tag] == '.' ? "" : "/"

  steps = xpath.map do |term|
    predicate = term[:conditions].size > 0 ? "[" + term[:conditions].join(' and ') + "]" : ""
    position = term[:index] ? "[" + term[:index].to_s + "]" : ""
    term[:tag] + predicate + position
  end

  prefix + steps.join('/')
end
150
398
  end
151
399
  end