scrappy 0.3.5 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,43 +3,79 @@ module Scrappy
3
3
  # Generates visual patterns
4
4
  def train *samples
5
5
  RDF::Graph.new( samples.inject([]) do |triples, sample|
6
- triples + train_sample(sample).triples
6
+ triples + train_sample( sample ).triples
7
7
  end )
8
8
  end
9
-
9
+
10
+ # Generate XPath fragments
11
+ def train_xpath *samples
12
+ RDF::Graph.new( samples.inject([]) do |triples, sample|
13
+ triples + train_sample(sample, true).triples
14
+ end )
15
+ end
16
+
10
17
  private
11
- def train_sample sample
12
- results = RDF::Graph.new extract(sample[:uri], sample[:html], Scrappy::Kb.extractors, :minimum)
18
+ def train_sample sample, xpath=false
19
+ sample = sample.merge(:content=>Nokogiri::HTML(sample[:html], nil, 'utf-8'))
20
+ results = RDF::Graph.new extract(sample[:uri], sample[:html], xpath ? Scrappy::Kb.patterns : Scrappy::Kb.extractors, :minimum)
13
21
 
14
22
  typed_nodes = results.find(nil, Node("rdf:type"), [])
15
23
  non_root_nodes = results.find([], [], nil)
16
24
 
17
25
  nodes = typed_nodes - non_root_nodes
18
-
26
+
27
+ superfragment = Node(nil)
28
+ selector = Node(nil)
29
+ identifier = Node(nil)
30
+ selector.rdf::type = Node('sc:UriSelector')
31
+ selector.rdf::value = sample[:uri]
32
+ identifier.rdf::type = Node('sc:BaseUriSelector')
33
+ superfragment.rdf::type = Node('sc:Fragment')
34
+ superfragment.sc::selector = selector
35
+ superfragment.sc::identifier = identifier
36
+ superfragment.graph << selector
37
+ superfragment.graph << identifier
38
+
19
39
  RDF::Graph.new( nodes.inject([]) do |triples, node|
20
- triples + fragment_for(node).graph.triples
21
- end )
40
+ fragment = fragment_for(node, sample, xpath)
41
+ # Include a superfragment that limits the fragment to a specified URI
42
+ if xpath
43
+ other_triples = [ [superfragment.id, ID('sc:subfragment'), fragment.id] ]
44
+ else
45
+ other_triples = []
46
+ end
47
+
48
+ triples + fragment.graph.triples + other_triples
49
+ end + (xpath ? superfragment.graph.triples : []) )
22
50
  end
23
51
 
24
- def fragment_for node, parent=nil
52
+ def fragment_for node, sample, xpath, parent=nil, parent_path=nil
25
53
  fragment = Node(nil)
54
+ node_path = node.sc::source.first.sc::selector.first.sc::path.first
26
55
  node.keys.each do |predicate|
27
56
  case predicate
28
57
  when ID("sc:source") then
29
- selector = selector_for(node.sc::source.first, parent)
58
+ selector = selector_for(node.sc::source.first, sample, xpath, parent, parent_path)
30
59
  fragment.graph << selector
31
60
  fragment.sc::selector = selector
32
61
  when ID("sc:uri") then
33
- selector = selector_for(node.sc::uri.first.sc::source.first, node)
62
+ selector = selector_for(node.sc::uri.first.sc::source.first, sample, xpath, node, node_path)
34
63
  fragment.graph << selector
35
64
  fragment.sc::identifier = selector
36
65
  when ID("rdf:type") then
37
66
  fragment.sc::type = node.rdf::type
38
67
  else
39
68
  if node[predicate].map(&:class).uniq.first == RDF::Node
69
+ done = []
40
70
  node[predicate].map do |subnode|
41
- subfragment = fragment_for(subnode, node)
71
+ selector = subnode.sc::source.first.sc::selector.first
72
+ next if done.include?( {}.merge(selector) )
73
+ done << {}.merge(selector)
74
+
75
+ subfragment = fragment_for(subnode, sample, xpath, node, node_path)
42
76
  subfragment.sc::relation = Node(predicate)
77
+ subfragment.sc::min_cardinality = "1"
78
+ subfragment.sc::max_cardinality = "1"
43
79
 
44
80
  fragment.graph << subfragment
45
81
  fragment.sc::subfragment += [subfragment]
@@ -48,47 +84,83 @@ module Scrappy
48
84
  end
49
85
  end
50
86
  fragment.rdf::type = Node("sc:Fragment")
51
- fragment.sc::min_cardinality = "1"
52
- fragment.sc::max_cardinality = "1"
87
+
53
88
  fragment
54
89
  end
55
90
 
56
- def selector_for fragment, parent=nil
91
+ def selector_for fragment, sample, xpath=false, parent=nil, parent_path=nil
57
92
  fragment_selector = fragment.sc::selector.first
58
93
  presentation = fragment.sc::presentation.first
59
94
 
60
95
  selector = Node(nil)
61
- selector.rdf::type = Node("sc:VisualSelector")
62
-
63
- origin_x = parent ? parent.sc::source.first.sc::presentation.first.sc::x.first.to_i : 0
64
- origin_y = parent ? parent.sc::source.first.sc::presentation.first.sc::y.first.to_i : 0
65
-
66
- relative_x = presentation.sc::x.first.to_i - origin_x
67
- relative_y = presentation.sc::y.first.to_i - origin_y
68
96
 
69
- selector.sc::min_relative_x = relative_x.to_s
70
- selector.sc::max_relative_x = relative_x.to_s
71
- selector.sc::min_relative_y = relative_y.to_s
72
- selector.sc::max_relative_y = relative_y.to_s
73
- selector.sc::min_x = presentation.sc::x
74
- selector.sc::max_x = presentation.sc::x
75
- selector.sc::min_y = presentation.sc::y
76
- selector.sc::max_y = presentation.sc::y
77
-
78
- selector.sc::min_width = presentation.sc::width
79
- selector.sc::max_width = presentation.sc::width
80
- selector.sc::min_height = presentation.sc::height
81
- selector.sc::max_height = presentation.sc::height
82
- selector.sc::min_font_size = presentation.sc::font_size
83
- selector.sc::max_font_size = presentation.sc::font_size
84
- selector.sc::min_font_weight = presentation.sc::font_weight
85
- selector.sc::max_font_weight = presentation.sc::font_weight
86
- selector.sc::font_family = presentation.sc::font_family
97
+ if xpath
98
+ selector.rdf::type = Node("sc:XPathSelector")
99
+ selector.sc::text = presentation.sc::text
100
+ selector.rdf::value = path_for fragment_selector.sc::path.first, parent_path, sample
101
+ else
102
+ selector.rdf::type = Node("sc:VisualSelector")
103
+
104
+ origin_x = parent ? parent.sc::source.first.sc::presentation.first.sc::x.first.to_i : 0
105
+ origin_y = parent ? parent.sc::source.first.sc::presentation.first.sc::y.first.to_i : 0
106
+
107
+ relative_x = presentation.sc::x.first.to_i - origin_x
108
+ relative_y = presentation.sc::y.first.to_i - origin_y
109
+
110
+ selector.sc::min_relative_x = relative_x.to_s
111
+ selector.sc::max_relative_x = relative_x.to_s
112
+ selector.sc::min_relative_y = relative_y.to_s
113
+ selector.sc::max_relative_y = relative_y.to_s
114
+ selector.sc::min_x = presentation.sc::x
115
+ selector.sc::max_x = presentation.sc::x
116
+ selector.sc::min_y = presentation.sc::y
117
+ selector.sc::max_y = presentation.sc::y
118
+
119
+ selector.sc::min_width = presentation.sc::width
120
+ selector.sc::max_width = presentation.sc::width
121
+ selector.sc::min_height = presentation.sc::height
122
+ selector.sc::max_height = presentation.sc::height
123
+ selector.sc::min_font_size = presentation.sc::font_size
124
+ selector.sc::max_font_size = presentation.sc::font_size
125
+ selector.sc::min_font_weight = presentation.sc::font_weight
126
+ selector.sc::max_font_weight = presentation.sc::font_weight
127
+ selector.sc::font_family = presentation.sc::font_family
128
+
129
+ selector.sc::tag = ["text"] if presentation.sc::font_family.first
130
+ special_tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
131
+ selector.sc::tag = special_tag if special_tag.size > 0
132
+ end
87
133
 
88
- selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
89
134
  selector.sc::attribute = fragment_selector.sc::attribute
90
135
 
91
136
  selector
92
137
  end
138
+
139
+ def path_for path, parent_path, sample
140
+ return "./." if path == parent_path
141
+ return path if ["", "/html", "/html/body"].include?(path)
142
+
143
+ node = sample[:content].search(path).first
144
+ conditions = []
145
+ if node[:class]
146
+ conditions += node[:class].split(" ").map {|c| "contains(concat(' ',normalize-space(@class),' '),concat(' ','#{c.strip}',' '))" }
147
+ else
148
+ conditions += ["not(@class)"]
149
+ end
150
+ if node[:id]
151
+ conditions += ["contains(@id,'#{node[:id].strip}')"]
152
+ else
153
+ conditions += ["not(@id)"]
154
+ end
155
+ selector = "/#{node.name}[#{conditions * " and "}]"
156
+ index = nil
157
+ results = node.parent.search(".#{selector}")
158
+ results.each_with_index { |n,i| index = i+1 if n.path == path }
159
+
160
+ previous_path = path.split("/")[0..-2] * "/"
161
+ suffix = results.size > 1 ? "[#{index}]" : ""
162
+
163
+ path_for(previous_path, parent_path, sample) + selector + suffix
164
+ end
93
165
  end
94
166
  end
@@ -37,11 +37,13 @@ module Scrappy
37
37
  app.post '/extractors' do
38
38
  if params[:html]
39
39
  # Generate extractor automatically
40
- iconv = Iconv.new(params[:encoding], 'UTF-8')
41
- html = iconv.iconv(params[:html])
42
- puts params[:html]
43
- puts params[:uri]
44
- raise Exception, "Automatic generation of extractors is not supported yet"
40
+ html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
41
+ samples = [{ :html=>html, :uri=>params[:uri] }]
42
+ extractor = agent.train_xpath(*samples)
43
+ # Train
44
+ Scrappy::App.add_extractor extractor
45
+ # Optimize
46
+ Scrappy::App.replace_extractor agent.optimize_extractors(Scrappy::Kb.extractors, samples), samples
45
47
  else
46
48
  # Store the given extractor
47
49
  Scrappy::App.add_extractor RDF::Parser.parse(:yarf,params[:rdf])
@@ -58,14 +60,30 @@ module Scrappy
58
60
  # Patterns
59
61
 
60
62
  app.get '/patterns' do
61
- @uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
62
- Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
63
- map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
+ @patterns = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
64
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) )
64
65
  haml :patterns
65
66
  end
66
67
 
67
- app.delete '/patterns/*' do |uri|
68
- Scrappy::App.delete_pattern uri
68
+ app.get '/patterns/visual' do
69
+ @patterns = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
70
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) )
71
+ html = @patterns.map { |pattern| render_fragment(pattern) } * ""
72
+ "<html><body>#{html}</body></html>"
73
+ end
74
+
75
+ app.get '/patterns/*' do |id|
76
+ "<html><body>#{render_fragment(Scrappy::Kb.patterns[id])}</body></html>"
77
+ end
78
+
79
+ app.delete '/patterns' do
80
+ Scrappy::App.delete_patterns
81
+ flash[:notice] = "Patterns deleted"
82
+ redirect "#{settings.base_uri}/patterns"
83
+ end
84
+
85
+ app.delete '/patterns/*' do |id|
86
+ Scrappy::App.delete_pattern id
69
87
  flash[:notice] = "Pattern deleted"
70
88
  redirect "#{settings.base_uri}/patterns"
71
89
  end
@@ -78,9 +96,34 @@ module Scrappy
78
96
  end
79
97
 
80
98
  app.get '/samples/:id' do |id|
99
+ Nokogiri::HTML(Scrappy::App.samples[id.to_i][:html], nil, 'utf-8').search("*").map do |node|
100
+ next if node.text?
101
+ text = node.children.map { |n| n.content if n.text? } * " "
102
+ x = node[:vx].to_i
103
+ y = node[:vy].to_i
104
+ w = node[:vw].to_i
105
+ h = node[:vh].to_i
106
+ font = node[:vfont]
107
+ size = node[:vsize].to_i
108
+ weight = node[:vweight].to_i
109
+ color = "#555"
110
+ color = "#55f" if node.name == "a"
111
+ style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid gray; color: #{color};"
112
+ style += "background-color: #f00; opacity: 0.2;" if node.name == "img"
113
+ style += "text-decoration: underline;" if node.name == "a"
114
+ "<div style='#{style}'>#{text}</div>"
115
+ end * ""
116
+ end
117
+
118
+ app.get '/samples/:id/raw' do |id|
81
119
  Scrappy::App.samples[id.to_i][:html]
82
120
  end
83
-
121
+
122
+ app.get '/samples/:id/annotations' do |id|
123
+ headers 'Content-Type' => 'text/plain'
124
+ RDF::Graph.new(Scrappy::App.samples[id.to_i][:output] || []).serialize(:yarf)
125
+ end
126
+
84
127
  app.get '/samples/:id/:kb_type' do |id,kb_type|
85
128
  kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
86
129
  sample = Scrappy::App.samples[id.to_i]
@@ -88,19 +131,109 @@ module Scrappy
88
131
  RDF::Graph.new(agent.extract(sample[:uri], sample[:html], kb, Agent::Options.referenceable)).serialize(:yarf)
89
132
  end
90
133
 
91
- app.post '/samples/:id/train' do |id|
92
- new_extractor = agent.train Scrappy::App.samples[id.to_i]
93
- Scrappy::App.add_pattern new_extractor
134
+ app.post '/samples/annotate' do
135
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }.each do |sample|
136
+ sample[:output] = agent.extract(sample[:uri], sample[:html], Scrappy::Kb.extractors)
137
+ end
138
+ Scrappy::App.save_samples
139
+ flash[:notice] = "Samples annotated"
140
+ redirect "#{settings.base_uri}/samples"
141
+ end
142
+
143
+ app.post '/samples/train/:kb_type' do |kb_type|
144
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
145
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
146
+ if kb_type == "patterns"
147
+ Scrappy::App.add_patterns agent.train(*samples)
148
+ else
149
+ Scrappy::App.add_extractor agent.train_xpath(*samples)
150
+ end
94
151
  flash[:notice] = "Training completed"
95
152
  redirect "#{settings.base_uri}/samples"
96
153
  end
97
154
 
98
- app.post '/samples/:id/optimize' do |id|
99
- Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
100
- Scrappy::App.save_patterns Scrappy::Kb.patterns
155
+ app.post '/samples/optimize/:kb_type' do |kb_type|
156
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
157
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
158
+ if kb_type == "patterns"
159
+ Scrappy::App.save_patterns agent.optimize_patterns(kb, samples)
160
+ else
161
+ Scrappy::App.replace_extractor agent.optimize_extractors(kb, samples), samples
162
+ end
101
163
  flash[:notice] = "Optimization completed"
102
164
  redirect "#{settings.base_uri}/samples"
103
165
  end
166
+
167
+ app.post '/samples/test/:kb_type' do |kb_type|
168
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
169
+ @results = {}
170
+ @missing = []
171
+ @wrong = []
172
+ output = RDF::Parser.parse(:ntriples, params["output"].to_s).triples
173
+ extraction = []
174
+ (params['samples'] || []).each do |i|
175
+ sample = Scrappy::App.samples[i.to_i]
176
+ output += sample[:output] || []
177
+ extraction += agent.extract(sample[:uri], sample[:html], kb)
178
+ end
179
+
180
+ output = output.uniq
181
+ extraction = extraction.uniq
182
+
183
+ predicates = output.map { |s,p,o| p }.uniq
184
+ types = output.map { |s,p,o| o if p == ID('rdf:type') }.compact.uniq
185
+
186
+ predicates.each do |predicate|
187
+ new_output = output.select { |s,p,o| p==predicate }
188
+ new_extraction = extraction.select { |s,p,o| p==predicate }
189
+ precision, recall, fscore = agent.send :metrics, new_output, new_extraction
190
+ @results[predicate] ||= Hash.new(0.0)
191
+ @results[predicate][:count] += 1
192
+ @results[predicate][:fscore] += fscore
193
+ @results[predicate][:precision] += precision
194
+ @results[predicate][:recall] += recall
195
+ end
196
+
197
+ types.each do |type|
198
+ new_output = output.select { |s,p,o| p==ID("rdf:type") and o==type }
199
+ new_extraction = extraction.select { |s,p,o| p==ID("rdf:type") and o==type }
200
+
201
+ precision, recall, fscore = agent.send :metrics, new_output, new_extraction
202
+ @results[type] ||= Hash.new(0.0)
203
+ @results[type][:count] += 1
204
+ @results[type][:fscore] += fscore
205
+ @results[type][:precision] += precision
206
+ @results[type][:recall] += recall
207
+ end
208
+
209
+ precision, recall, fscore = agent.send :metrics, output, extraction
210
+ @results[:total] ||= Hash.new(0.0)
211
+ @results[:total][:count] += 1
212
+ @results[:total][:fscore] += fscore
213
+ @results[:total][:precision] += precision
214
+ @results[:total][:recall] += recall
215
+
216
+ @missing += output - extraction
217
+ @wrong += extraction - output
218
+
219
+ # Here we get sth like: { :'dc:title'=>{:fscore=>0.3, ...}, :total=>{:fscore=>0.4, ...} }
220
+ @results.each do |key, result|
221
+ count = result[:count]
222
+ result.each do |k,v|
223
+ result[k] /= count
224
+ end
225
+ end
226
+
227
+ @total = output.size
228
+ @extracted = extraction.size
229
+ @correct = @extracted - @wrong.size
230
+
231
+ @missing = RDF::Graph.new(@missing)
232
+ @wrong = RDF::Graph.new(@wrong)
233
+
234
+ flash.now[:notice] = "Testing completed"
235
+ haml :test
236
+ end
104
237
 
105
238
  app.post '/samples' do
106
239
  html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
@@ -114,6 +247,36 @@ module Scrappy
114
247
  flash[:notice] = "Sample deleted"
115
248
  redirect "#{settings.base_uri}/samples"
116
249
  end
250
+
251
+ def render_fragment fragment, selected_branch=nil
252
+ label = if fragment.sc::relation.first
253
+ fragment.sc::relation.map {|id| RDF::ID.compress(id)} * ', '
254
+ else
255
+ fragment.sc::type.map {|id| RDF::ID.compress(id)} * ', '
256
+ end
257
+ subfragments = [selected_branch || [:min, :max]].flatten.map do |branch|
258
+ fragment.sc::subfragment.map { |f| render_fragment(f, branch) } * ""
259
+ end * ""
260
+
261
+ [selected_branch || [:min, :max]].flatten.map do |branch|
262
+ fragment.sc::selector.map do |selector|
263
+ x,y,w,h,font,size,weight,color = case branch
264
+ when :min then
265
+ [selector.sc::min_relative_x.first, selector.sc::min_relative_y.first, selector.sc::min_width.first, selector.sc::min_height.first, selector.sc::font_family.first, selector.sc::min_font_size.first, selector.sc::min_font_weight.first, :blue]
266
+ when :max then
267
+ [selector.sc::max_relative_x.first, selector.sc::max_relative_y.first, selector.sc::max_width.first, selector.sc::max_height.first, selector.sc::font_family.first, selector.sc::max_font_size.first, selector.sc::max_font_weight.first, :red]
268
+ end
269
+ style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid #{color}; color: #555;"
270
+ "<div style='#{style}'>#{label}#{subfragments}</div>"
271
+ end * ""
272
+ end * ""
273
+ end
274
+
275
+ def percentage value
276
+ "%.2f%" % (value * 100.0)
277
+ end
278
+
279
+ app.helpers Admin
117
280
  end
118
281
  end
119
282
  end
@@ -29,28 +29,4 @@ class String
29
29
  tr("-", "_").
30
30
  downcase
31
31
  end
32
- end
33
-
34
- class Array
35
- # Return true if a given array has the same elements as this one
36
- def equivalent? array
37
- self.all? { |i| array.include?(i) } and
38
- array.all? { |i| self.include?(i) }
39
- end
40
- end
41
-
42
- module RDF
43
- class Node
44
- def self.mix *nodes
45
- id = nodes.first
46
- graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
47
- triples + node.graph.triples.map do |s,p,o|
48
- [ s==node.id ? id : s,
49
- p==node.id ? id : p,
50
- o==node.id ? id : o ]
51
- end
52
- end )
53
- graph[id]
54
- end
55
- end
56
32
  end