scrappy 0.3.5 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,43 +3,79 @@ module Scrappy
3
3
  # Generates visual patterns
4
4
  def train *samples
5
5
  RDF::Graph.new( samples.inject([]) do |triples, sample|
6
- triples + train_sample(sample).triples
6
+ triples + train_sample( sample ).triples
7
7
  end )
8
8
  end
9
-
9
+
10
+ # Generate XPath fragments
11
+ def train_xpath *samples
12
+ RDF::Graph.new( samples.inject([]) do |triples, sample|
13
+ triples + train_sample(sample, true).triples
14
+ end )
15
+ end
16
+
10
17
  private
11
- def train_sample sample
12
- results = RDF::Graph.new extract(sample[:uri], sample[:html], Scrappy::Kb.extractors, :minimum)
18
+ def train_sample sample, xpath=false
19
+ sample = sample.merge(:content=>Nokogiri::HTML(sample[:html], nil, 'utf-8'))
20
+ results = RDF::Graph.new extract(sample[:uri], sample[:html], xpath ? Scrappy::Kb.patterns : Scrappy::Kb.extractors, :minimum)
13
21
 
14
22
  typed_nodes = results.find(nil, Node("rdf:type"), [])
15
23
  non_root_nodes = results.find([], [], nil)
16
24
 
17
25
  nodes = typed_nodes - non_root_nodes
18
-
26
+
27
+ superfragment = Node(nil)
28
+ selector = Node(nil)
29
+ identifier = Node(nil)
30
+ selector.rdf::type = Node('sc:UriSelector')
31
+ selector.rdf::value = sample[:uri]
32
+ identifier.rdf::type = Node('sc:BaseUriSelector')
33
+ superfragment.rdf::type = Node('sc:Fragment')
34
+ superfragment.sc::selector = selector
35
+ superfragment.sc::identifier = identifier
36
+ superfragment.graph << selector
37
+ superfragment.graph << identifier
38
+
19
39
  RDF::Graph.new( nodes.inject([]) do |triples, node|
20
- triples + fragment_for(node).graph.triples
21
- end )
40
+ fragment = fragment_for(node, sample, xpath)
41
+ # Include a superfragment that limits the fragment to a specified URI
42
+ if xpath
43
+ other_triples = [ [superfragment.id, ID('sc:subfragment'), fragment.id] ]
44
+ else
45
+ other_triples = []
46
+ end
47
+
48
+ triples + fragment.graph.triples + other_triples
49
+ end + (xpath ? superfragment.graph.triples : []) )
22
50
  end
23
51
 
24
- def fragment_for node, parent=nil
52
+ def fragment_for node, sample, xpath, parent=nil, parent_path=nil
25
53
  fragment = Node(nil)
54
+ node_path = node.sc::source.first.sc::selector.first.sc::path.first
26
55
  node.keys.each do |predicate|
27
56
  case predicate
28
57
  when ID("sc:source") then
29
- selector = selector_for(node.sc::source.first, parent)
58
+ selector = selector_for(node.sc::source.first, sample, xpath, parent, parent_path)
30
59
  fragment.graph << selector
31
60
  fragment.sc::selector = selector
32
61
  when ID("sc:uri") then
33
- selector = selector_for(node.sc::uri.first.sc::source.first, node)
62
+ selector = selector_for(node.sc::uri.first.sc::source.first, sample, xpath, node, node_path)
34
63
  fragment.graph << selector
35
64
  fragment.sc::identifier = selector
36
65
  when ID("rdf:type") then
37
66
  fragment.sc::type = node.rdf::type
38
67
  else
39
68
  if node[predicate].map(&:class).uniq.first == RDF::Node
69
+ done = []
40
70
  node[predicate].map do |subnode|
41
- subfragment = fragment_for(subnode, node)
71
+ selector = subnode.sc::source.first.sc::selector.first
72
+ next if done.include?( {}.merge(selector) )
73
+ done << {}.merge(selector)
74
+
75
+ subfragment = fragment_for(subnode, sample, xpath, node, node_path)
42
76
  subfragment.sc::relation = Node(predicate)
77
+ subfragment.sc::min_cardinality = "1"
78
+ subfragment.sc::max_cardinality = "1"
43
79
 
44
80
  fragment.graph << subfragment
45
81
  fragment.sc::subfragment += [subfragment]
@@ -48,47 +84,83 @@ module Scrappy
48
84
  end
49
85
  end
50
86
  fragment.rdf::type = Node("sc:Fragment")
51
- fragment.sc::min_cardinality = "1"
52
- fragment.sc::max_cardinality = "1"
87
+
53
88
  fragment
54
89
  end
55
90
 
56
- def selector_for fragment, parent=nil
91
+ def selector_for fragment, sample, xpath=false, parent=nil, parent_path=nil
57
92
  fragment_selector = fragment.sc::selector.first
58
93
  presentation = fragment.sc::presentation.first
59
94
 
60
95
  selector = Node(nil)
61
- selector.rdf::type = Node("sc:VisualSelector")
62
-
63
- origin_x = parent ? parent.sc::source.first.sc::presentation.first.sc::x.first.to_i : 0
64
- origin_y = parent ? parent.sc::source.first.sc::presentation.first.sc::y.first.to_i : 0
65
-
66
- relative_x = presentation.sc::x.first.to_i - origin_x
67
- relative_y = presentation.sc::y.first.to_i - origin_y
68
96
 
69
- selector.sc::min_relative_x = relative_x.to_s
70
- selector.sc::max_relative_x = relative_x.to_s
71
- selector.sc::min_relative_y = relative_y.to_s
72
- selector.sc::max_relative_y = relative_y.to_s
73
- selector.sc::min_x = presentation.sc::x
74
- selector.sc::max_x = presentation.sc::x
75
- selector.sc::min_y = presentation.sc::y
76
- selector.sc::max_y = presentation.sc::y
77
-
78
- selector.sc::min_width = presentation.sc::width
79
- selector.sc::max_width = presentation.sc::width
80
- selector.sc::min_height = presentation.sc::height
81
- selector.sc::max_height = presentation.sc::height
82
- selector.sc::min_font_size = presentation.sc::font_size
83
- selector.sc::max_font_size = presentation.sc::font_size
84
- selector.sc::min_font_weight = presentation.sc::font_weight
85
- selector.sc::max_font_weight = presentation.sc::font_weight
86
- selector.sc::font_family = presentation.sc::font_family
97
+ if xpath
98
+ selector.rdf::type = Node("sc:XPathSelector")
99
+ selector.sc::text = presentation.sc::text
100
+ selector.rdf::value = path_for fragment_selector.sc::path.first, parent_path, sample
101
+ else
102
+ selector.rdf::type = Node("sc:VisualSelector")
103
+
104
+ origin_x = parent ? parent.sc::source.first.sc::presentation.first.sc::x.first.to_i : 0
105
+ origin_y = parent ? parent.sc::source.first.sc::presentation.first.sc::y.first.to_i : 0
106
+
107
+ relative_x = presentation.sc::x.first.to_i - origin_x
108
+ relative_y = presentation.sc::y.first.to_i - origin_y
109
+
110
+ selector.sc::min_relative_x = relative_x.to_s
111
+ selector.sc::max_relative_x = relative_x.to_s
112
+ selector.sc::min_relative_y = relative_y.to_s
113
+ selector.sc::max_relative_y = relative_y.to_s
114
+ selector.sc::min_x = presentation.sc::x
115
+ selector.sc::max_x = presentation.sc::x
116
+ selector.sc::min_y = presentation.sc::y
117
+ selector.sc::max_y = presentation.sc::y
118
+
119
+ selector.sc::min_width = presentation.sc::width
120
+ selector.sc::max_width = presentation.sc::width
121
+ selector.sc::min_height = presentation.sc::height
122
+ selector.sc::max_height = presentation.sc::height
123
+ selector.sc::min_font_size = presentation.sc::font_size
124
+ selector.sc::max_font_size = presentation.sc::font_size
125
+ selector.sc::min_font_weight = presentation.sc::font_weight
126
+ selector.sc::max_font_weight = presentation.sc::font_weight
127
+ selector.sc::font_family = presentation.sc::font_family
128
+
129
+ selector.sc::tag = ["text"] if presentation.sc::font_family.first
130
+ special_tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
131
+ selector.sc::tag = special_tag if special_tag.size > 0
132
+ end
87
133
 
88
- selector.sc::tag = fragment_selector.sc::tag.select { |tag| ["a","img"].include?(tag) }
89
134
  selector.sc::attribute = fragment_selector.sc::attribute
90
135
 
91
136
  selector
92
137
  end
138
+
139
+ def path_for path, parent_path, sample
140
+ return "./." if path == parent_path
141
+ return path if ["", "/html", "/html/body"].include?(path)
142
+
143
+ node = sample[:content].search(path).first
144
+ conditions = []
145
+ if node[:class]
146
+ conditions += node[:class].split(" ").map {|c| "contains(concat(' ',normalize-space(@class),' '),concat(' ','#{c.strip}',' '))" }
147
+ else
148
+ conditions += ["not(@class)"]
149
+ end
150
+ if node[:id]
151
+ conditions += ["contains(@id,'#{node[:id].strip}')"]
152
+ else
153
+ conditions += ["not(@id)"]
154
+ end
155
+ selector = "/#{node.name}[#{conditions * " and "}]"
156
+ index = nil
157
+ results = node.parent.search(".#{selector}")
158
+ results.each_with_index { |n,i| index = i+1 if n.path == path }
159
+
160
+ previous_path = path.split("/")[0..-2] * "/"
161
+ suffix = results.size > 1 ? "[#{index}]" : ""
162
+
163
+ path_for(previous_path, parent_path, sample) + selector + suffix
164
+ end
93
165
  end
94
166
  end
@@ -37,11 +37,13 @@ module Scrappy
37
37
  app.post '/extractors' do
38
38
  if params[:html]
39
39
  # Generate extractor automatically
40
- iconv = Iconv.new(params[:encoding], 'UTF-8')
41
- html = iconv.iconv(params[:html])
42
- puts params[:html]
43
- puts params[:uri]
44
- raise Exception, "Automatic generation of extractors is not supported yet"
40
+ html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
41
+ samples = [{ :html=>html, :uri=>params[:uri] }]
42
+ extractor = agent.train_xpath(*samples)
43
+ # Train
44
+ Scrappy::App.add_extractor extractor
45
+ # Optimize
46
+ Scrappy::App.replace_extractor agent.optimize_extractors(Scrappy::Kb.extractors, samples), samples
45
47
  else
46
48
  # Store the given extractor
47
49
  Scrappy::App.add_extractor RDF::Parser.parse(:yarf,params[:rdf])
@@ -58,14 +60,30 @@ module Scrappy
58
60
  # Patterns
59
61
 
60
62
  app.get '/patterns' do
61
- @uris = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
62
- Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) ).
63
- map { |node| node.sc::type }.flatten.map(&:to_s).sort
63
+ @patterns = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
64
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) )
64
65
  haml :patterns
65
66
  end
66
67
 
67
- app.delete '/patterns/*' do |uri|
68
- Scrappy::App.delete_pattern uri
68
+ app.get '/patterns/visual' do
69
+ @patterns = ( Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
70
+ Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil) )
71
+ html = @patterns.map { |pattern| render_fragment(pattern) } * ""
72
+ "<html><body>#{html}</body></html>"
73
+ end
74
+
75
+ app.get '/patterns/*' do |id|
76
+ "<html><body>#{render_fragment(Scrappy::Kb.patterns[id])}</body></html>"
77
+ end
78
+
79
+ app.delete '/patterns' do
80
+ Scrappy::App.delete_patterns
81
+ flash[:notice] = "Patterns deleted"
82
+ redirect "#{settings.base_uri}/patterns"
83
+ end
84
+
85
+ app.delete '/patterns/*' do |id|
86
+ Scrappy::App.delete_pattern id
69
87
  flash[:notice] = "Pattern deleted"
70
88
  redirect "#{settings.base_uri}/patterns"
71
89
  end
@@ -78,9 +96,34 @@ module Scrappy
78
96
  end
79
97
 
80
98
  app.get '/samples/:id' do |id|
99
+ Nokogiri::HTML(Scrappy::App.samples[id.to_i][:html], nil, 'utf-8').search("*").map do |node|
100
+ next if node.text?
101
+ text = node.children.map { |n| n.content if n.text? } * " "
102
+ x = node[:vx].to_i
103
+ y = node[:vy].to_i
104
+ w = node[:vw].to_i
105
+ h = node[:vh].to_i
106
+ font = node[:vfont]
107
+ size = node[:vsize].to_i
108
+ weight = node[:vweight].to_i
109
+ color = "#555"
110
+ color = "#55f" if node.name == "a"
111
+ style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid gray; color: #{color};"
112
+ style += "background-color: #f00; opacity: 0.2;" if node.name == "img"
113
+ style += "text-decoration: underline;" if node.name == "a"
114
+ "<div style='#{style}'>#{text}</div>"
115
+ end * ""
116
+ end
117
+
118
+ app.get '/samples/:id/raw' do |id|
81
119
  Scrappy::App.samples[id.to_i][:html]
82
120
  end
83
-
121
+
122
+ app.get '/samples/:id/annotations' do |id|
123
+ headers 'Content-Type' => 'text/plain'
124
+ RDF::Graph.new(Scrappy::App.samples[id.to_i][:output] || []).serialize(:yarf)
125
+ end
126
+
84
127
  app.get '/samples/:id/:kb_type' do |id,kb_type|
85
128
  kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
86
129
  sample = Scrappy::App.samples[id.to_i]
@@ -88,19 +131,109 @@ module Scrappy
88
131
  RDF::Graph.new(agent.extract(sample[:uri], sample[:html], kb, Agent::Options.referenceable)).serialize(:yarf)
89
132
  end
90
133
 
91
- app.post '/samples/:id/train' do |id|
92
- new_extractor = agent.train Scrappy::App.samples[id.to_i]
93
- Scrappy::App.add_pattern new_extractor
134
+ app.post '/samples/annotate' do
135
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }.each do |sample|
136
+ sample[:output] = agent.extract(sample[:uri], sample[:html], Scrappy::Kb.extractors)
137
+ end
138
+ Scrappy::App.save_samples
139
+ flash[:notice] = "Samples annotated"
140
+ redirect "#{settings.base_uri}/samples"
141
+ end
142
+
143
+ app.post '/samples/train/:kb_type' do |kb_type|
144
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
145
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
146
+ if kb_type == "patterns"
147
+ Scrappy::App.add_patterns agent.train(*samples)
148
+ else
149
+ Scrappy::App.add_extractor agent.train_xpath(*samples)
150
+ end
94
151
  flash[:notice] = "Training completed"
95
152
  redirect "#{settings.base_uri}/samples"
96
153
  end
97
154
 
98
- app.post '/samples/:id/optimize' do |id|
99
- Scrappy::Kb.patterns = agent.optimize_patterns(Scrappy::Kb.patterns, Scrappy::App.samples[id.to_i])
100
- Scrappy::App.save_patterns Scrappy::Kb.patterns
155
+ app.post '/samples/optimize/:kb_type' do |kb_type|
156
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
157
+ samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
158
+ if kb_type == "patterns"
159
+ Scrappy::App.save_patterns agent.optimize_patterns(kb, samples)
160
+ else
161
+ Scrappy::App.replace_extractor agent.optimize_extractors(kb, samples), samples
162
+ end
101
163
  flash[:notice] = "Optimization completed"
102
164
  redirect "#{settings.base_uri}/samples"
103
165
  end
166
+
167
+ app.post '/samples/test/:kb_type' do |kb_type|
168
+ kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.extractors)
169
+ @results = {}
170
+ @missing = []
171
+ @wrong = []
172
+ output = RDF::Parser.parse(:ntriples, params["output"].to_s).triples
173
+ extraction = []
174
+ (params['samples'] || []).each do |i|
175
+ sample = Scrappy::App.samples[i.to_i]
176
+ output += sample[:output] || []
177
+ extraction += agent.extract(sample[:uri], sample[:html], kb)
178
+ end
179
+
180
+ output = output.uniq
181
+ extraction = extraction.uniq
182
+
183
+ predicates = output.map { |s,p,o| p }.uniq
184
+ types = output.map { |s,p,o| o if p == ID('rdf:type') }.compact.uniq
185
+
186
+ predicates.each do |predicate|
187
+ new_output = output.select { |s,p,o| p==predicate }
188
+ new_extraction = extraction.select { |s,p,o| p==predicate }
189
+ precision, recall, fscore = agent.send :metrics, new_output, new_extraction
190
+ @results[predicate] ||= Hash.new(0.0)
191
+ @results[predicate][:count] += 1
192
+ @results[predicate][:fscore] += fscore
193
+ @results[predicate][:precision] += precision
194
+ @results[predicate][:recall] += recall
195
+ end
196
+
197
+ types.each do |type|
198
+ new_output = output.select { |s,p,o| p==ID("rdf:type") and o==type }
199
+ new_extraction = extraction.select { |s,p,o| p==ID("rdf:type") and o==type }
200
+
201
+ precision, recall, fscore = agent.send :metrics, new_output, new_extraction
202
+ @results[type] ||= Hash.new(0.0)
203
+ @results[type][:count] += 1
204
+ @results[type][:fscore] += fscore
205
+ @results[type][:precision] += precision
206
+ @results[type][:recall] += recall
207
+ end
208
+
209
+ precision, recall, fscore = agent.send :metrics, output, extraction
210
+ @results[:total] ||= Hash.new(0.0)
211
+ @results[:total][:count] += 1
212
+ @results[:total][:fscore] += fscore
213
+ @results[:total][:precision] += precision
214
+ @results[:total][:recall] += recall
215
+
216
+ @missing += output - extraction
217
+ @wrong += extraction - output
218
+
219
+ # Here we get sth like: { :'dc:title'=>{:fscore=>0.3, ...}, :total=>{:fscore=>0.4, ...} }
220
+ @results.each do |key, result|
221
+ count = result[:count]
222
+ result.each do |k,v|
223
+ result[k] /= count
224
+ end
225
+ end
226
+
227
+ @total = output.size
228
+ @extracted = extraction.size
229
+ @correct = @extracted - @wrong.size
230
+
231
+ @missing = RDF::Graph.new(@missing)
232
+ @wrong = RDF::Graph.new(@wrong)
233
+
234
+ flash.now[:notice] = "Testing completed"
235
+ haml :test
236
+ end
104
237
 
105
238
  app.post '/samples' do
106
239
  html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
@@ -114,6 +247,36 @@ module Scrappy
114
247
  flash[:notice] = "Sample deleted"
115
248
  redirect "#{settings.base_uri}/samples"
116
249
  end
250
+
251
+ def render_fragment fragment, selected_branch=nil
252
+ label = if fragment.sc::relation.first
253
+ fragment.sc::relation.map {|id| RDF::ID.compress(id)} * ', '
254
+ else
255
+ fragment.sc::type.map {|id| RDF::ID.compress(id)} * ', '
256
+ end
257
+ subfragments = [selected_branch || [:min, :max]].flatten.map do |branch|
258
+ fragment.sc::subfragment.map { |f| render_fragment(f, branch) } * ""
259
+ end * ""
260
+
261
+ [selected_branch || [:min, :max]].flatten.map do |branch|
262
+ fragment.sc::selector.map do |selector|
263
+ x,y,w,h,font,size,weight,color = case branch
264
+ when :min then
265
+ [selector.sc::min_relative_x.first, selector.sc::min_relative_y.first, selector.sc::min_width.first, selector.sc::min_height.first, selector.sc::font_family.first, selector.sc::min_font_size.first, selector.sc::min_font_weight.first, :blue]
266
+ when :max then
267
+ [selector.sc::max_relative_x.first, selector.sc::max_relative_y.first, selector.sc::max_width.first, selector.sc::max_height.first, selector.sc::font_family.first, selector.sc::max_font_size.first, selector.sc::max_font_weight.first, :red]
268
+ end
269
+ style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid #{color}; color: #555;"
270
+ "<div style='#{style}'>#{label}#{subfragments}</div>"
271
+ end * ""
272
+ end * ""
273
+ end
274
+
275
+ def percentage value
276
+ "%.2f%" % (value * 100.0)
277
+ end
278
+
279
+ app.helpers Admin
117
280
  end
118
281
  end
119
282
  end
@@ -29,28 +29,4 @@ class String
29
29
  tr("-", "_").
30
30
  downcase
31
31
  end
32
- end
33
-
34
- class Array
35
- # Return true if a given array has the same elements as this one
36
- def equivalent? array
37
- self.all? { |i| array.include?(i) } and
38
- array.all? { |i| self.include?(i) }
39
- end
40
- end
41
-
42
- module RDF
43
- class Node
44
- def self.mix *nodes
45
- id = nodes.first
46
- graph = RDF::Graph.new( nodes.inject([]) do |triples, node|
47
- triples + node.graph.triples.map do |s,p,o|
48
- [ s==node.id ? id : s,
49
- p==node.id ? id : p,
50
- o==node.id ? id : o ]
51
- end
52
- end )
53
- graph[id]
54
- end
55
- end
56
32
  end