pagerecognizer 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61a30cc5e39e171b8eabdf26490475ce3fb041b9
4
- data.tar.gz: e64d2e3700730de8e9d3fe4d4de57f4743ca98b9
3
+ metadata.gz: 17ef706811d7513a3f7f6a109feacb59bbae91dc
4
+ data.tar.gz: e6890fcd6c6bfdd6d042513f02dea280e5c436ae
5
5
  SHA512:
6
- metadata.gz: cd4370f97135ac3df6376c2df1dcbb39cd25a35fac0ad172d29fa86764d8fb82d7ec28da7d4aaf0a547f1684cdda34f2d4de1b14de5c66ed784bb798643e96cd
7
- data.tar.gz: e517075c5eb9d4efdb5bc851865136776df7dda1eacaecb96e3d89f211c63c6a5429b0ee4286280d150f7ea338662ceebdc0f9b0039d7df8a6a2778b349629f6
6
+ metadata.gz: 74fc2c48871a01192e4ebbd80f603a170b1f953f61252f8bf21627bc3abb8555beb12572ab786754f6f015870bdaa6cec54daab23d15068c7aa0152b70c612aa
7
+ data.tar.gz: 3e6a080c6740075bab1c111127249ade7429ac844fcf1c0b99979c7a535cd5013cf9bd86629957ee7bcd4467828d023831a51df078037dff5cfc777c6a81be9e
@@ -4,14 +4,16 @@ module PageRecognizer
4
4
  end
5
5
  require "logger"
6
6
  self.logger = Logger.new STDOUT
7
+ self.logger.formatter = ->(severity, datetime, progname, msg){ "#{datetime.strftime "%H%M%S"} #{severity.to_s[0]} #{msg}\n" }
8
+ self.logger.level = ENV.fetch("LOGLEVEL_PageRecognizer", "FATAL").to_sym
7
9
 
8
10
  module Dumpable
9
11
  def dump
10
- "<html><body>#{
12
+ "<html><body style='white-space: nowrap'>#{
11
13
  map.with_index do |n, i|
12
- "<div style='position: absolute; background-color: hsla(#{
14
+ "<div id='#{i}' style='position: absolute; background-color: hsla(#{
13
15
  360 * i / size
14
- },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{
16
+ },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{i} #{
15
17
  n.node.tag_name.upcase
16
18
  }</div>"
17
19
  end.join
@@ -29,92 +31,110 @@ module PageRecognizer
29
31
  end.extend Dumpable
30
32
  end
31
33
 
32
- def recognize
34
+ def self.rgb2hsv r, g, b # [<256, <256, <256]
35
+ # http://stackoverflow.com/q/41926874/322020
36
+ r, g, b = [r, g, b].map{ |_| _.fdiv 255 }
37
+ min, max = [r, g, b].minmax
38
+ chroma = max - min
39
+ [
40
+ 60.0 * ( chroma.zero? ? 0 : case max
41
+ when r ; (g - b) / chroma
42
+ when g ; (b - r) / chroma + 2
43
+ when b ; (r - g) / chroma + 4
44
+ else 0
45
+ end % 6 ),
46
+ chroma.zero? ? 0.0 : chroma / max,
47
+ max,
48
+ ] # [<=360, <=1, <=1]
49
+ end
50
+ def self.dist h1, s1, v1, h2, s2, v2 # [<256, <256, <256]
51
+ # https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
52
+ c1, c2 = s1 * v1 / 256.0, s2 * v2 / 256.0 # chroma
53
+ z1, z2 = v1 * (2 - c1 / 256), v2 * (2 - c2 / 256)
54
+ a = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
55
+ x2 = Math::sin(a) * c2
56
+ y1, y2 = c1, Math::cos(a) * c2
57
+ x2*x2 + (y1-y2)*(y1-y2) + (z1-z2)*(z1-z2)
58
+ end
59
+
60
+ private def recognize
33
61
  logger = Module.nesting.first.logger
62
+ logger.info "method #{__method__}..."
34
63
 
35
64
  nodes = []
36
65
  try = lambda do
37
- prev = nodes
38
- code = "( function(node) {
39
- var x = scrollX, y = scrollY;
40
- var _tap = function(x, f){ f(); return x };
41
- var f = function(node) {
42
- node.scrollIntoView();
43
- var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
44
- var child_nodes = Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 });
45
- var clickable;
46
- if (node.nodeName == 'svg') {
47
- var states = child_nodes.map( function(n){
48
- return _tap(n.style ? n.style.display : '', function(){ n.style.display = 'none' } );
49
- } );
50
- clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
51
- var _zip = function(a, b){ return a.map( function(e, i) { return [e, b[i]] } ) };
52
- _zip(child_nodes, states).forEach( function(_){ _[0].style.display = _[1] } );
53
- } else {
54
- clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
55
- };
56
- rect.top += scrollY;
57
- rect.left += scrollX;
58
- return [ [
59
- rect.top, rect.left, rect.width, rect.height, clickable, node
60
- ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
61
- };
62
- return _tap(f(node), function(){ scrollTo(x, y) });
63
- } )(arguments[0])"
64
- str = Struct.new :top, :left, :width, :height, :clickable, :node
65
- nodes = page.evaluate(code, self).map{ |s| str.new *s }
66
- nodes.size == prev.size
67
- end
66
+ str = Struct.new :node, :visible, :top, :left, :width, :height, :area do
67
+ def texts
68
+ node.page.evaluate(<<~HEREDOC, node).map(&JSON.method(:load)).map do |text, rect1, rect2, style|
69
+ (function(node){
70
+ let result = [], range = document.createRange();
71
+ for (
72
+ let iterator = document.evaluate('.//text()', node, null, XPathResult.ANY_TYPE, null);
73
+ text = iterator.iterateNext();
74
+ ) {
75
+ range.selectNode(text);
76
+ result.push(JSON.stringify( [
77
+ text.wholeText,
78
+ range.getBoundingClientRect(),
79
+ text.parentNode.getBoundingClientRect(),
80
+ getComputedStyle(text.parentNode),
81
+ ] ));
82
+ }
83
+ return result;
84
+ })(arguments[0])
85
+ HEREDOC
68
86
 
69
- if defined? Selenium::WebDriver::Wait
70
- Selenium::WebDriver::Wait.new(
71
- message: "number of DOM elements didn't stop to change"
72
- ).until &try
73
- else
74
- t = Time.now
75
- until try.call
76
- fail "number of DOM elements didn't stop to change" if Time.now > t + 5
77
- end
78
- end
79
- logger.info "#{nodes.size} DOM nodes found"
87
+ # google SERP has 1x1 nodes with text _<>
88
+ next if rect1["width"] < 2 || rect1["height"] < 2
89
+ next if rect2["width"] < 2 || rect2["height"] < 2
80
90
 
81
- nodes.select! &:clickable
82
- nodes.reject do |n|
83
- nodes.any? do |nn|
84
- cs = [
85
- nn.top <=> n.top,
86
- nn.left <=> n.left,
87
- n.left + n.width <=> nn.left + nn.width,
88
- n.top + n.height <=> nn.top + nn.height,
89
- ]
90
- cs.include?(1) && !cs.include?(-1)
91
+ color = style["color"]
92
+ fail color unless /\Argba?\((?<red>\d+), (?<green>\d+), (?<blue>\d+)(, 0(\.\d+)?)?\)\z/ =~ color
93
+ closest_color = { # https://en.wikipedia.org/wiki/Web_colors#Basic_colors
94
+ white: [0, 0, 100],
95
+ silver: [0, 0, 75],
96
+ gray: [0, 0, 50],
97
+ black: [0, 0, 0],
98
+ red: [0, 100, 100],
99
+ maroon: [0, 100, 50],
100
+ yellow: [60, 100, 100],
101
+ olive: [60, 100, 50],
102
+ lime: [120, 100, 100],
103
+ green: [120, 100, 50],
104
+ aqua: [180, 100, 100],
105
+ teal: [180, 100, 50],
106
+ blue: [240, 100, 100],
107
+ navy: [240, 100, 50],
108
+ fuchsia: [300, 100, 100],
109
+ purple: [300, 100, 50],
110
+ }.to_a.min_by do |_, (h1, s1, v1)|
111
+ h2, s2, v2 = PageRecognizer.rgb2hsv(red.to_i, green.to_i, blue.to_i)
112
+ PageRecognizer.dist h1*255/360, s1*256/100, v1*256/100, h2*255/360, s2*255, v2*255
113
+ end.first
114
+ [text, style, closest_color, rect1]
115
+ end.compact
116
+ end
91
117
  end
92
- end.extend Dumpable
93
- end
94
-
95
- private def recognize_more
96
- logger = Module.nesting.first.logger
97
-
98
- nodes = []
99
- try = lambda do
100
- prev = nodes
101
- code = "( function(node) {
102
- var x = scrollX, y = scrollY;
103
- var _tap = function(x, f){ f(); return x };
104
- var f = function(node) {
105
- node.scrollIntoView();
106
- var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
107
- rect.top += scrollY;
108
- rect.left += scrollX;
109
- return [ [
110
- node, JSON.stringify([rect.top, rect.left, rect.width, rect.height])
111
- ] ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
112
- };
113
- return _tap(f(node), function(){ scrollTo(x, y) });
114
- } )(arguments[0])"
115
- str = Struct.new :node, :top, :left, :width, :height
116
- nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
117
- nodes.size == prev.size
118
+ prev = nodes.size
119
+ t = page.evaluate(<<~HEREDOC, self)
120
+ ( function(node) {
121
+ var x = scrollX, y = scrollY;
122
+ var _tap = function(x, f){ f(); return x };
123
+ var f = function(node) {
124
+ node.scrollIntoView();
125
+ var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
126
+ rect.top += scrollY;
127
+ rect.left += scrollX;
128
+ return [
129
+ node, JSON.stringify([rect.top, rect.left, rect.width, rect.height]), ("visible" == getComputedStyle(node).visibility)
130
+ ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
131
+ };
132
+ return _tap(f(node), function(){ scrollTo(x, y) });
133
+ } )(arguments[0])
134
+ HEREDOC
135
+ logger.debug [t.size / 3, prev]
136
+ nodes = t.each_slice(3).map{ |node, rect, visible| str.new(node, visible, *JSON.load(rect)).tap{ |_| _.area = _.width * _.height } }
137
+ nodes.size == prev
118
138
  end
119
139
 
120
140
  if defined? Selenium::WebDriver::Wait
@@ -128,9 +148,9 @@ module PageRecognizer
128
148
  end
129
149
  end
130
150
  logger.info "#{nodes.size} DOM nodes found"
131
-
132
- nodes.reject!{ |i| i.height.zero? || i.width.zero? }
133
- nodes
151
+ nodes.reject!{ |_| _.height.zero? || _.width.zero? || !_.visible }
152
+ logger.info "visible nodes: #{nodes.size}"
153
+ nodes.extend Dumpable
134
154
  end
135
155
 
136
156
  logging_error = Class.new RuntimeError do
@@ -143,8 +163,9 @@ module PageRecognizer
143
163
  end
144
164
  class ErrorNotEnoughNodes < logging_error ; end
145
165
 
146
- private def split heuristics, hh, ww, tt, ll
166
+ private def split hh, ww, tt, ll, heuristics, try_min, dump, &filter
147
167
  logger = Module.nesting.first.logger
168
+ logger.info heuristics
148
169
 
149
170
  unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
150
171
  ->(&b){ b.call }
@@ -159,82 +180,253 @@ module PageRecognizer
159
180
  end
160
181
  end
161
182
  end
162
- all = unstale.call do recognize_more end.sort_by(&tt)
163
- logger.info "all nodes: #{all.size}"
164
- rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
165
- inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
166
- raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
167
- logger.info "inside nodes: #{inside.size}"
168
- nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
169
- logger.info "good nodes: #{nodes.size}" # only those that might be containers
170
183
 
171
- large = nodes#.select{ |i| i[ww] > nodes.map(&ww).max / 4 }
172
- logger.info "large enough and unique: #{large.size}"
184
+ nodes = unstale.call do recognize end.sort_by{ |_| [_[tt], _[ll]] }
185
+ File.write "#{dump}.all.htm", nodes.extend(Dumpable).dump if dump
186
+
187
+
188
+ nodes = unstale.call do nodes.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name } end.uniq{ |_| [_[hh], _[ww], _[tt], _[ll]] }
189
+ logger.info "good and unique: #{nodes.size}" # only those that might be containers
190
+ File.write "#{dump}.nodes.htm", nodes.extend(Dumpable).dump if dump
173
191
 
174
192
  interfere = lambda do |a, b|
175
193
  a[tt] < b[tt] + b[hh] &&
176
194
  b[tt] < a[tt] + a[hh]
177
195
  end
178
196
 
179
- rest = large.select.with_index do |a, i|
180
- large.each_with_index.none? do |b, j|
197
+
198
+ rest = nodes.select.with_index do |a, i|
199
+ nodes.each_with_index.none? do |b, j|
181
200
  next if i == j
182
201
  a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
183
- large.all?{ |c| interfere[a, c] == interfere[b, c] }
202
+ a[ll] >= b[ll] && a[ll] + a[ww] <= b[ll] + b[ww] &&
203
+ nodes.all?{ |c| interfere[a, c] == interfere[b, c] }
184
204
  end
185
205
  end
186
206
  logger.info "not nested: #{rest.size}"
187
- # rest = rest.sample 50
207
+ File.write "#{dump}.rest1.htm", rest.extend(Dumpable).dump if dump
208
+
209
+ # 8 = max_results - 1, 3 = (from row size diff euristic)
210
+ if try_min
211
+ rest = rest.reject{ |_| _[hh] + _[hh]/3*(try_min - 1) > (rest.map{ |_| _[tt] + _[hh] }.max - rest.map(&tt).min) }
212
+ logger.info "small enough: #{rest.size}"
213
+ end
214
+ File.write "#{dump}.rest2.htm", rest.extend(Dumpable).dump if dump
188
215
 
189
- # adding the :area field for faster upcoming computations
190
- struct = Struct.new *large.first.members, :area
191
- rest.map!{ |i| struct.new *i.values, i.width * i.height }
216
+ rest.select! &filter
217
+ logger.info "filtered: #{rest.size}"
218
+ File.write "#{dump}.filtered.htm", rest.extend(Dumpable).dump if dump
219
+
220
+ rest.sort_by!(&:area).reverse!
221
+ File.write "#{dump}.sorted.htm", rest.extend(Dumpable).dump if dump
192
222
 
193
223
  require "pcbr"
194
224
  pcbr = PCBR.new
195
225
  is = []
196
- max, past = 0, []
226
+ max, past = 0, Set.new
197
227
  prev = nil
198
228
  time = Time.now
199
229
  loop do
200
- rest.each_with_index do |node, i|
201
- next if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
230
+ si = (0...rest.size).reject do |i|
231
+ # I don't shrink pcbr so this should be a safe optimization
232
+ next true if is.last > i unless is.empty?
233
+ # also we've sorted from large to small so it does not get stuck with the half of the page below the largest node
234
+
235
+ next (logger.debug [i, 2]; true) if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
236
+ next (logger.debug [i, 3]; true) if is.any?{ |j| rest[i][ww] > rest[j][ww] * 2 } if heuristics.include? :WIDTH
237
+ next (logger.debug [i, 4]; true) if is.any?{ |j| rest[j][ww] > rest[i][ww] * 2 } if heuristics.include? :WIDTH
238
+ next (logger.debug [i, 5]; true) if is.any?{ |j| rest[i][hh] > rest[j][hh] * 3 }
239
+ next (logger.debug [i, 6]; true) if is.any?{ |j| rest[j][hh] > rest[i][hh] * 3 }
240
+ end
241
+ logger.debug [is, si]
242
+ si.each do |i|
202
243
  sol = rest.values_at *is, i
244
+ unless pcbr.set.include? [*is, i].sort
245
+ logger.debug [is, i, sol.map(&:area).reduce(:+)]
203
246
  pcbr.store [*is, i].sort, [
204
247
  *( is.size if heuristics.include? :SIZE ),
205
- *( sol.map(&:area).inject(:+) if heuristics.include? :AREA ),
206
- *( -sol.product(sol).map{ |s1, s2| (s1.width - s2.width ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :WIDTH ),
207
- *( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
208
- *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
209
- ] unless pcbr.table.assoc [*is, i].sort
248
+ *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
249
+ # https://en.wikipedia.org/wiki/Mean_absolute_difference
250
+ *( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
251
+ *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
252
+ ]
253
+ logger.debug "pcbr.table.size: #{pcbr.table.size}"
254
+ if si.none? do |j|
255
+ next if j <= i
256
+ next true if interfere[rest[i], rest[j]]
257
+ next true if rest[i][ww] > rest[j][ww] * 2 if heuristics.include? :WIDTH
258
+ next true if rest[j][ww] > rest[i][ww] * 2 if heuristics.include? :WIDTH
259
+ next true if rest[i][hh] > rest[j][hh] * 3
260
+ next true if rest[j][hh] > rest[i][hh] * 3
261
+ end
262
+ logger.debug "forced"
263
+ break
264
+ end
265
+ end
210
266
  end
211
- if prev && Time.now - time > 1 && (Time.now - prev > (prev - time))
212
- m = pcbr.table.reject{ |i| i.first.size == 1 }.map(&:last).max
213
- break if 1 == pcbr.table.count{ |i| i.last == m } || Time.now - time > 5
267
+ if prev && Time.now - time > 5
268
+ logger.debug "check"
269
+ break logger.info "break 0" if Time.now - time > 30
270
+ break logger.info "break 1" if Time.now - prev > 10
271
+ m = pcbr.table.reject{ |i| i.first.size < 2 }.map(&:last).max
272
+ break logger.info "break 2" if Time.now - prev > (prev - time) && 1 == pcbr.table.count{ |i| i.last == m }
214
273
  end
215
- break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.inject(:+) }.max_by(&:last)
274
+ break logger.info "done" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
275
+ logger.debug "next: #{t}"
276
+ past.add (is = t.first).map{ |i| 2**i }.reduce(:+)
216
277
  if t.last > max
217
278
  prev, max = Time.now, t.last
279
+ logger.debug "new max: #{max}"
218
280
  logger.debug [Time.now - time, max, t.first]
219
281
  end
220
- past.push (is = t.first).map{ |i| 2**i }.inject(:+)
221
282
  end
222
283
  # TODO: if multiple with max score, take the max by area
223
- unless best = pcbr.table.reject{ |is,| is.size == 1 }.max_by(&:last)
224
- raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, inside: inside, nodes: nodes, large: large, rest: rest
284
+ unless best = pcbr.table.reject{ |is,| is.size < 2 }.max_by(&:last)
285
+ raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, nodes: nodes, rest: rest
225
286
  end
226
- rest.values_at(*best.first).extend(Dumpable)
287
+ pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
288
+ logger.info best
289
+ logger.info "splitted in #{best.first.size}"
290
+ rest.values_at(*best.first).sort_by(&tt).extend Dumpable
227
291
  end
228
292
 
229
- def rows *heuristics
230
- heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
231
- split heuristics, :height, :width, :top, :left
293
+ def rows heuristics, try_min: nil, dump: nil, &b
294
+ split :height, :width, :top, :left, heuristics, try_min, dump, &b
232
295
  end
233
- def cols *heuristics
234
- heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
235
- split heuristics, :width, :height, :left, :top
296
+ def cols heuristics, try_min: nil, dump: nil, &b
297
+ split :width, :height, :left, :top, heuristics, try_min, dump, &b
236
298
  end
237
299
 
300
+ def self.piles z
301
+ max = nil
302
+ result = [current = []]
303
+ z.map.with_index.sort.each do |x|
304
+ if !max || max > x[0][0]
305
+ current.push x
306
+ max = x[0][0] + x[0][1] if !max || max < x[0][0] + x[0][1]
307
+ else
308
+ result.push current = [x]
309
+ max = x[0][0] + x[0][1]
310
+ end
311
+ end
312
+ result.map{ |_| _.map &:last }
313
+ end
314
+
315
+ module Gridable
316
+ def rows
317
+ Module.nesting[1].piles(map{ |n| [n.top, n.height] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
318
+ end
319
+ def cols
320
+ Module.nesting[1].piles(map{ |n| [n.left, n.width] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
321
+ end
322
+ end
323
+
324
+ def grid dump = nil
325
+ logger = Module.nesting.first.logger
326
+
327
+ all = recognize
328
+ logger.info "all nodes: #{all.size}"
329
+ File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump
330
+
331
+ # adding the fields for faster upcoming computations
332
+ struct = Struct.new *all.first.members, :midx, :midy
333
+ all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top * i.height / 2.0 }
334
+ all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse
335
+
336
+ rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
337
+ inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
338
+ raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
339
+ logger.info "inside nodes: #{inside.size}"
340
+ File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
341
+ good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
342
+ logger.info "good and unique: #{good.size}" # only those that might be containers
343
+ File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump
344
+
345
+ # large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
346
+ # logger.info "large enough: #{large.size}"
347
+
348
+ interfere = lambda do |a, b|
349
+ a.top < b.top + b.height &&
350
+ b.top < a.top + a.height &&
351
+ a.left < b.left + b.width &&
352
+ b.left < a.left + a.width
353
+ end
354
+
355
+ rest = good.select.with_index do |a, i|
356
+ good.each_with_index.none? do |b, j|
357
+ next if i == j
358
+ a.top >= b.top && a.top + a.height <= b.top + b.height &&
359
+ a.left >= b.left && a.left + a.width <= b.left + b.width &&
360
+ good.all?{ |c| interfere[a, c] == interfere[b, c] }
361
+ end
362
+ end
363
+ logger.info "not nested: #{rest.size}"
364
+ File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
365
+ begin
366
+ prev = rest.size
367
+ rest.select!.with_index do |a, i|
368
+ rest.each_with_index.any? do |b, j|
369
+ cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
370
+ i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
371
+ end and
372
+ rest.each_with_index.any? do |b, j|
373
+ ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
374
+ i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
375
+ end
376
+ end
377
+ end until prev == rest.size
378
+ logger.info "gridable: #{rest.size}"
379
+ File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump
380
+
381
+ require "pcbr"
382
+ pcbr = PCBR.new
383
+ max, past = 0, []
384
+ prev = nil
385
+ prev_max = nil
386
+ time = Time.now
387
+ heuristics = %i{ SIZE AREA }
388
+ inter = lambda do |a1, a2, b1, b2|
389
+ c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
390
+ [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
391
+ end
392
+ lp = lambda do |is|
393
+ past.push is.map{ |i| 2**i }.reduce(:+)
394
+ rest.size.times do |ij|
395
+ next if ij <= is.last unless is.empty?
396
+ sorted = is + [ij]
397
+ next if pcbr.set.include? sorted
398
+ next if is.any?{ |j| interfere[rest[ij], rest[j]] }
399
+ sol = rest.values_at *sorted
400
+ xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
401
+ yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
402
+ next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
403
+ pcbr.store sorted, [
404
+ *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
405
+ xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
406
+ yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
407
+ ]
408
+ if prev && Time.now - time > 3
409
+ logger.debug "check"
410
+ break logger.info "break 0" if Time.now - time > 30
411
+ break logger.info "break 1" if Time.now - prev > 10
412
+ m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
413
+ break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
414
+ end
415
+
416
+ break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
417
+ logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
418
+ if t.last > max && t.first != prev_max
419
+ prev, max, prev_max = Time.now, t.last, t.first
420
+ logger.debug [pcbr.table.size, max, t.first]
421
+ end
422
+ lp.call t.first
423
+ end
424
+ end
425
+ lp.call []
426
+ # TODO: if multiple with max score, take the max by area
427
+ pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
428
+ rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
429
+ end
238
430
 
239
431
  end
240
432
 
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "pagerecognizer"
3
- spec.version = "0.0.1"
3
+ spec.version = "0.1.0"
4
4
  spec.summary = "visual HTML page structure recognizer"
5
5
 
6
6
  spec.author = "Victor Maslov aka Nakilon"
@@ -8,16 +8,15 @@ Gem::Specification.new do |spec|
8
8
  spec.license = "MIT"
9
9
  spec.metadata = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}
10
10
 
11
- spec.add_dependency "nokogiri"
12
- spec.add_dependency "pcbr"
13
11
  spec.add_dependency "ferrum"
12
+ spec.add_dependency "nokogiri"
13
+ spec.add_dependency "pcbr", "~>0.4.2"
14
14
  spec.add_development_dependency "minitest"
15
15
 
16
16
  spec.add_development_dependency "ruby-prof"
17
17
  spec.add_development_dependency "byebug"
18
18
  spec.add_development_dependency "mll"
19
19
 
20
- spec.require_path = "lib"
21
20
  spec.test_file = "test.rb"
22
21
  spec.files = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
23
22
  end
data/test.rb CHANGED
@@ -1,28 +1,72 @@
1
1
  require "minitest/autorun"
2
+
2
3
  require "ferrum"
3
4
  require_relative "lib/pagerecognizer"
4
- Ferrum::Node.include PageRecognizer
5
+ PageRecognizer.logger.level = :INFO
5
6
 
6
7
  describe PageRecognizer do
7
- it "google" do
8
- browser = Ferrum::Browser.new **(ENV.has_key?("FERRUM_NO_SANDBOX") ? {browser_options: {"no-sandbox": nil}} : {})
9
- browser.goto "about:blank"
10
- browser.execute "document.write(#{File.read("google.htm").inspect})"
11
- results = browser.at_css("body").rows
12
- width = results.group_by(&:width).max_by{ |w, g| g.size }.first
13
- assert_equal [
14
- ["https://www.ruby-lang.org/ru/", "Ruby это... динамический язык программирования с о"],
15
- ["https://ru.wikibooks.org/wiki/Ruby", "Этот учебник намерен осветить все тонкости програм"],
16
- ["https://habr.com/ru/post/433672/", "19 дек. 2018 г. - Взрывной рост интереса к Ruby ос"],
17
- ["https://habr.com/ru/hub/ruby/", "Ruby (англ. Ruby — «Рубин») — динамический, рефлек"],
18
- ["https://web-creator.ru/articles/ruby", "Ruby разрабатывался на Linux, но работает на многи"],
19
- ["http://rusrails.ru/", "Ruby on Rails руководства, учебники, статьи на рус"],
20
- ["https://vc.ru/dev/72391-pochemu-my-vybiraem-ruby-d", "20 июн. 2019 г. - Ruby on Rails одним из первых на"],
21
- ["https://tproger.ru/tag/ruby/", "Django или Ruby on Rails: какой фреймворк выбрать?"],
22
- ["https://rubyrussia.club/", "Главная российская конференция о Ruby. Расширяем г"]
23
- ], results.select{ |r| r.width == width }.map(&:node).map(&:rows).map{ |link, desc| [
24
- link.node.at_css("a").property("href")[0,50],
25
- desc.node.text[0,50],
8
+ before do
9
+ options = {}
10
+ options[:browser_options] = {"no-sandbox": nil} if ENV.has_key? "FERRUM_NO_SANDBOX"
11
+ options[:headless] = false if ENV.has_key? "HEADFULL"
12
+ @browser = Ferrum::Browser.new **options
13
+ end
14
+ after do
15
+ @browser&.quit
16
+ end
17
+ [
18
+ ["google1.htm", [
19
+ ["https://ru.wikipedia.org/wiki/Ruby#:~:te", "Ruby Википедия"],
20
+ ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
21
+ ["https://ru.wikibooks.org/wiki/Ruby", "Ruby Викиучебник"],
22
+ ["https://habr.com/ru/post/433672/", "Пацаны, так Ruby умер или нет? / Хабр - Habr"],
23
+ ["https://habr.com/ru/hub/ruby/", "Ruby Динамический высокоуровневый язык..."],
24
+ ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
25
+ ["http://rusrails.ru/", "Rusrails: Ruby on Rails по-русски"],
26
+ ["https://vc.ru/dev/72391-pochemu-my-vybir", "Почему мы выбираем Ruby для наших проектов..."],
27
+ ["https://tproger.ru/tag/ruby/", "Ruby — всё по этой теме для программистов..."],
28
+ ["https://rubyrussia.club/", "RubyRussia"],
29
+ ] ],
30
+ ["google2.mht", [
31
+ ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
32
+ ["https://ru.wikipedia.org/wiki/Ruby", "Ruby - Википедия"],
33
+ ["https://evrone.ru/why-ruby", "5 причин, почему мы выбираем Ruby - evrone.ru"],
34
+ ["https://habr.com/ru/hub/ruby/", "Ruby — Динамический высокоуровневый язык..."],
35
+ ["https://ru.wikibooks.org/wiki/Ruby", "Ruby - Викиучебник"],
36
+ ["https://context.reverso.net/%D0%BF%D0%B5", "ruby - Перевод на русский - примеры английский..."],
37
+ ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
38
+ ["https://ru.hexlet.io/courses/ruby", "Введение в Ruby - Хекслет"],
39
+ ["https://rubyrush.ru/articles/what-is-rub", "Что такое Ruby on Rails?"],
40
+ ] ],
41
+ ].each do |filename, expectation|
42
+ it "google rows #{filename}" do
43
+ @browser.goto "file://#{File.expand_path filename}"
44
+ results = @browser.at_css("body").rows([:AREA, :SIZE], try_min: 9) do |node|
45
+ texts = node.texts
46
+ next if texts.none?{ |_, _, color, | :black == color }
47
+ _, group = texts.group_by{ |_, style, | style["fontSize"].to_i }.to_a.max_by(&:first)
48
+ next unless group
49
+ next unless group.size == 1 && %i{ blue navy }.include?(group[0][2])
50
+ true
51
+ end
52
+ assert_equal expectation, results.reject{ |_| _.node.at_css "img" }.map{ |result| [
53
+ result.node.at_css("a").property("href")[0,40],
54
+ result.texts.max_by{ |_, style, | style["fontStyle"].to_i }[0].sub(/(.{40}) .+/, "\\1..."),
26
55
  ] }
56
+ end
57
+ end
58
+ [
59
+ ["youtube.htm", %w{ Главная В\ тренде Подписки Библиотека История }, 8],
60
+ ["youtube2.mht", %w{ Главная Навигатор Shorts Подписки Библиотека История }, 10],
61
+ ].each do |filename, expected_navigation, rows|
62
+ it "youtube rows grid #{filename}" do
63
+ @browser.goto "file://#{File.expand_path filename}"
64
+ assert_equal expected_navigation, @browser.at_css("ytd-mini-guide-renderer").rows([:AREA, :SIZE]){ |_| !_.node.text.strip.empty? }.map{ |nav| nav.texts.first[0] }
65
+ grid = @browser.at_css("#content").grid
66
+ assert_equal 3*rows, grid.size
67
+ assert_equal [3]*rows, grid.rows.map(&:size)
68
+ assert_equal [rows]*3, grid.cols.map(&:size)
69
+ grid.each{ |n| n.to_h.values_at(:width, :height).each{ |_| assert_in_delta 250, _, 50 } }
70
+ end
27
71
  end
28
72
  end
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pagerecognizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Maslov aka Nakilon
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-24 00:00:00.000000000 Z
11
+ date: 2022-05-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: ferrum
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
@@ -25,7 +25,7 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: pcbr
28
+ name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -39,19 +39,19 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: ferrum
42
+ name: pcbr
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: 0.4.2
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: 0.4.2
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: minitest
57
57
  requirement: !ruby/object:Gem::Requirement