pagerecognizer 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 61a30cc5e39e171b8eabdf26490475ce3fb041b9
4
- data.tar.gz: e64d2e3700730de8e9d3fe4d4de57f4743ca98b9
3
+ metadata.gz: 17ef706811d7513a3f7f6a109feacb59bbae91dc
4
+ data.tar.gz: e6890fcd6c6bfdd6d042513f02dea280e5c436ae
5
5
  SHA512:
6
- metadata.gz: cd4370f97135ac3df6376c2df1dcbb39cd25a35fac0ad172d29fa86764d8fb82d7ec28da7d4aaf0a547f1684cdda34f2d4de1b14de5c66ed784bb798643e96cd
7
- data.tar.gz: e517075c5eb9d4efdb5bc851865136776df7dda1eacaecb96e3d89f211c63c6a5429b0ee4286280d150f7ea338662ceebdc0f9b0039d7df8a6a2778b349629f6
6
+ metadata.gz: 74fc2c48871a01192e4ebbd80f603a170b1f953f61252f8bf21627bc3abb8555beb12572ab786754f6f015870bdaa6cec54daab23d15068c7aa0152b70c612aa
7
+ data.tar.gz: 3e6a080c6740075bab1c111127249ade7429ac844fcf1c0b99979c7a535cd5013cf9bd86629957ee7bcd4467828d023831a51df078037dff5cfc777c6a81be9e
@@ -4,14 +4,16 @@ module PageRecognizer
4
4
  end
5
5
  require "logger"
6
6
  self.logger = Logger.new STDOUT
7
+ self.logger.formatter = ->(severity, datetime, progname, msg){ "#{datetime.strftime "%H%M%S"} #{severity.to_s[0]} #{msg}\n" }
8
+ self.logger.level = ENV.fetch("LOGLEVEL_PageRecognizer", "FATAL").to_sym
7
9
 
8
10
  module Dumpable
9
11
  def dump
10
- "<html><body>#{
12
+ "<html><body style='white-space: nowrap'>#{
11
13
  map.with_index do |n, i|
12
- "<div style='position: absolute; background-color: hsla(#{
14
+ "<div id='#{i}' style='position: absolute; background-color: hsla(#{
13
15
  360 * i / size
14
- },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{
16
+ },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{i} #{
15
17
  n.node.tag_name.upcase
16
18
  }</div>"
17
19
  end.join
@@ -29,92 +31,110 @@ module PageRecognizer
29
31
  end.extend Dumpable
30
32
  end
31
33
 
32
- def recognize
34
+ def self.rgb2hsv r, g, b # [<256, <256, <256]
35
+ # http://stackoverflow.com/q/41926874/322020
36
+ r, g, b = [r, g, b].map{ |_| _.fdiv 255 }
37
+ min, max = [r, g, b].minmax
38
+ chroma = max - min
39
+ [
40
+ 60.0 * ( chroma.zero? ? 0 : case max
41
+ when r ; (g - b) / chroma
42
+ when g ; (b - r) / chroma + 2
43
+ when b ; (r - g) / chroma + 4
44
+ else 0
45
+ end % 6 ),
46
+ chroma.zero? ? 0.0 : chroma / max,
47
+ max,
48
+ ] # [<=360, <=1, <=1]
49
+ end
50
+ def self.dist h1, s1, v1, h2, s2, v2 # [<256, <256, <256]
51
+ # https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
52
+ c1, c2 = s1 * v1 / 256.0, s2 * v2 / 256.0 # chroma
53
+ z1, z2 = v1 * (2 - c1 / 256), v2 * (2 - c2 / 256)
54
+ a = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
55
+ x2 = Math::sin(a) * c2
56
+ y1, y2 = c1, Math::cos(a) * c2
57
+ x2*x2 + (y1-y2)*(y1-y2) + (z1-z2)*(z1-z2)
58
+ end
59
+
60
+ private def recognize
33
61
  logger = Module.nesting.first.logger
62
+ logger.info "method #{__method__}..."
34
63
 
35
64
  nodes = []
36
65
  try = lambda do
37
- prev = nodes
38
- code = "( function(node) {
39
- var x = scrollX, y = scrollY;
40
- var _tap = function(x, f){ f(); return x };
41
- var f = function(node) {
42
- node.scrollIntoView();
43
- var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
44
- var child_nodes = Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 });
45
- var clickable;
46
- if (node.nodeName == 'svg') {
47
- var states = child_nodes.map( function(n){
48
- return _tap(n.style ? n.style.display : '', function(){ n.style.display = 'none' } );
49
- } );
50
- clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
51
- var _zip = function(a, b){ return a.map( function(e, i) { return [e, b[i]] } ) };
52
- _zip(child_nodes, states).forEach( function(_){ _[0].style.display = _[1] } );
53
- } else {
54
- clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
55
- };
56
- rect.top += scrollY;
57
- rect.left += scrollX;
58
- return [ [
59
- rect.top, rect.left, rect.width, rect.height, clickable, node
60
- ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
61
- };
62
- return _tap(f(node), function(){ scrollTo(x, y) });
63
- } )(arguments[0])"
64
- str = Struct.new :top, :left, :width, :height, :clickable, :node
65
- nodes = page.evaluate(code, self).map{ |s| str.new *s }
66
- nodes.size == prev.size
67
- end
66
+ str = Struct.new :node, :visible, :top, :left, :width, :height, :area do
67
+ def texts
68
+ node.page.evaluate(<<~HEREDOC, node).map(&JSON.method(:load)).map do |text, rect1, rect2, style|
69
+ (function(node){
70
+ let result = [], range = document.createRange();
71
+ for (
72
+ let iterator = document.evaluate('.//text()', node, null, XPathResult.ANY_TYPE, null);
73
+ text = iterator.iterateNext();
74
+ ) {
75
+ range.selectNode(text);
76
+ result.push(JSON.stringify( [
77
+ text.wholeText,
78
+ range.getBoundingClientRect(),
79
+ text.parentNode.getBoundingClientRect(),
80
+ getComputedStyle(text.parentNode),
81
+ ] ));
82
+ }
83
+ return result;
84
+ })(arguments[0])
85
+ HEREDOC
68
86
 
69
- if defined? Selenium::WebDriver::Wait
70
- Selenium::WebDriver::Wait.new(
71
- message: "number of DOM elements didn't stop to change"
72
- ).until &try
73
- else
74
- t = Time.now
75
- until try.call
76
- fail "number of DOM elements didn't stop to change" if Time.now > t + 5
77
- end
78
- end
79
- logger.info "#{nodes.size} DOM nodes found"
87
+ # google SERP has 1x1 nodes with text _<>
88
+ next if rect1["width"] < 2 || rect1["height"] < 2
89
+ next if rect2["width"] < 2 || rect2["height"] < 2
80
90
 
81
- nodes.select! &:clickable
82
- nodes.reject do |n|
83
- nodes.any? do |nn|
84
- cs = [
85
- nn.top <=> n.top,
86
- nn.left <=> n.left,
87
- n.left + n.width <=> nn.left + nn.width,
88
- n.top + n.height <=> nn.top + nn.height,
89
- ]
90
- cs.include?(1) && !cs.include?(-1)
91
+ color = style["color"]
92
+ fail color unless /\Argba?\((?<red>\d+), (?<green>\d+), (?<blue>\d+)(, 0(\.\d+)?)?\)\z/ =~ color
93
+ closest_color = { # https://en.wikipedia.org/wiki/Web_colors#Basic_colors
94
+ white: [0, 0, 100],
95
+ silver: [0, 0, 75],
96
+ gray: [0, 0, 50],
97
+ black: [0, 0, 0],
98
+ red: [0, 100, 100],
99
+ maroon: [0, 100, 50],
100
+ yellow: [60, 100, 100],
101
+ olive: [60, 100, 50],
102
+ lime: [120, 100, 100],
103
+ green: [120, 100, 50],
104
+ aqua: [180, 100, 100],
105
+ teal: [180, 100, 50],
106
+ blue: [240, 100, 100],
107
+ navy: [240, 100, 50],
108
+ fuchsia: [300, 100, 100],
109
+ purple: [300, 100, 50],
110
+ }.to_a.min_by do |_, (h1, s1, v1)|
111
+ h2, s2, v2 = PageRecognizer.rgb2hsv(red.to_i, green.to_i, blue.to_i)
112
+ PageRecognizer.dist h1*255/360, s1*256/100, v1*256/100, h2*255/360, s2*255, v2*255
113
+ end.first
114
+ [text, style, closest_color, rect1]
115
+ end.compact
116
+ end
91
117
  end
92
- end.extend Dumpable
93
- end
94
-
95
- private def recognize_more
96
- logger = Module.nesting.first.logger
97
-
98
- nodes = []
99
- try = lambda do
100
- prev = nodes
101
- code = "( function(node) {
102
- var x = scrollX, y = scrollY;
103
- var _tap = function(x, f){ f(); return x };
104
- var f = function(node) {
105
- node.scrollIntoView();
106
- var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
107
- rect.top += scrollY;
108
- rect.left += scrollX;
109
- return [ [
110
- node, JSON.stringify([rect.top, rect.left, rect.width, rect.height])
111
- ] ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
112
- };
113
- return _tap(f(node), function(){ scrollTo(x, y) });
114
- } )(arguments[0])"
115
- str = Struct.new :node, :top, :left, :width, :height
116
- nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
117
- nodes.size == prev.size
118
+ prev = nodes.size
119
+ t = page.evaluate(<<~HEREDOC, self)
120
+ ( function(node) {
121
+ var x = scrollX, y = scrollY;
122
+ var _tap = function(x, f){ f(); return x };
123
+ var f = function(node) {
124
+ node.scrollIntoView();
125
+ var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
126
+ rect.top += scrollY;
127
+ rect.left += scrollX;
128
+ return [
129
+ node, JSON.stringify([rect.top, rect.left, rect.width, rect.height]), ("visible" == getComputedStyle(node).visibility)
130
+ ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
131
+ };
132
+ return _tap(f(node), function(){ scrollTo(x, y) });
133
+ } )(arguments[0])
134
+ HEREDOC
135
+ logger.debug [t.size / 3, prev]
136
+ nodes = t.each_slice(3).map{ |node, rect, visible| str.new(node, visible, *JSON.load(rect)).tap{ |_| _.area = _.width * _.height } }
137
+ nodes.size == prev
118
138
  end
119
139
 
120
140
  if defined? Selenium::WebDriver::Wait
@@ -128,9 +148,9 @@ module PageRecognizer
128
148
  end
129
149
  end
130
150
  logger.info "#{nodes.size} DOM nodes found"
131
-
132
- nodes.reject!{ |i| i.height.zero? || i.width.zero? }
133
- nodes
151
+ nodes.reject!{ |_| _.height.zero? || _.width.zero? || !_.visible }
152
+ logger.info "visible nodes: #{nodes.size}"
153
+ nodes.extend Dumpable
134
154
  end
135
155
 
136
156
  logging_error = Class.new RuntimeError do
@@ -143,8 +163,9 @@ module PageRecognizer
143
163
  end
144
164
  class ErrorNotEnoughNodes < logging_error ; end
145
165
 
146
- private def split heuristics, hh, ww, tt, ll
166
+ private def split hh, ww, tt, ll, heuristics, try_min, dump, &filter
147
167
  logger = Module.nesting.first.logger
168
+ logger.info heuristics
148
169
 
149
170
  unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
150
171
  ->(&b){ b.call }
@@ -159,82 +180,253 @@ module PageRecognizer
159
180
  end
160
181
  end
161
182
  end
162
- all = unstale.call do recognize_more end.sort_by(&tt)
163
- logger.info "all nodes: #{all.size}"
164
- rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
165
- inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
166
- raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
167
- logger.info "inside nodes: #{inside.size}"
168
- nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
169
- logger.info "good nodes: #{nodes.size}" # only those that might be containers
170
183
 
171
- large = nodes#.select{ |i| i[ww] > nodes.map(&ww).max / 4 }
172
- logger.info "large enough and unique: #{large.size}"
184
+ nodes = unstale.call do recognize end.sort_by{ |_| [_[tt], _[ll]] }
185
+ File.write "#{dump}.all.htm", nodes.extend(Dumpable).dump if dump
186
+
187
+
188
+ nodes = unstale.call do nodes.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name } end.uniq{ |_| [_[hh], _[ww], _[tt], _[ll]] }
189
+ logger.info "good and unique: #{nodes.size}" # only those that might be containers
190
+ File.write "#{dump}.nodes.htm", nodes.extend(Dumpable).dump if dump
173
191
 
174
192
  interfere = lambda do |a, b|
175
193
  a[tt] < b[tt] + b[hh] &&
176
194
  b[tt] < a[tt] + a[hh]
177
195
  end
178
196
 
179
- rest = large.select.with_index do |a, i|
180
- large.each_with_index.none? do |b, j|
197
+
198
+ rest = nodes.select.with_index do |a, i|
199
+ nodes.each_with_index.none? do |b, j|
181
200
  next if i == j
182
201
  a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
183
- large.all?{ |c| interfere[a, c] == interfere[b, c] }
202
+ a[ll] >= b[ll] && a[ll] + a[ww] <= b[ll] + b[ww] &&
203
+ nodes.all?{ |c| interfere[a, c] == interfere[b, c] }
184
204
  end
185
205
  end
186
206
  logger.info "not nested: #{rest.size}"
187
- # rest = rest.sample 50
207
+ File.write "#{dump}.rest1.htm", rest.extend(Dumpable).dump if dump
208
+
209
+ # 8 = max_results - 1, 3 = (from row size diff heuristic)
210
+ if try_min
211
+ rest = rest.reject{ |_| _[hh] + _[hh]/3*(try_min - 1) > (rest.map{ |_| _[tt] + _[hh] }.max - rest.map(&tt).min) }
212
+ logger.info "small enough: #{rest.size}"
213
+ end
214
+ File.write "#{dump}.rest2.htm", rest.extend(Dumpable).dump if dump
188
215
 
189
- # adding the :area field for faster upcoming computations
190
- struct = Struct.new *large.first.members, :area
191
- rest.map!{ |i| struct.new *i.values, i.width * i.height }
216
+ rest.select! &filter
217
+ logger.info "filtered: #{rest.size}"
218
+ File.write "#{dump}.filtered.htm", rest.extend(Dumpable).dump if dump
219
+
220
+ rest.sort_by!(&:area).reverse!
221
+ File.write "#{dump}.sorted.htm", rest.extend(Dumpable).dump if dump
192
222
 
193
223
  require "pcbr"
194
224
  pcbr = PCBR.new
195
225
  is = []
196
- max, past = 0, []
226
+ max, past = 0, Set.new
197
227
  prev = nil
198
228
  time = Time.now
199
229
  loop do
200
- rest.each_with_index do |node, i|
201
- next if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
230
+ si = (0...rest.size).reject do |i|
231
+ # I don't shrink pcbr so this should be a safe optimization
232
+ next true if is.last > i unless is.empty?
233
+ # also we've sorted from large to small so it does not get stuck with the half of the page below the largest node
234
+
235
+ next (logger.debug [i, 2]; true) if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
236
+ next (logger.debug [i, 3]; true) if is.any?{ |j| rest[i][ww] > rest[j][ww] * 2 } if heuristics.include? :WIDTH
237
+ next (logger.debug [i, 4]; true) if is.any?{ |j| rest[j][ww] > rest[i][ww] * 2 } if heuristics.include? :WIDTH
238
+ next (logger.debug [i, 5]; true) if is.any?{ |j| rest[i][hh] > rest[j][hh] * 3 }
239
+ next (logger.debug [i, 6]; true) if is.any?{ |j| rest[j][hh] > rest[i][hh] * 3 }
240
+ end
241
+ logger.debug [is, si]
242
+ si.each do |i|
202
243
  sol = rest.values_at *is, i
244
+ unless pcbr.set.include? [*is, i].sort
245
+ logger.debug [is, i, sol.map(&:area).reduce(:+)]
203
246
  pcbr.store [*is, i].sort, [
204
247
  *( is.size if heuristics.include? :SIZE ),
205
- *( sol.map(&:area).inject(:+) if heuristics.include? :AREA ),
206
- *( -sol.product(sol).map{ |s1, s2| (s1.width - s2.width ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :WIDTH ),
207
- *( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
208
- *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
209
- ] unless pcbr.table.assoc [*is, i].sort
248
+ *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
249
+ # https://en.wikipedia.org/wiki/Mean_absolute_difference
250
+ *( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
251
+ *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
252
+ ]
253
+ logger.debug "pcbr.table.size: #{pcbr.table.size}"
254
+ if si.none? do |j|
255
+ next if j <= i
256
+ next true if interfere[rest[i], rest[j]]
257
+ next true if rest[i][ww] > rest[j][ww] * 2 if heuristics.include? :WIDTH
258
+ next true if rest[j][ww] > rest[i][ww] * 2 if heuristics.include? :WIDTH
259
+ next true if rest[i][hh] > rest[j][hh] * 3
260
+ next true if rest[j][hh] > rest[i][hh] * 3
261
+ end
262
+ logger.debug "forced"
263
+ break
264
+ end
265
+ end
210
266
  end
211
- if prev && Time.now - time > 1 && (Time.now - prev > (prev - time))
212
- m = pcbr.table.reject{ |i| i.first.size == 1 }.map(&:last).max
213
- break if 1 == pcbr.table.count{ |i| i.last == m } || Time.now - time > 5
267
+ if prev && Time.now - time > 5
268
+ logger.debug "check"
269
+ break logger.info "break 0" if Time.now - time > 30
270
+ break logger.info "break 1" if Time.now - prev > 10
271
+ m = pcbr.table.reject{ |i| i.first.size < 2 }.map(&:last).max
272
+ break logger.info "break 2" if Time.now - prev > (prev - time) && 1 == pcbr.table.count{ |i| i.last == m }
214
273
  end
215
- break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.inject(:+) }.max_by(&:last)
274
+ break logger.info "done" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
275
+ logger.debug "next: #{t}"
276
+ past.add (is = t.first).map{ |i| 2**i }.reduce(:+)
216
277
  if t.last > max
217
278
  prev, max = Time.now, t.last
279
+ logger.debug "new max: #{max}"
218
280
  logger.debug [Time.now - time, max, t.first]
219
281
  end
220
- past.push (is = t.first).map{ |i| 2**i }.inject(:+)
221
282
  end
222
283
  # TODO: if multiple with max score, take the max by area
223
- unless best = pcbr.table.reject{ |is,| is.size == 1 }.max_by(&:last)
224
- raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, inside: inside, nodes: nodes, large: large, rest: rest
284
+ unless best = pcbr.table.reject{ |is,| is.size < 2 }.max_by(&:last)
285
+ raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", nodes: nodes, rest: rest # pass only locals that exist in this method; an undefined name here would raise NameError instead
225
286
  end
226
- rest.values_at(*best.first).extend(Dumpable)
287
+ pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
288
+ logger.info best
289
+ logger.info "splitted in #{best.first.size}"
290
+ rest.values_at(*best.first).sort_by(&tt).extend Dumpable
227
291
  end
228
292
 
229
- def rows *heuristics
230
- heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
231
- split heuristics, :height, :width, :top, :left
293
+ def rows heuristics, try_min: nil, dump: nil, &b
294
+ split :height, :width, :top, :left, heuristics, try_min, dump, &b
232
295
  end
233
- def cols *heuristics
234
- heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
235
- split heuristics, :width, :height, :left, :top
296
+ def cols heuristics, try_min: nil, dump: nil, &b
297
+ split :width, :height, :left, :top, heuristics, try_min, dump, &b
236
298
  end
237
299
 
300
+ def self.piles z
301
+ max = nil
302
+ result = [current = []]
303
+ z.map.with_index.sort.each do |x|
304
+ if !max || max > x[0][0]
305
+ current.push x
306
+ max = x[0][0] + x[0][1] if !max || max < x[0][0] + x[0][1]
307
+ else
308
+ result.push current = [x]
309
+ max = x[0][0] + x[0][1]
310
+ end
311
+ end
312
+ result.map{ |_| _.map &:last }
313
+ end
314
+
315
+ module Gridable
316
+ def rows
317
+ Module.nesting[1].piles(map{ |n| [n.top, n.height] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
318
+ end
319
+ def cols
320
+ Module.nesting[1].piles(map{ |n| [n.left, n.width] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
321
+ end
322
+ end
323
+
324
+ def grid dump = nil
325
+ logger = Module.nesting.first.logger
326
+
327
+ all = recognize
328
+ logger.info "all nodes: #{all.size}"
329
+ File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump
330
+
331
+ # adding the fields for faster upcoming computations
332
+ struct = Struct.new *all.first.members, :midx, :midy
333
+ all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top + i.height / 2.0 } # midx, midy: centre of the box on each axis (edge + half the extent)
334
+ all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse
335
+
336
+ rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
337
+ inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
338
+ raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
339
+ logger.info "inside nodes: #{inside.size}"
340
+ File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
341
+ good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
342
+ logger.info "good and unique: #{good.size}" # only those that might be containers
343
+ File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump
344
+
345
+ # large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
346
+ # logger.info "large enough: #{large.size}"
347
+
348
+ interfere = lambda do |a, b|
349
+ a.top < b.top + b.height &&
350
+ b.top < a.top + a.height &&
351
+ a.left < b.left + b.width &&
352
+ b.left < a.left + a.width
353
+ end
354
+
355
+ rest = good.select.with_index do |a, i|
356
+ good.each_with_index.none? do |b, j|
357
+ next if i == j
358
+ a.top >= b.top && a.top + a.height <= b.top + b.height &&
359
+ a.left >= b.left && a.left + a.width <= b.left + b.width &&
360
+ good.all?{ |c| interfere[a, c] == interfere[b, c] }
361
+ end
362
+ end
363
+ logger.info "not nested: #{rest.size}"
364
+ File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
365
+ begin
366
+ prev = rest.size
367
+ rest.select!.with_index do |a, i|
368
+ rest.each_with_index.any? do |b, j|
369
+ cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
370
+ i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
371
+ end and
372
+ rest.each_with_index.any? do |b, j|
373
+ ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
374
+ i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
375
+ end
376
+ end
377
+ end until prev == rest.size
378
+ logger.info "gridable: #{rest.size}"
379
+ File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump
380
+
381
+ require "pcbr"
382
+ pcbr = PCBR.new
383
+ max, past = 0, []
384
+ prev = nil
385
+ prev_max = nil
386
+ time = Time.now
387
+ heuristics = %i{ SIZE AREA }
388
+ inter = lambda do |a1, a2, b1, b2|
389
+ c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
390
+ [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
391
+ end
392
+ lp = lambda do |is|
393
+ past.push is.map{ |i| 2**i }.reduce(:+)
394
+ rest.size.times do |ij|
395
+ next if ij <= is.last unless is.empty?
396
+ sorted = is + [ij]
397
+ next if pcbr.set.include? sorted
398
+ next if is.any?{ |j| interfere[rest[ij], rest[j]] }
399
+ sol = rest.values_at *sorted
400
+ xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
401
+ yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
402
+ next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
403
+ pcbr.store sorted, [
404
+ *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
405
+ xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
406
+ yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
407
+ ]
408
+ if prev && Time.now - time > 3
409
+ logger.debug "check"
410
+ break logger.info "break 0" if Time.now - time > 30
411
+ break logger.info "break 1" if Time.now - prev > 10
412
+ m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
413
+ break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
414
+ end
415
+
416
+ break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
417
+ logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
418
+ if t.last > max && t.first != prev_max
419
+ prev, max, prev_max = Time.now, t.last, t.first
420
+ logger.debug [pcbr.table.size, max, t.first]
421
+ end
422
+ lp.call t.first
423
+ end
424
+ end
425
+ lp.call []
426
+ # TODO: if multiple with max score, take the max by area
427
+ pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
428
+ rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
429
+ end
238
430
 
239
431
  end
240
432
 
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "pagerecognizer"
3
- spec.version = "0.0.1"
3
+ spec.version = "0.1.0"
4
4
  spec.summary = "visual HTML page structure recognizer"
5
5
 
6
6
  spec.author = "Victor Maslov aka Nakilon"
@@ -8,16 +8,15 @@ Gem::Specification.new do |spec|
8
8
  spec.license = "MIT"
9
9
  spec.metadata = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}
10
10
 
11
- spec.add_dependency "nokogiri"
12
- spec.add_dependency "pcbr"
13
11
  spec.add_dependency "ferrum"
12
+ spec.add_dependency "nokogiri"
13
+ spec.add_dependency "pcbr", "~>0.4.2"
14
14
  spec.add_development_dependency "minitest"
15
15
 
16
16
  spec.add_development_dependency "ruby-prof"
17
17
  spec.add_development_dependency "byebug"
18
18
  spec.add_development_dependency "mll"
19
19
 
20
- spec.require_path = "lib"
21
20
  spec.test_file = "test.rb"
22
21
  spec.files = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
23
22
  end
data/test.rb CHANGED
@@ -1,28 +1,72 @@
1
1
  require "minitest/autorun"
2
+
2
3
  require "ferrum"
3
4
  require_relative "lib/pagerecognizer"
4
- Ferrum::Node.include PageRecognizer
5
+ PageRecognizer.logger.level = :INFO
5
6
 
6
7
  describe PageRecognizer do
7
- it "google" do
8
- browser = Ferrum::Browser.new **(ENV.has_key?("FERRUM_NO_SANDBOX") ? {browser_options: {"no-sandbox": nil}} : {})
9
- browser.goto "about:blank"
10
- browser.execute "document.write(#{File.read("google.htm").inspect})"
11
- results = browser.at_css("body").rows
12
- width = results.group_by(&:width).max_by{ |w, g| g.size }.first
13
- assert_equal [
14
- ["https://www.ruby-lang.org/ru/", "Ruby это... динамический язык программирования с о"],
15
- ["https://ru.wikibooks.org/wiki/Ruby", "Этот учебник намерен осветить все тонкости програм"],
16
- ["https://habr.com/ru/post/433672/", "19 дек. 2018 г. - Взрывной рост интереса к Ruby ос"],
17
- ["https://habr.com/ru/hub/ruby/", "Ruby (англ. Ruby — «Рубин») — динамический, рефлек"],
18
- ["https://web-creator.ru/articles/ruby", "Ruby разрабатывался на Linux, но работает на многи"],
19
- ["http://rusrails.ru/", "Ruby on Rails руководства, учебники, статьи на рус"],
20
- ["https://vc.ru/dev/72391-pochemu-my-vybiraem-ruby-d", "20 июн. 2019 г. - Ruby on Rails одним из первых на"],
21
- ["https://tproger.ru/tag/ruby/", "Django или Ruby on Rails: какой фреймворк выбрать?"],
22
- ["https://rubyrussia.club/", "Главная российская конференция о Ruby. Расширяем г"]
23
- ], results.select{ |r| r.width == width }.map(&:node).map(&:rows).map{ |link, desc| [
24
- link.node.at_css("a").property("href")[0,50],
25
- desc.node.text[0,50],
8
+ before do
9
+ options = {}
10
+ options[:browser_options] = {"no-sandbox": nil} if ENV.has_key? "FERRUM_NO_SANDBOX"
11
+ options[:headless] = false if ENV.has_key? "HEADFULL"
12
+ @browser = Ferrum::Browser.new **options
13
+ end
14
+ after do
15
+ @browser&.quit
16
+ end
17
+ [
18
+ ["google1.htm", [
19
+ ["https://ru.wikipedia.org/wiki/Ruby#:~:te", "Ruby Википедия"],
20
+ ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
21
+ ["https://ru.wikibooks.org/wiki/Ruby", "Ruby Викиучебник"],
22
+ ["https://habr.com/ru/post/433672/", "Пацаны, так Ruby умер или нет? / Хабр - Habr"],
23
+ ["https://habr.com/ru/hub/ruby/", "Ruby Динамический высокоуровневый язык..."],
24
+ ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
25
+ ["http://rusrails.ru/", "Rusrails: Ruby on Rails по-русски"],
26
+ ["https://vc.ru/dev/72391-pochemu-my-vybir", "Почему мы выбираем Ruby для наших проектов..."],
27
+ ["https://tproger.ru/tag/ruby/", "Ruby — всё по этой теме для программистов..."],
28
+ ["https://rubyrussia.club/", "RubyRussia"],
29
+ ] ],
30
+ ["google2.mht", [
31
+ ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
32
+ ["https://ru.wikipedia.org/wiki/Ruby", "Ruby - Википедия"],
33
+ ["https://evrone.ru/why-ruby", "5 причин, почему мы выбираем Ruby - evrone.ru"],
34
+ ["https://habr.com/ru/hub/ruby/", "Ruby — Динамический высокоуровневый язык..."],
35
+ ["https://ru.wikibooks.org/wiki/Ruby", "Ruby - Викиучебник"],
36
+ ["https://context.reverso.net/%D0%BF%D0%B5", "ruby - Перевод на русский - примеры английский..."],
37
+ ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
38
+ ["https://ru.hexlet.io/courses/ruby", "Введение в Ruby - Хекслет"],
39
+ ["https://rubyrush.ru/articles/what-is-rub", "Что такое Ruby on Rails?"],
40
+ ] ],
41
+ ].each do |filename, expectation|
42
+ it "google rows #{filename}" do
43
+ @browser.goto "file://#{File.expand_path filename}"
44
+ results = @browser.at_css("body").rows([:AREA, :SIZE], try_min: 9) do |node|
45
+ texts = node.texts
46
+ next if texts.none?{ |_, _, color, | :black == color }
47
+ _, group = texts.group_by{ |_, style, | style["fontSize"].to_i }.to_a.max_by(&:first)
48
+ next unless group
49
+ next unless group.size == 1 && %i{ blue navy }.include?(group[0][2])
50
+ true
51
+ end
52
+ assert_equal expectation, results.reject{ |_| _.node.at_css "img" }.map{ |result| [
53
+ result.node.at_css("a").property("href")[0,40],
54
+ result.texts.max_by{ |_, style, | style["fontStyle"].to_i }[0].sub(/(.{40}) .+/, "\\1..."),
26
55
  ] }
56
+ end
57
+ end
58
+ [
59
+ ["youtube.htm", %w{ Главная В\ тренде Подписки Библиотека История }, 8],
60
+ ["youtube2.mht", %w{ Главная Навигатор Shorts Подписки Библиотека История }, 10],
61
+ ].each do |filename, expected_navigation, rows|
62
+ it "youtube rows grid #{filename}" do
63
+ @browser.goto "file://#{File.expand_path filename}"
64
+ assert_equal expected_navigation, @browser.at_css("ytd-mini-guide-renderer").rows([:AREA, :SIZE]){ |_| !_.node.text.strip.empty? }.map{ |nav| nav.texts.first[0] }
65
+ grid = @browser.at_css("#content").grid
66
+ assert_equal 3*rows, grid.size
67
+ assert_equal [3]*rows, grid.rows.map(&:size)
68
+ assert_equal [rows]*3, grid.cols.map(&:size)
69
+ grid.each{ |n| n.to_h.values_at(:width, :height).each{ |_| assert_in_delta 250, _, 50 } }
70
+ end
27
71
  end
28
72
  end
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pagerecognizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Maslov aka Nakilon
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-24 00:00:00.000000000 Z
11
+ date: 2022-05-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: ferrum
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
@@ -25,7 +25,7 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: pcbr
28
+ name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -39,19 +39,19 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: ferrum
42
+ name: pcbr
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: 0.4.2
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: 0.4.2
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: minitest
57
57
  requirement: !ruby/object:Gem::Requirement