pagerecognizer 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pagerecognizer.rb +317 -125
- data/pagerecognizer.gemspec +3 -4
- data/test.rb +64 -20
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17ef706811d7513a3f7f6a109feacb59bbae91dc
|
4
|
+
data.tar.gz: e6890fcd6c6bfdd6d042513f02dea280e5c436ae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74fc2c48871a01192e4ebbd80f603a170b1f953f61252f8bf21627bc3abb8555beb12572ab786754f6f015870bdaa6cec54daab23d15068c7aa0152b70c612aa
|
7
|
+
data.tar.gz: 3e6a080c6740075bab1c111127249ade7429ac844fcf1c0b99979c7a535cd5013cf9bd86629957ee7bcd4467828d023831a51df078037dff5cfc777c6a81be9e
|
data/lib/pagerecognizer.rb
CHANGED
@@ -4,14 +4,16 @@ module PageRecognizer
|
|
4
4
|
end
|
5
5
|
require "logger"
|
6
6
|
self.logger = Logger.new STDOUT
|
7
|
+
self.logger.formatter = ->(severity, datetime, progname, msg){ "#{datetime.strftime "%H%M%S"} #{severity.to_s[0]} #{msg}\n" }
|
8
|
+
self.logger.level = ENV.fetch("LOGLEVEL_PageRecognizer", "FATAL").to_sym
|
7
9
|
|
8
10
|
module Dumpable
|
9
11
|
def dump
|
10
|
-
"<html><body>#{
|
12
|
+
"<html><body style='white-space: nowrap'>#{
|
11
13
|
map.with_index do |n, i|
|
12
|
-
"<div style='position: absolute; background-color: hsla(#{
|
14
|
+
"<div id='#{i}' style='position: absolute; background-color: hsla(#{
|
13
15
|
360 * i / size
|
14
|
-
},100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{
|
16
|
+
},100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{i} #{
|
15
17
|
n.node.tag_name.upcase
|
16
18
|
}</div>"
|
17
19
|
end.join
|
@@ -29,92 +31,110 @@ module PageRecognizer
|
|
29
31
|
end.extend Dumpable
|
30
32
|
end
|
31
33
|
|
32
|
-
def
|
34
|
+
def self.rgb2hsv r, g, b # [<256, <256, <256]
|
35
|
+
# http://stackoverflow.com/q/41926874/322020
|
36
|
+
r, g, b = [r, g, b].map{ |_| _.fdiv 255 }
|
37
|
+
min, max = [r, g, b].minmax
|
38
|
+
chroma = max - min
|
39
|
+
[
|
40
|
+
60.0 * ( chroma.zero? ? 0 : case max
|
41
|
+
when r ; (g - b) / chroma
|
42
|
+
when g ; (b - r) / chroma + 2
|
43
|
+
when b ; (r - g) / chroma + 4
|
44
|
+
else 0
|
45
|
+
end % 6 ),
|
46
|
+
chroma.zero? ? 0.0 : chroma / max,
|
47
|
+
max,
|
48
|
+
] # [<=360, <=1, <=1]
|
49
|
+
end
|
50
|
+
def self.dist h1, s1, v1, h2, s2, v2 # [<256, <256, <256]
|
51
|
+
# https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
|
52
|
+
c1, c2 = s1 * v1 / 256.0, s2 * v2 / 256.0 # chroma
|
53
|
+
z1, z2 = v1 * (2 - c1 / 256), v2 * (2 - c2 / 256)
|
54
|
+
a = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
|
55
|
+
x2 = Math::sin(a) * c2
|
56
|
+
y1, y2 = c1, Math::cos(a) * c2
|
57
|
+
x2*x2 + (y1-y2)*(y1-y2) + (z1-z2)*(z1-z2)
|
58
|
+
end
|
59
|
+
|
60
|
+
private def recognize
|
33
61
|
logger = Module.nesting.first.logger
|
62
|
+
logger.info "method #{__method__}..."
|
34
63
|
|
35
64
|
nodes = []
|
36
65
|
try = lambda do
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
rect.left += scrollX;
|
58
|
-
return [ [
|
59
|
-
rect.top, rect.left, rect.width, rect.height, clickable, node
|
60
|
-
] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
|
61
|
-
};
|
62
|
-
return _tap(f(node), function(){ scrollTo(x, y) });
|
63
|
-
} )(arguments[0])"
|
64
|
-
str = Struct.new :top, :left, :width, :height, :clickable, :node
|
65
|
-
nodes = page.evaluate(code, self).map{ |s| str.new *s }
|
66
|
-
nodes.size == prev.size
|
67
|
-
end
|
66
|
+
str = Struct.new :node, :visible, :top, :left, :width, :height, :area do
|
67
|
+
def texts
|
68
|
+
node.page.evaluate(<<~HEREDOC, node).map(&JSON.method(:load)).map do |text, rect1, rect2, style|
|
69
|
+
(function(node){
|
70
|
+
let result = [], range = document.createRange();
|
71
|
+
for (
|
72
|
+
let iterator = document.evaluate('.//text()', node, null, XPathResult.ANY_TYPE, null);
|
73
|
+
text = iterator.iterateNext();
|
74
|
+
) {
|
75
|
+
range.selectNode(text);
|
76
|
+
result.push(JSON.stringify( [
|
77
|
+
text.wholeText,
|
78
|
+
range.getBoundingClientRect(),
|
79
|
+
text.parentNode.getBoundingClientRect(),
|
80
|
+
getComputedStyle(text.parentNode),
|
81
|
+
] ));
|
82
|
+
}
|
83
|
+
return result;
|
84
|
+
})(arguments[0])
|
85
|
+
HEREDOC
|
68
86
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
).until &try
|
73
|
-
else
|
74
|
-
t = Time.now
|
75
|
-
until try.call
|
76
|
-
fail "number of DOM elements didn't stop to change" if Time.now > t + 5
|
77
|
-
end
|
78
|
-
end
|
79
|
-
logger.info "#{nodes.size} DOM nodes found"
|
87
|
+
# google SERP has 1x1 nodes with text _<>
|
88
|
+
next if rect1["width"] < 2 || rect1["height"] < 2
|
89
|
+
next if rect2["width"] < 2 || rect2["height"] < 2
|
80
90
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
+
color = style["color"]
|
92
|
+
fail color unless /\Argba?\((?<red>\d+), (?<green>\d+), (?<blue>\d+)(, 0(\.\d+)?)?\)\z/ =~ color
|
93
|
+
closest_color = { # https://en.wikipedia.org/wiki/Web_colors#Basic_colors
|
94
|
+
white: [0, 0, 100],
|
95
|
+
silver: [0, 0, 75],
|
96
|
+
gray: [0, 0, 50],
|
97
|
+
black: [0, 0, 0],
|
98
|
+
red: [0, 100, 100],
|
99
|
+
maroon: [0, 100, 50],
|
100
|
+
yellow: [60, 100, 100],
|
101
|
+
olive: [60, 100, 50],
|
102
|
+
lime: [120, 100, 100],
|
103
|
+
green: [120, 100, 50],
|
104
|
+
aqua: [180, 100, 100],
|
105
|
+
teal: [180, 100, 50],
|
106
|
+
blue: [240, 100, 100],
|
107
|
+
navy: [240, 100, 50],
|
108
|
+
fuchsia: [300, 100, 100],
|
109
|
+
purple: [300, 100, 50],
|
110
|
+
}.to_a.min_by do |_, (h1, s1, v1)|
|
111
|
+
h2, s2, v2 = PageRecognizer.rgb2hsv(red.to_i, green.to_i, blue.to_i)
|
112
|
+
PageRecognizer.dist h1*255/360, s1*256/100, v1*256/100, h2*255/360, s2*255, v2*255
|
113
|
+
end.first
|
114
|
+
[text, style, closest_color, rect1]
|
115
|
+
end.compact
|
116
|
+
end
|
91
117
|
end
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
};
|
113
|
-
return _tap(f(node), function(){ scrollTo(x, y) });
|
114
|
-
} )(arguments[0])"
|
115
|
-
str = Struct.new :node, :top, :left, :width, :height
|
116
|
-
nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
|
117
|
-
nodes.size == prev.size
|
118
|
+
prev = nodes.size
|
119
|
+
t = page.evaluate(<<~HEREDOC, self)
|
120
|
+
( function(node) {
|
121
|
+
var x = scrollX, y = scrollY;
|
122
|
+
var _tap = function(x, f){ f(); return x };
|
123
|
+
var f = function(node) {
|
124
|
+
node.scrollIntoView();
|
125
|
+
var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
|
126
|
+
rect.top += scrollY;
|
127
|
+
rect.left += scrollX;
|
128
|
+
return [
|
129
|
+
node, JSON.stringify([rect.top, rect.left, rect.width, rect.height]), ("visible" == getComputedStyle(node).visibility)
|
130
|
+
].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
|
131
|
+
};
|
132
|
+
return _tap(f(node), function(){ scrollTo(x, y) });
|
133
|
+
} )(arguments[0])
|
134
|
+
HEREDOC
|
135
|
+
logger.debug [t.size / 3, prev]
|
136
|
+
nodes = t.each_slice(3).map{ |node, rect, visible| str.new(node, visible, *JSON.load(rect)).tap{ |_| _.area = _.width * _.height } }
|
137
|
+
nodes.size == prev
|
118
138
|
end
|
119
139
|
|
120
140
|
if defined? Selenium::WebDriver::Wait
|
@@ -128,9 +148,9 @@ module PageRecognizer
|
|
128
148
|
end
|
129
149
|
end
|
130
150
|
logger.info "#{nodes.size} DOM nodes found"
|
131
|
-
|
132
|
-
|
133
|
-
nodes
|
151
|
+
nodes.reject!{ |_| _.height.zero? || _.width.zero? || !_.visible }
|
152
|
+
logger.info "visible nodes: #{nodes.size}"
|
153
|
+
nodes.extend Dumpable
|
134
154
|
end
|
135
155
|
|
136
156
|
logging_error = Class.new RuntimeError do
|
@@ -143,8 +163,9 @@ module PageRecognizer
|
|
143
163
|
end
|
144
164
|
class ErrorNotEnoughNodes < logging_error ; end
|
145
165
|
|
146
|
-
private def split
|
166
|
+
private def split hh, ww, tt, ll, heuristics, try_min, dump, &filter
|
147
167
|
logger = Module.nesting.first.logger
|
168
|
+
logger.info heuristics
|
148
169
|
|
149
170
|
unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
|
150
171
|
->(&b){ b.call }
|
@@ -159,82 +180,253 @@ module PageRecognizer
|
|
159
180
|
end
|
160
181
|
end
|
161
182
|
end
|
162
|
-
all = unstale.call do recognize_more end.sort_by(&tt)
|
163
|
-
logger.info "all nodes: #{all.size}"
|
164
|
-
rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
|
165
|
-
inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
|
166
|
-
raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
|
167
|
-
logger.info "inside nodes: #{inside.size}"
|
168
|
-
nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
|
169
|
-
logger.info "good nodes: #{nodes.size}" # only those that might be containers
|
170
183
|
|
171
|
-
|
172
|
-
|
184
|
+
nodes = unstale.call do recognize end.sort_by{ |_| [_[tt], _[ll]] }
|
185
|
+
File.write "#{dump}.all.htm", nodes.extend(Dumpable).dump if dump
|
186
|
+
|
187
|
+
|
188
|
+
nodes = unstale.call do nodes.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name } end.uniq{ |_| [_[hh], _[ww], _[tt], _[ll]] }
|
189
|
+
logger.info "good and unique: #{nodes.size}" # only those that might be containers
|
190
|
+
File.write "#{dump}.nodes.htm", nodes.extend(Dumpable).dump if dump
|
173
191
|
|
174
192
|
interfere = lambda do |a, b|
|
175
193
|
a[tt] < b[tt] + b[hh] &&
|
176
194
|
b[tt] < a[tt] + a[hh]
|
177
195
|
end
|
178
196
|
|
179
|
-
|
180
|
-
|
197
|
+
|
198
|
+
rest = nodes.select.with_index do |a, i|
|
199
|
+
nodes.each_with_index.none? do |b, j|
|
181
200
|
next if i == j
|
182
201
|
a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
|
183
|
-
|
202
|
+
a[ll] >= b[ll] && a[ll] + a[ww] <= b[ll] + b[ww] &&
|
203
|
+
nodes.all?{ |c| interfere[a, c] == interfere[b, c] }
|
184
204
|
end
|
185
205
|
end
|
186
206
|
logger.info "not nested: #{rest.size}"
|
187
|
-
# rest
|
207
|
+
File.write "#{dump}.rest1.htm", rest.extend(Dumpable).dump if dump
|
208
|
+
|
209
|
+
# 8 = max_results - 1, 3 = (from row size diff euristic)
|
210
|
+
if try_min
|
211
|
+
rest = rest.reject{ |_| _[hh] + _[hh]/3*(try_min - 1) > (rest.map{ |_| _[tt] + _[hh] }.max - rest.map(&tt).min) }
|
212
|
+
logger.info "small enough: #{rest.size}"
|
213
|
+
end
|
214
|
+
File.write "#{dump}.rest2.htm", rest.extend(Dumpable).dump if dump
|
188
215
|
|
189
|
-
|
190
|
-
|
191
|
-
|
216
|
+
rest.select! &filter
|
217
|
+
logger.info "filtered: #{rest.size}"
|
218
|
+
File.write "#{dump}.filtered.htm", rest.extend(Dumpable).dump if dump
|
219
|
+
|
220
|
+
rest.sort_by!(&:area).reverse!
|
221
|
+
File.write "#{dump}.sorted.htm", rest.extend(Dumpable).dump if dump
|
192
222
|
|
193
223
|
require "pcbr"
|
194
224
|
pcbr = PCBR.new
|
195
225
|
is = []
|
196
|
-
max, past = 0,
|
226
|
+
max, past = 0, Set.new
|
197
227
|
prev = nil
|
198
228
|
time = Time.now
|
199
229
|
loop do
|
200
|
-
rest.
|
201
|
-
|
230
|
+
si = (0...rest.size).reject do |i|
|
231
|
+
# I don't shrink pcbr so this should be a safe optimization
|
232
|
+
next true if is.last > i unless is.empty?
|
233
|
+
# also we've sorted from large to small so it does not get stuck with the half of the page below the largest node
|
234
|
+
|
235
|
+
next (logger.debug [i, 2]; true) if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
|
236
|
+
next (logger.debug [i, 3]; true) if is.any?{ |j| rest[i][ww] > rest[j][ww] * 2 } if heuristics.include? :WIDTH
|
237
|
+
next (logger.debug [i, 4]; true) if is.any?{ |j| rest[j][ww] > rest[i][ww] * 2 } if heuristics.include? :WIDTH
|
238
|
+
next (logger.debug [i, 5]; true) if is.any?{ |j| rest[i][hh] > rest[j][hh] * 3 }
|
239
|
+
next (logger.debug [i, 6]; true) if is.any?{ |j| rest[j][hh] > rest[i][hh] * 3 }
|
240
|
+
end
|
241
|
+
logger.debug [is, si]
|
242
|
+
si.each do |i|
|
202
243
|
sol = rest.values_at *is, i
|
244
|
+
unless pcbr.set.include? [*is, i].sort
|
245
|
+
logger.debug [is, i, sol.map(&:area).reduce(:+)]
|
203
246
|
pcbr.store [*is, i].sort, [
|
204
247
|
*( is.size if heuristics.include? :SIZE ),
|
205
|
-
*( sol.map(&:area).
|
206
|
-
|
207
|
-
*( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.
|
208
|
-
*( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.
|
209
|
-
]
|
248
|
+
*( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
|
249
|
+
# https://en.wikipedia.org/wiki/Mean_absolute_difference
|
250
|
+
*( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
|
251
|
+
*( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
|
252
|
+
]
|
253
|
+
logger.debug "pcbr.table.size: #{pcbr.table.size}"
|
254
|
+
if si.none? do |j|
|
255
|
+
next if j <= i
|
256
|
+
next true if interfere[rest[i], rest[j]]
|
257
|
+
next true if rest[i][ww] > rest[j][ww] * 2 if heuristics.include? :WIDTH
|
258
|
+
next true if rest[j][ww] > rest[i][ww] * 2 if heuristics.include? :WIDTH
|
259
|
+
next true if rest[i][hh] > rest[j][hh] * 3
|
260
|
+
next true if rest[j][hh] > rest[i][hh] * 3
|
261
|
+
end
|
262
|
+
logger.debug "forced"
|
263
|
+
break
|
264
|
+
end
|
265
|
+
end
|
210
266
|
end
|
211
|
-
if prev && Time.now - time >
|
212
|
-
|
213
|
-
break
|
267
|
+
if prev && Time.now - time > 5
|
268
|
+
logger.debug "check"
|
269
|
+
break logger.info "break 0" if Time.now - time > 30
|
270
|
+
break logger.info "break 1" if Time.now - prev > 10
|
271
|
+
m = pcbr.table.reject{ |i| i.first.size < 2 }.map(&:last).max
|
272
|
+
break logger.info "break 2" if Time.now - prev > (prev - time) && 1 == pcbr.table.count{ |i| i.last == m }
|
214
273
|
end
|
215
|
-
break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.
|
274
|
+
break logger.info "done" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
|
275
|
+
logger.debug "next: #{t}"
|
276
|
+
past.add (is = t.first).map{ |i| 2**i }.reduce(:+)
|
216
277
|
if t.last > max
|
217
278
|
prev, max = Time.now, t.last
|
279
|
+
logger.debug "new max: #{max}"
|
218
280
|
logger.debug [Time.now - time, max, t.first]
|
219
281
|
end
|
220
|
-
past.push (is = t.first).map{ |i| 2**i }.inject(:+)
|
221
282
|
end
|
222
283
|
# TODO: if multiple with max score, take the max by area
|
223
|
-
unless best = pcbr.table.reject{ |is,| is.size
|
224
|
-
raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all,
|
284
|
+
unless best = pcbr.table.reject{ |is,| is.size < 2 }.max_by(&:last)
|
285
|
+
raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, nodes: nodes, rest: rest
|
225
286
|
end
|
226
|
-
|
287
|
+
pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
|
288
|
+
logger.info best
|
289
|
+
logger.info "splitted in #{best.first.size}"
|
290
|
+
rest.values_at(*best.first).sort_by(&tt).extend Dumpable
|
227
291
|
end
|
228
292
|
|
229
|
-
def rows
|
230
|
-
|
231
|
-
split heuristics, :height, :width, :top, :left
|
293
|
+
def rows heuristics, try_min: nil, dump: nil, &b
|
294
|
+
split :height, :width, :top, :left, heuristics, try_min, dump, &b
|
232
295
|
end
|
233
|
-
def cols
|
234
|
-
|
235
|
-
split heuristics, :width, :height, :left, :top
|
296
|
+
def cols heuristics, try_min: nil, dump: nil, &b
|
297
|
+
split :width, :height, :left, :top, heuristics, try_min, dump, &b
|
236
298
|
end
|
237
299
|
|
300
|
+
def self.piles z
|
301
|
+
max = nil
|
302
|
+
result = [current = []]
|
303
|
+
z.map.with_index.sort.each do |x|
|
304
|
+
if !max || max > x[0][0]
|
305
|
+
current.push x
|
306
|
+
max = x[0][0] + x[0][1] if !max || max < x[0][0] + x[0][1]
|
307
|
+
else
|
308
|
+
result.push current = [x]
|
309
|
+
max = x[0][0] + x[0][1]
|
310
|
+
end
|
311
|
+
end
|
312
|
+
result.map{ |_| _.map &:last }
|
313
|
+
end
|
314
|
+
|
315
|
+
module Gridable
|
316
|
+
def rows
|
317
|
+
Module.nesting[1].piles(map{ |n| [n.top, n.height] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
|
318
|
+
end
|
319
|
+
def cols
|
320
|
+
Module.nesting[1].piles(map{ |n| [n.left, n.width] }).map{ |s| values_at(*s).extend Module.nesting[1]::Dumpable }
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
def grid dump = nil
|
325
|
+
logger = Module.nesting.first.logger
|
326
|
+
|
327
|
+
all = recognize
|
328
|
+
logger.info "all nodes: #{all.size}"
|
329
|
+
File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump
|
330
|
+
|
331
|
+
# adding the fields for faster upcoming computations
|
332
|
+
struct = Struct.new *all.first.members, :midx, :midy
|
333
|
+
all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top * i.height / 2.0 }
|
334
|
+
all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse
|
335
|
+
|
336
|
+
rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
|
337
|
+
inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
|
338
|
+
raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
|
339
|
+
logger.info "inside nodes: #{inside.size}"
|
340
|
+
File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
|
341
|
+
good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
|
342
|
+
logger.info "good and unique: #{good.size}" # only those that might be containers
|
343
|
+
File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump
|
344
|
+
|
345
|
+
# large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
|
346
|
+
# logger.info "large enough: #{large.size}"
|
347
|
+
|
348
|
+
interfere = lambda do |a, b|
|
349
|
+
a.top < b.top + b.height &&
|
350
|
+
b.top < a.top + a.height &&
|
351
|
+
a.left < b.left + b.width &&
|
352
|
+
b.left < a.left + a.width
|
353
|
+
end
|
354
|
+
|
355
|
+
rest = good.select.with_index do |a, i|
|
356
|
+
good.each_with_index.none? do |b, j|
|
357
|
+
next if i == j
|
358
|
+
a.top >= b.top && a.top + a.height <= b.top + b.height &&
|
359
|
+
a.left >= b.left && a.left + a.width <= b.left + b.width &&
|
360
|
+
good.all?{ |c| interfere[a, c] == interfere[b, c] }
|
361
|
+
end
|
362
|
+
end
|
363
|
+
logger.info "not nested: #{rest.size}"
|
364
|
+
File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
|
365
|
+
begin
|
366
|
+
prev = rest.size
|
367
|
+
rest.select!.with_index do |a, i|
|
368
|
+
rest.each_with_index.any? do |b, j|
|
369
|
+
cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
|
370
|
+
i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
|
371
|
+
end and
|
372
|
+
rest.each_with_index.any? do |b, j|
|
373
|
+
ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
|
374
|
+
i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
|
375
|
+
end
|
376
|
+
end
|
377
|
+
end until prev == rest.size
|
378
|
+
logger.info "gridable: #{rest.size}"
|
379
|
+
File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump
|
380
|
+
|
381
|
+
require "pcbr"
|
382
|
+
pcbr = PCBR.new
|
383
|
+
max, past = 0, []
|
384
|
+
prev = nil
|
385
|
+
prev_max = nil
|
386
|
+
time = Time.now
|
387
|
+
heuristics = %i{ SIZE AREA }
|
388
|
+
inter = lambda do |a1, a2, b1, b2|
|
389
|
+
c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
|
390
|
+
[c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
|
391
|
+
end
|
392
|
+
lp = lambda do |is|
|
393
|
+
past.push is.map{ |i| 2**i }.reduce(:+)
|
394
|
+
rest.size.times do |ij|
|
395
|
+
next if ij <= is.last unless is.empty?
|
396
|
+
sorted = is + [ij]
|
397
|
+
next if pcbr.set.include? sorted
|
398
|
+
next if is.any?{ |j| interfere[rest[ij], rest[j]] }
|
399
|
+
sol = rest.values_at *sorted
|
400
|
+
xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
|
401
|
+
yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
|
402
|
+
next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
|
403
|
+
pcbr.store sorted, [
|
404
|
+
*( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
|
405
|
+
xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
|
406
|
+
yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
|
407
|
+
]
|
408
|
+
if prev && Time.now - time > 3
|
409
|
+
logger.debug "check"
|
410
|
+
break logger.info "break 0" if Time.now - time > 30
|
411
|
+
break logger.info "break 1" if Time.now - prev > 10
|
412
|
+
m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
|
413
|
+
break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
|
414
|
+
end
|
415
|
+
|
416
|
+
break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
|
417
|
+
logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
|
418
|
+
if t.last > max && t.first != prev_max
|
419
|
+
prev, max, prev_max = Time.now, t.last, t.first
|
420
|
+
logger.debug [pcbr.table.size, max, t.first]
|
421
|
+
end
|
422
|
+
lp.call t.first
|
423
|
+
end
|
424
|
+
end
|
425
|
+
lp.call []
|
426
|
+
# TODO: if multiple with max score, take the max by area
|
427
|
+
pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
|
428
|
+
rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
|
429
|
+
end
|
238
430
|
|
239
431
|
end
|
240
432
|
|
data/pagerecognizer.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "pagerecognizer"
|
3
|
-
spec.version = "0.0
|
3
|
+
spec.version = "0.1.0"
|
4
4
|
spec.summary = "visual HTML page structure recognizer"
|
5
5
|
|
6
6
|
spec.author = "Victor Maslov aka Nakilon"
|
@@ -8,16 +8,15 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.license = "MIT"
|
9
9
|
spec.metadata = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}
|
10
10
|
|
11
|
-
spec.add_dependency "nokogiri"
|
12
|
-
spec.add_dependency "pcbr"
|
13
11
|
spec.add_dependency "ferrum"
|
12
|
+
spec.add_dependency "nokogiri"
|
13
|
+
spec.add_dependency "pcbr", "~>0.4.2"
|
14
14
|
spec.add_development_dependency "minitest"
|
15
15
|
|
16
16
|
spec.add_development_dependency "ruby-prof"
|
17
17
|
spec.add_development_dependency "byebug"
|
18
18
|
spec.add_development_dependency "mll"
|
19
19
|
|
20
|
-
spec.require_path = "lib"
|
21
20
|
spec.test_file = "test.rb"
|
22
21
|
spec.files = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
|
23
22
|
end
|
data/test.rb
CHANGED
@@ -1,28 +1,72 @@
|
|
1
1
|
require "minitest/autorun"
|
2
|
+
|
2
3
|
require "ferrum"
|
3
4
|
require_relative "lib/pagerecognizer"
|
4
|
-
|
5
|
+
PageRecognizer.logger.level = :INFO
|
5
6
|
|
6
7
|
describe PageRecognizer do
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
["
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
8
|
+
before do
|
9
|
+
options = {}
|
10
|
+
options[:browser_options] = {"no-sandbox": nil} if ENV.has_key? "FERRUM_NO_SANDBOX"
|
11
|
+
options[:headless] = false if ENV.has_key? "HEADFULL"
|
12
|
+
@browser = Ferrum::Browser.new **options
|
13
|
+
end
|
14
|
+
after do
|
15
|
+
@browser&.quit
|
16
|
+
end
|
17
|
+
[
|
18
|
+
["google1.htm", [
|
19
|
+
["https://ru.wikipedia.org/wiki/Ruby#:~:te", "Ruby — Википедия"],
|
20
|
+
["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
|
21
|
+
["https://ru.wikibooks.org/wiki/Ruby", "Ruby — Викиучебник"],
|
22
|
+
["https://habr.com/ru/post/433672/", "Пацаны, так Ruby умер или нет? / Хабр - Habr"],
|
23
|
+
["https://habr.com/ru/hub/ruby/", "Ruby – Динамический высокоуровневый язык..."],
|
24
|
+
["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
|
25
|
+
["http://rusrails.ru/", "Rusrails: Ruby on Rails по-русски"],
|
26
|
+
["https://vc.ru/dev/72391-pochemu-my-vybir", "Почему мы выбираем Ruby для наших проектов..."],
|
27
|
+
["https://tproger.ru/tag/ruby/", "Ruby — всё по этой теме для программистов..."],
|
28
|
+
["https://rubyrussia.club/", "RubyRussia"],
|
29
|
+
] ],
|
30
|
+
["google2.mht", [
|
31
|
+
["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
|
32
|
+
["https://ru.wikipedia.org/wiki/Ruby", "Ruby - Википедия"],
|
33
|
+
["https://evrone.ru/why-ruby", "5 причин, почему мы выбираем Ruby - evrone.ru"],
|
34
|
+
["https://habr.com/ru/hub/ruby/", "Ruby — Динамический высокоуровневый язык..."],
|
35
|
+
["https://ru.wikibooks.org/wiki/Ruby", "Ruby - Викиучебник"],
|
36
|
+
["https://context.reverso.net/%D0%BF%D0%B5", "ruby - Перевод на русский - примеры английский..."],
|
37
|
+
["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
|
38
|
+
["https://ru.hexlet.io/courses/ruby", "Введение в Ruby - Хекслет"],
|
39
|
+
["https://rubyrush.ru/articles/what-is-rub", "Что такое Ruby on Rails?"],
|
40
|
+
] ],
|
41
|
+
].each do |filename, expectation|
|
42
|
+
it "google rows #{filename}" do
|
43
|
+
@browser.goto "file://#{File.expand_path filename}"
|
44
|
+
results = @browser.at_css("body").rows([:AREA, :SIZE], try_min: 9) do |node|
|
45
|
+
texts = node.texts
|
46
|
+
next if texts.none?{ |_, _, color, | :black == color }
|
47
|
+
_, group = texts.group_by{ |_, style, | style["fontSize"].to_i }.to_a.max_by(&:first)
|
48
|
+
next unless group
|
49
|
+
next unless group.size == 1 && %i{ blue navy }.include?(group[0][2])
|
50
|
+
true
|
51
|
+
end
|
52
|
+
assert_equal expectation, results.reject{ |_| _.node.at_css "img" }.map{ |result| [
|
53
|
+
result.node.at_css("a").property("href")[0,40],
|
54
|
+
result.texts.max_by{ |_, style, | style["fontStyle"].to_i }[0].sub(/(.{40}) .+/, "\\1..."),
|
26
55
|
] }
|
56
|
+
end
|
57
|
+
end
|
58
|
+
[
|
59
|
+
["youtube.htm", %w{ Главная В\ тренде Подписки Библиотека История }, 8],
|
60
|
+
["youtube2.mht", %w{ Главная Навигатор Shorts Подписки Библиотека История }, 10],
|
61
|
+
].each do |filename, expected_navigation, rows|
|
62
|
+
it "youtube rows grid #{filename}" do
|
63
|
+
@browser.goto "file://#{File.expand_path filename}"
|
64
|
+
assert_equal expected_navigation, @browser.at_css("ytd-mini-guide-renderer").rows([:AREA, :SIZE]){ |_| !_.node.text.strip.empty? }.map{ |nav| nav.texts.first[0] }
|
65
|
+
grid = @browser.at_css("#content").grid
|
66
|
+
assert_equal 3*rows, grid.size
|
67
|
+
assert_equal [3]*rows, grid.rows.map(&:size)
|
68
|
+
assert_equal [rows]*3, grid.cols.map(&:size)
|
69
|
+
grid.each{ |n| n.to_h.values_at(:width, :height).each{ |_| assert_in_delta 250, _, 50 } }
|
70
|
+
end
|
27
71
|
end
|
28
72
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pagerecognizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Maslov aka Nakilon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: ferrum
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,19 +39,19 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: pcbr
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.4.2
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.4.2
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|