pagerecognizer 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pagerecognizer.rb +317 -125
- data/pagerecognizer.gemspec +3 -4
- data/test.rb +64 -20
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17ef706811d7513a3f7f6a109feacb59bbae91dc
|
4
|
+
data.tar.gz: e6890fcd6c6bfdd6d042513f02dea280e5c436ae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74fc2c48871a01192e4ebbd80f603a170b1f953f61252f8bf21627bc3abb8555beb12572ab786754f6f015870bdaa6cec54daab23d15068c7aa0152b70c612aa
|
7
|
+
data.tar.gz: 3e6a080c6740075bab1c111127249ade7429ac844fcf1c0b99979c7a535cd5013cf9bd86629957ee7bcd4467828d023831a51df078037dff5cfc777c6a81be9e
|
data/lib/pagerecognizer.rb
CHANGED
@@ -4,14 +4,16 @@ module PageRecognizer
|
|
4
4
|
end
|
5
5
|
require "logger"
|
6
6
|
self.logger = Logger.new STDOUT
|
7
|
+
self.logger.formatter = ->(severity, datetime, progname, msg){ "#{datetime.strftime "%H%M%S"} #{severity.to_s[0]} #{msg}\n" }
|
8
|
+
self.logger.level = ENV.fetch("LOGLEVEL_PageRecognizer", "FATAL").to_sym
|
7
9
|
|
8
10
|
module Dumpable
|
9
11
|
def dump
|
10
|
-
"<html><body>#{
|
12
|
+
"<html><body style='white-space: nowrap'>#{
|
11
13
|
map.with_index do |n, i|
|
12
|
-
"<div style='position: absolute; background-color: hsla(#{
|
14
|
+
"<div id='#{i}' style='position: absolute; background-color: hsla(#{
|
13
15
|
360 * i / size
|
14
|
-
},100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{
|
16
|
+
},100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{i} #{
|
15
17
|
n.node.tag_name.upcase
|
16
18
|
}</div>"
|
17
19
|
end.join
|
@@ -29,92 +31,110 @@ module PageRecognizer
|
|
29
31
|
end.extend Dumpable
|
30
32
|
end
|
31
33
|
|
32
|
-
def
|
34
|
+
# Converts an RGB colour into HSV.
#
# r, g, b - the three channels; the arithmetic scales each by 255, so values
#           in 0..255 map onto 0.0..1.0.
#
# Returns [hue, saturation, value]: hue in degrees (0.0 for greys, where the
# channels are all equal), saturation and value as fractions of 1.
# Formula taken from http://stackoverflow.com/q/41926874/322020
def self.rgb2hsv r, g, b
  red, green, blue = [r, g, b].map{ |channel| channel.fdiv 255 }
  lowest, highest = [red, green, blue].minmax
  spread = highest - lowest   # chroma
  # position on the colour wheel in units of 60 degrees, depending on which channel dominates
  sector =
    if spread.zero?
      0
    elsif highest == red
      ((green - blue) / spread) % 6
    elsif highest == green
      ((blue - red) / spread + 2) % 6
    elsif highest == blue
      ((red - green) / spread + 4) % 6
    else
      0 % 6
    end
  saturation = spread.zero? ? 0.0 : spread / highest
  [60.0 * sector, saturation, highest]
end
|
50
|
+
# Squared distance between two HSV colours.
#
# h1, s1, v1 - first colour; h2, s2, v2 - second colour. The formulas divide by
# 256 and turn the hue difference into an angle via 360 / 256, i.e. every
# component is treated as being on a 0...256 scale.
#
# The first colour is placed at (0, chroma1) and the second one is rotated
# around the axis by the hue difference; the third coordinate is derived from
# value and chroma. See
# https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
#
# Returns a Float; the square root is not taken, so it is only good for comparisons.
def self.dist h1, s1, v1, h2, s2, v2
  chroma1 = s1 * v1 / 256.0
  chroma2 = s2 * v2 / 256.0
  height1 = v1 * (2 - chroma1 / 256)
  height2 = v2 * (2 - chroma2 / 256)
  # hue difference as an angle in radians
  angle = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
  dx = Math.sin(angle) * chroma2
  dy = chroma1 - Math.cos(angle) * chroma2
  dz = height1 - height2
  dx * dx + dy * dy + dz * dz
end
|
59
|
+
|
60
|
+
private def recognize
|
33
61
|
logger = Module.nesting.first.logger
|
62
|
+
logger.info "method #{__method__}..."
|
34
63
|
|
35
64
|
nodes = []
|
36
65
|
try = lambda do
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
rect.left += scrollX;
|
58
|
-
return [ [
|
59
|
-
rect.top, rect.left, rect.width, rect.height, clickable, node
|
60
|
-
] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
|
61
|
-
};
|
62
|
-
return _tap(f(node), function(){ scrollTo(x, y) });
|
63
|
-
} )(arguments[0])"
|
64
|
-
str = Struct.new :top, :left, :width, :height, :clickable, :node
|
65
|
-
nodes = page.evaluate(code, self).map{ |s| str.new *s }
|
66
|
-
nodes.size == prev.size
|
67
|
-
end
|
66
|
+
str = Struct.new :node, :visible, :top, :left, :width, :height, :area do
|
67
|
+
def texts
|
68
|
+
node.page.evaluate(<<~HEREDOC, node).map(&JSON.method(:load)).map do |text, rect1, rect2, style|
|
69
|
+
(function(node){
|
70
|
+
let result = [], range = document.createRange();
|
71
|
+
for (
|
72
|
+
let iterator = document.evaluate('.//text()', node, null, XPathResult.ANY_TYPE, null);
|
73
|
+
text = iterator.iterateNext();
|
74
|
+
) {
|
75
|
+
range.selectNode(text);
|
76
|
+
result.push(JSON.stringify( [
|
77
|
+
text.wholeText,
|
78
|
+
range.getBoundingClientRect(),
|
79
|
+
text.parentNode.getBoundingClientRect(),
|
80
|
+
getComputedStyle(text.parentNode),
|
81
|
+
] ));
|
82
|
+
}
|
83
|
+
return result;
|
84
|
+
})(arguments[0])
|
85
|
+
HEREDOC
|
68
86
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
).until &try
|
73
|
-
else
|
74
|
-
t = Time.now
|
75
|
-
until try.call
|
76
|
-
fail "number of DOM elements didn't stop to change" if Time.now > t + 5
|
77
|
-
end
|
78
|
-
end
|
79
|
-
logger.info "#{nodes.size} DOM nodes found"
|
87
|
+
# google SERP has 1x1 nodes with text _<>
|
88
|
+
next if rect1["width"] < 2 || rect1["height"] < 2
|
89
|
+
next if rect2["width"] < 2 || rect2["height"] < 2
|
80
90
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
+
color = style["color"]
|
92
|
+
fail color unless /\Argba?\((?<red>\d+), (?<green>\d+), (?<blue>\d+)(, 0(\.\d+)?)?\)\z/ =~ color
|
93
|
+
closest_color = { # https://en.wikipedia.org/wiki/Web_colors#Basic_colors
|
94
|
+
white: [0, 0, 100],
|
95
|
+
silver: [0, 0, 75],
|
96
|
+
gray: [0, 0, 50],
|
97
|
+
black: [0, 0, 0],
|
98
|
+
red: [0, 100, 100],
|
99
|
+
maroon: [0, 100, 50],
|
100
|
+
yellow: [60, 100, 100],
|
101
|
+
olive: [60, 100, 50],
|
102
|
+
lime: [120, 100, 100],
|
103
|
+
green: [120, 100, 50],
|
104
|
+
aqua: [180, 100, 100],
|
105
|
+
teal: [180, 100, 50],
|
106
|
+
blue: [240, 100, 100],
|
107
|
+
navy: [240, 100, 50],
|
108
|
+
fuchsia: [300, 100, 100],
|
109
|
+
purple: [300, 100, 50],
|
110
|
+
}.to_a.min_by do |_, (h1, s1, v1)|
|
111
|
+
h2, s2, v2 = PageRecognizer.rgb2hsv(red.to_i, green.to_i, blue.to_i)
|
112
|
+
PageRecognizer.dist h1*255/360, s1*256/100, v1*256/100, h2*255/360, s2*255, v2*255
|
113
|
+
end.first
|
114
|
+
[text, style, closest_color, rect1]
|
115
|
+
end.compact
|
116
|
+
end
|
91
117
|
end
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
};
|
113
|
-
return _tap(f(node), function(){ scrollTo(x, y) });
|
114
|
-
} )(arguments[0])"
|
115
|
-
str = Struct.new :node, :top, :left, :width, :height
|
116
|
-
nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
|
117
|
-
nodes.size == prev.size
|
118
|
+
prev = nodes.size
|
119
|
+
t = page.evaluate(<<~HEREDOC, self)
|
120
|
+
( function(node) {
|
121
|
+
var x = scrollX, y = scrollY;
|
122
|
+
var _tap = function(x, f){ f(); return x };
|
123
|
+
var f = function(node) {
|
124
|
+
node.scrollIntoView();
|
125
|
+
var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
|
126
|
+
rect.top += scrollY;
|
127
|
+
rect.left += scrollX;
|
128
|
+
return [
|
129
|
+
node, JSON.stringify([rect.top, rect.left, rect.width, rect.height]), ("visible" == getComputedStyle(node).visibility)
|
130
|
+
].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
|
131
|
+
};
|
132
|
+
return _tap(f(node), function(){ scrollTo(x, y) });
|
133
|
+
} )(arguments[0])
|
134
|
+
HEREDOC
|
135
|
+
logger.debug [t.size / 3, prev]
|
136
|
+
nodes = t.each_slice(3).map{ |node, rect, visible| str.new(node, visible, *JSON.load(rect)).tap{ |_| _.area = _.width * _.height } }
|
137
|
+
nodes.size == prev
|
118
138
|
end
|
119
139
|
|
120
140
|
if defined? Selenium::WebDriver::Wait
|
@@ -128,9 +148,9 @@ module PageRecognizer
|
|
128
148
|
end
|
129
149
|
end
|
130
150
|
logger.info "#{nodes.size} DOM nodes found"
|
131
|
-
|
132
|
-
|
133
|
-
nodes
|
151
|
+
nodes.reject!{ |_| _.height.zero? || _.width.zero? || !_.visible }
|
152
|
+
logger.info "visible nodes: #{nodes.size}"
|
153
|
+
nodes.extend Dumpable
|
134
154
|
end
|
135
155
|
|
136
156
|
logging_error = Class.new RuntimeError do
|
@@ -143,8 +163,9 @@ module PageRecognizer
|
|
143
163
|
end
|
144
164
|
class ErrorNotEnoughNodes < logging_error ; end
|
145
165
|
|
146
|
-
private def split
|
166
|
+
private def split hh, ww, tt, ll, heuristics, try_min, dump, &filter
|
147
167
|
logger = Module.nesting.first.logger
|
168
|
+
logger.info heuristics
|
148
169
|
|
149
170
|
unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
|
150
171
|
->(&b){ b.call }
|
@@ -159,82 +180,253 @@ module PageRecognizer
|
|
159
180
|
end
|
160
181
|
end
|
161
182
|
end
|
162
|
-
all = unstale.call do recognize_more end.sort_by(&tt)
|
163
|
-
logger.info "all nodes: #{all.size}"
|
164
|
-
rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
|
165
|
-
inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
|
166
|
-
raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
|
167
|
-
logger.info "inside nodes: #{inside.size}"
|
168
|
-
nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
|
169
|
-
logger.info "good nodes: #{nodes.size}" # only those that might be containers
|
170
183
|
|
171
|
-
|
172
|
-
|
184
|
+
nodes = unstale.call do recognize end.sort_by{ |_| [_[tt], _[ll]] }
|
185
|
+
File.write "#{dump}.all.htm", nodes.extend(Dumpable).dump if dump
|
186
|
+
|
187
|
+
|
188
|
+
nodes = unstale.call do nodes.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name } end.uniq{ |_| [_[hh], _[ww], _[tt], _[ll]] }
|
189
|
+
logger.info "good and unique: #{nodes.size}" # only those that might be containers
|
190
|
+
File.write "#{dump}.nodes.htm", nodes.extend(Dumpable).dump if dump
|
173
191
|
|
174
192
|
interfere = lambda do |a, b|
|
175
193
|
a[tt] < b[tt] + b[hh] &&
|
176
194
|
b[tt] < a[tt] + a[hh]
|
177
195
|
end
|
178
196
|
|
179
|
-
|
180
|
-
|
197
|
+
|
198
|
+
rest = nodes.select.with_index do |a, i|
|
199
|
+
nodes.each_with_index.none? do |b, j|
|
181
200
|
next if i == j
|
182
201
|
a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
|
183
|
-
|
202
|
+
a[ll] >= b[ll] && a[ll] + a[ww] <= b[ll] + b[ww] &&
|
203
|
+
nodes.all?{ |c| interfere[a, c] == interfere[b, c] }
|
184
204
|
end
|
185
205
|
end
|
186
206
|
logger.info "not nested: #{rest.size}"
|
187
|
-
# rest
|
207
|
+
File.write "#{dump}.rest1.htm", rest.extend(Dumpable).dump if dump
|
208
|
+
|
209
|
+
# 8 = max_results - 1, 3 = (from row size diff euristic)
|
210
|
+
if try_min
|
211
|
+
rest = rest.reject{ |_| _[hh] + _[hh]/3*(try_min - 1) > (rest.map{ |_| _[tt] + _[hh] }.max - rest.map(&tt).min) }
|
212
|
+
logger.info "small enough: #{rest.size}"
|
213
|
+
end
|
214
|
+
File.write "#{dump}.rest2.htm", rest.extend(Dumpable).dump if dump
|
188
215
|
|
189
|
-
|
190
|
-
|
191
|
-
|
216
|
+
rest.select! &filter
|
217
|
+
logger.info "filtered: #{rest.size}"
|
218
|
+
File.write "#{dump}.filtered.htm", rest.extend(Dumpable).dump if dump
|
219
|
+
|
220
|
+
rest.sort_by!(&:area).reverse!
|
221
|
+
File.write "#{dump}.sorted.htm", rest.extend(Dumpable).dump if dump
|
192
222
|
|
193
223
|
require "pcbr"
|
194
224
|
pcbr = PCBR.new
|
195
225
|
is = []
|
196
|
-
max, past = 0,
|
226
|
+
max, past = 0, Set.new
|
197
227
|
prev = nil
|
198
228
|
time = Time.now
|
199
229
|
loop do
|
200
|
-
rest.
|
201
|
-
|
230
|
+
si = (0...rest.size).reject do |i|
|
231
|
+
# I don't shrink pcbr so this should be a safe optimization
|
232
|
+
next true if is.last > i unless is.empty?
|
233
|
+
# also we've sorted from large to small so it does not get stuck with the half of the page below the largest node
|
234
|
+
|
235
|
+
next (logger.debug [i, 2]; true) if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
|
236
|
+
next (logger.debug [i, 3]; true) if is.any?{ |j| rest[i][ww] > rest[j][ww] * 2 } if heuristics.include? :WIDTH
|
237
|
+
next (logger.debug [i, 4]; true) if is.any?{ |j| rest[j][ww] > rest[i][ww] * 2 } if heuristics.include? :WIDTH
|
238
|
+
next (logger.debug [i, 5]; true) if is.any?{ |j| rest[i][hh] > rest[j][hh] * 3 }
|
239
|
+
next (logger.debug [i, 6]; true) if is.any?{ |j| rest[j][hh] > rest[i][hh] * 3 }
|
240
|
+
end
|
241
|
+
logger.debug [is, si]
|
242
|
+
si.each do |i|
|
202
243
|
sol = rest.values_at *is, i
|
244
|
+
unless pcbr.set.include? [*is, i].sort
|
245
|
+
logger.debug [is, i, sol.map(&:area).reduce(:+)]
|
203
246
|
pcbr.store [*is, i].sort, [
|
204
247
|
*( is.size if heuristics.include? :SIZE ),
|
205
|
-
*( sol.map(&:area).
|
206
|
-
|
207
|
-
*( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.
|
208
|
-
*( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.
|
209
|
-
]
|
248
|
+
*( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
|
249
|
+
# https://en.wikipedia.org/wiki/Mean_absolute_difference
|
250
|
+
*( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
|
251
|
+
*( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
|
252
|
+
]
|
253
|
+
logger.debug "pcbr.table.size: #{pcbr.table.size}"
|
254
|
+
if si.none? do |j|
|
255
|
+
next if j <= i
|
256
|
+
next true if interfere[rest[i], rest[j]]
|
257
|
+
next true if rest[i][ww] > rest[j][ww] * 2 if heuristics.include? :WIDTH
|
258
|
+
next true if rest[j][ww] > rest[i][ww] * 2 if heuristics.include? :WIDTH
|
259
|
+
next true if rest[i][hh] > rest[j][hh] * 3
|
260
|
+
next true if rest[j][hh] > rest[i][hh] * 3
|
261
|
+
end
|
262
|
+
logger.debug "forced"
|
263
|
+
break
|
264
|
+
end
|
265
|
+
end
|
210
266
|
end
|
211
|
-
if prev && Time.now - time >
|
212
|
-
|
213
|
-
break
|
267
|
+
if prev && Time.now - time > 5
|
268
|
+
logger.debug "check"
|
269
|
+
break logger.info "break 0" if Time.now - time > 30
|
270
|
+
break logger.info "break 1" if Time.now - prev > 10
|
271
|
+
m = pcbr.table.reject{ |i| i.first.size < 2 }.map(&:last).max
|
272
|
+
break logger.info "break 2" if Time.now - prev > (prev - time) && 1 == pcbr.table.count{ |i| i.last == m }
|
214
273
|
end
|
215
|
-
break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.
|
274
|
+
break logger.info "done" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
|
275
|
+
logger.debug "next: #{t}"
|
276
|
+
past.add (is = t.first).map{ |i| 2**i }.reduce(:+)
|
216
277
|
if t.last > max
|
217
278
|
prev, max = Time.now, t.last
|
279
|
+
logger.debug "new max: #{max}"
|
218
280
|
logger.debug [Time.now - time, max, t.first]
|
219
281
|
end
|
220
|
-
past.push (is = t.first).map{ |i| 2**i }.inject(:+)
|
221
282
|
end
|
222
283
|
# TODO: if multiple with max score, take the max by area
|
223
|
-
unless best = pcbr.table.reject{ |is,| is.size
|
224
|
-
raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all,
|
284
|
+
unless best = pcbr.table.reject{ |is,| is.size < 2 }.max_by(&:last)
|
285
|
+
raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, nodes: nodes, rest: rest
|
225
286
|
end
|
226
|
-
|
287
|
+
pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
|
288
|
+
logger.info best
|
289
|
+
logger.info "splitted in #{best.first.size}"
|
290
|
+
rest.values_at(*best.first).sort_by(&tt).extend Dumpable
|
227
291
|
end
|
228
292
|
|
229
|
-
def rows
|
230
|
-
|
231
|
-
split heuristics, :height, :width, :top, :left
|
293
|
+
# Splits the node into vertically stacked rows by delegating to #split with
# height/top as the main axis and width/left as the secondary one.
#
# heuristics - collection of scoring flags; #split looks for :SIZE, :AREA,
#              :HEIGHT, :MIDDLE and :WIDTH in it
# try_min    - optional number; when given, #split discards candidates that
#              are too tall relative to the total span
# dump       - optional path prefix; when given, #split writes "*.htm" debug
#              snapshots of every stage
# b          - optional filter block, #split keeps the candidates it accepts
#
# Returns whatever #split returns (the chosen nodes sorted by top).
def rows(heuristics, try_min: nil, dump: nil, &b)
  split(:height, :width, :top, :left, heuristics, try_min, dump, &b)
end
|
233
|
-
def cols
|
234
|
-
|
235
|
-
split heuristics, :width, :height, :left, :top
|
296
|
+
# Splits the node into side-by-side columns by delegating to #split with
# width/left as the main axis and height/top as the secondary one.
#
# heuristics - collection of scoring flags; #split looks for :SIZE, :AREA,
#              :HEIGHT, :MIDDLE and :WIDTH in it
# try_min    - optional number; when given, #split discards candidates that
#              are too wide relative to the total span
# dump       - optional path prefix; when given, #split writes "*.htm" debug
#              snapshots of every stage
# b          - optional filter block, #split keeps the candidates it accepts
#
# Returns whatever #split returns (the chosen nodes sorted by left).
def cols(heuristics, try_min: nil, dump: nil, &b)
  split(:width, :height, :left, :top, heuristics, try_min, dump, &b)
end
|
237
299
|
|
300
|
+
# Groups one-dimensional segments into "piles" of transitively overlapping ones.
#
# z - collection of [start, length] pairs.
#
# The segments are walked in sorted order; a segment joins the current pile
# when it starts strictly before the furthest end seen so far, otherwise it
# opens a new pile (segments that merely touch are not merged).
#
# Returns an Array of piles, each one an Array of indices into z.
# For an empty z the result is [[]].
def self.piles z
  frontier = nil   # furthest end among the segments of the current pile
  groups = [[]]
  z.each_with_index.sort.each do |(start, length), index|
    finish = start + length
    if frontier.nil? || frontier > start
      groups.last.push index
      frontier = finish if frontier.nil? || frontier < finish
    else
      groups.push [index]
      frontier = finish
    end
  end
  groups
end
|
314
|
+
|
315
|
+
# Mixin for a collection of recognized nodes (#grid extends its result with it)
# that regroups the nodes into rows or columns.
module Gridable
  # Returns the nodes grouped by overlapping vertical extent (top/height);
  # every group is extended with Dumpable.
  def rows
    host = Module.nesting[1]   # the module this mixin is nested in
    spans = map{ |n| [n.top, n.height] }
    host.piles(spans).map{ |indices| values_at(*indices).extend host::Dumpable }
  end

  # Returns the nodes grouped by overlapping horizontal extent (left/width);
  # every group is extended with Dumpable.
  def cols
    host = Module.nesting[1]   # the module this mixin is nested in
    spans = map{ |n| [n.left, n.width] }
    host.piles(spans).map{ |indices| values_at(*indices).extend host::Dumpable }
  end
end
|
323
|
+
|
324
|
+
# Finds the set of descendant nodes that looks most like a two-dimensional grid.
#
# dump - optional path prefix; when given, an "*.htm" snapshot of every
#        filtering stage is written next to it.
#
# Stages: recognize visible nodes, keep those inside this node's bounding
# rect, drop non-container tags and duplicates by geometry, drop nested
# nodes, keep only nodes that have both a column-mate and a row-mate, then
# search for the best scoring non-overlapping subset with PCBR.
#
# Returns the chosen nodes as an Array extended with Dumpable and Gridable.
# Raises ErrorNotEnoughNodes when there are no nodes to work with or when no
# candidate set could be scored.
def grid dump = nil
  logger = Module.nesting.first.logger

  all = recognize
  logger.info "all nodes: #{all.size}"
  File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump
  # without this guard `all.first.members` below would fail with NoMethodError on nil
  raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: all if all.empty?

  # adding the fields for faster upcoming computations
  struct = Struct.new *all.first.members, :midx, :midy
  # midy is the vertical middle: top + half of the height (it used to multiply instead of adding)
  all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top + i.height / 2.0 }
  all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse

  rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
  inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
  raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
  logger.info "inside nodes: #{inside.size}"
  File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
  good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
  logger.info "good and unique: #{good.size}"   # only those that might be containers
  File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump

  # large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
  # logger.info "large enough: #{large.size}"

  # true when the two rectangles overlap on both axes
  interfere = lambda do |a, b|
    a.top < b.top + b.height &&
    b.top < a.top + a.height &&
    a.left < b.left + b.width &&
    b.left < a.left + a.width
  end

  # drop a node when it lies within another one that overlaps exactly the same set of nodes
  rest = good.select.with_index do |a, i|
    good.each_with_index.none? do |b, j|
      next if i == j
      a.top >= b.top && a.top + a.height <= b.top + b.height &&
      a.left >= b.left && a.left + a.width <= b.left + b.width &&
      good.all?{ |c| interfere[a, c] == interfere[b, c] }
    end
  end
  logger.info "not nested: #{rest.size}"
  File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
  # repeat until stable: keep a node only if some other non-overlapping node shares
  # more than 90% of its horizontal extent and some other shares more than 90% of its vertical extent
  begin
    prev = rest.size
    rest.select!.with_index do |a, i|
      rest.each_with_index.any? do |b, j|
        cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
        i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
      end and
      rest.each_with_index.any? do |b, j|
        ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
        i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
      end
    end
  end until prev == rest.size
  logger.info "gridable: #{rest.size}"
  File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump

  require "pcbr"
  pcbr = PCBR.new
  max, past = 0, []   # past holds the bitmasks of the index sets that were already expanded
  prev = nil          # time of the last score improvement
  prev_max = nil      # index set that produced it
  time = Time.now
  heuristics = %i{ SIZE AREA }
  # product of the shares of two segments (start, length) covered by their intersection
  inter = lambda do |a1, a2, b1, b2|
    c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
    [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
  end
  # expands the index set `is` by one more node and recurses into the best set not expanded yet
  lp = lambda do |is|
    past.push is.map{ |i| 2**i }.reduce(:+)
    rest.size.times do |ij|
      next if ij <= is.last unless is.empty?
      sorted = is + [ij]
      next if pcbr.set.include? sorted
      next if is.any?{ |j| interfere[rest[ij], rest[j]] }
      sol = rest.values_at *sorted
      xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
      yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
      # two nodes can't share both a column and a row
      next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
      pcbr.store sorted, [
        *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
        xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
        yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
      ]
      # time based stop conditions, checked only after the first improvement was seen
      if prev && Time.now - time > 3
        logger.debug "check"
        break logger.info "break 0" if Time.now - time > 30
        break logger.info "break 1" if Time.now - prev > 10
        m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
        break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
      end

      break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
      logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
      if t.last > max && t.first != prev_max
        prev, max, prev_max = Time.now, t.last, t.first
        logger.debug [pcbr.table.size, max, t.first]
      end
      lp.call t.first
    end
  end
  lp.call []
  # TODO: if multiple with max score, take the max by area
  pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
  best = pcbr.table.max_by(&:last)
  # nothing was scored (e.g. no node survived the filters) -- fail the same way #split does
  raise ErrorNotEnoughNodes.new "failed to build a grid in <#{tag_name}>", all: all, inside: inside, rest: rest unless best
  rest.values_at(*best.first).extend Dumpable, Gridable
end
|
238
430
|
|
239
431
|
end
|
240
432
|
|
data/pagerecognizer.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "pagerecognizer"
|
3
|
-
spec.version = "0.0
|
3
|
+
spec.version = "0.1.0"
|
4
4
|
spec.summary = "visual HTML page structure recognizer"
|
5
5
|
|
6
6
|
spec.author = "Victor Maslov aka Nakilon"
|
@@ -8,16 +8,15 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.license = "MIT"
|
9
9
|
spec.metadata = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}
|
10
10
|
|
11
|
-
spec.add_dependency "nokogiri"
|
12
|
-
spec.add_dependency "pcbr"
|
13
11
|
spec.add_dependency "ferrum"
|
12
|
+
spec.add_dependency "nokogiri"
|
13
|
+
spec.add_dependency "pcbr", "~>0.4.2"
|
14
14
|
spec.add_development_dependency "minitest"
|
15
15
|
|
16
16
|
spec.add_development_dependency "ruby-prof"
|
17
17
|
spec.add_development_dependency "byebug"
|
18
18
|
spec.add_development_dependency "mll"
|
19
19
|
|
20
|
-
spec.require_path = "lib"
|
21
20
|
spec.test_file = "test.rb"
|
22
21
|
spec.files = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
|
23
22
|
end
|
data/test.rb
CHANGED
@@ -1,28 +1,72 @@
|
|
1
1
|
require "minitest/autorun"
|
2
|
+
|
2
3
|
require "ferrum"
|
3
4
|
require_relative "lib/pagerecognizer"
|
4
|
-
|
5
|
+
PageRecognizer.logger.level = :INFO
|
5
6
|
|
6
7
|
describe PageRecognizer do
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
["
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
8
|
+
before do
|
9
|
+
options = {}
|
10
|
+
options[:browser_options] = {"no-sandbox": nil} if ENV.has_key? "FERRUM_NO_SANDBOX"
|
11
|
+
options[:headless] = false if ENV.has_key? "HEADFULL"
|
12
|
+
@browser = Ferrum::Browser.new **options
|
13
|
+
end
|
14
|
+
after do
|
15
|
+
@browser&.quit
|
16
|
+
end
|
17
|
+
[
|
18
|
+
["google1.htm", [
|
19
|
+
["https://ru.wikipedia.org/wiki/Ruby#:~:te", "Ruby — Википедия"],
|
20
|
+
["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
|
21
|
+
["https://ru.wikibooks.org/wiki/Ruby", "Ruby — Викиучебник"],
|
22
|
+
["https://habr.com/ru/post/433672/", "Пацаны, так Ruby умер или нет? / Хабр - Habr"],
|
23
|
+
["https://habr.com/ru/hub/ruby/", "Ruby – Динамический высокоуровневый язык..."],
|
24
|
+
["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
|
25
|
+
["http://rusrails.ru/", "Rusrails: Ruby on Rails по-русски"],
|
26
|
+
["https://vc.ru/dev/72391-pochemu-my-vybir", "Почему мы выбираем Ruby для наших проектов..."],
|
27
|
+
["https://tproger.ru/tag/ruby/", "Ruby — всё по этой теме для программистов..."],
|
28
|
+
["https://rubyrussia.club/", "RubyRussia"],
|
29
|
+
] ],
|
30
|
+
["google2.mht", [
|
31
|
+
["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
|
32
|
+
["https://ru.wikipedia.org/wiki/Ruby", "Ruby - Википедия"],
|
33
|
+
["https://evrone.ru/why-ruby", "5 причин, почему мы выбираем Ruby - evrone.ru"],
|
34
|
+
["https://habr.com/ru/hub/ruby/", "Ruby — Динамический высокоуровневый язык..."],
|
35
|
+
["https://ru.wikibooks.org/wiki/Ruby", "Ruby - Викиучебник"],
|
36
|
+
["https://context.reverso.net/%D0%BF%D0%B5", "ruby - Перевод на русский - примеры английский..."],
|
37
|
+
["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
|
38
|
+
["https://ru.hexlet.io/courses/ruby", "Введение в Ruby - Хекслет"],
|
39
|
+
["https://rubyrush.ru/articles/what-is-rub", "Что такое Ruby on Rails?"],
|
40
|
+
] ],
|
41
|
+
].each do |filename, expectation|
|
42
|
+
it "google rows #{filename}" do
|
43
|
+
@browser.goto "file://#{File.expand_path filename}"
|
44
|
+
results = @browser.at_css("body").rows([:AREA, :SIZE], try_min: 9) do |node|
|
45
|
+
texts = node.texts
|
46
|
+
next if texts.none?{ |_, _, color, | :black == color }
|
47
|
+
_, group = texts.group_by{ |_, style, | style["fontSize"].to_i }.to_a.max_by(&:first)
|
48
|
+
next unless group
|
49
|
+
next unless group.size == 1 && %i{ blue navy }.include?(group[0][2])
|
50
|
+
true
|
51
|
+
end
|
52
|
+
assert_equal expectation, results.reject{ |_| _.node.at_css "img" }.map{ |result| [
|
53
|
+
result.node.at_css("a").property("href")[0,40],
|
54
|
+
result.texts.max_by{ |_, style, | style["fontStyle"].to_i }[0].sub(/(.{40}) .+/, "\\1..."),
|
26
55
|
] }
|
56
|
+
end
|
57
|
+
end
|
58
|
+
[
|
59
|
+
["youtube.htm", %w{ Главная В\ тренде Подписки Библиотека История }, 8],
|
60
|
+
["youtube2.mht", %w{ Главная Навигатор Shorts Подписки Библиотека История }, 10],
|
61
|
+
].each do |filename, expected_navigation, rows|
|
62
|
+
it "youtube rows grid #{filename}" do
|
63
|
+
@browser.goto "file://#{File.expand_path filename}"
|
64
|
+
assert_equal expected_navigation, @browser.at_css("ytd-mini-guide-renderer").rows([:AREA, :SIZE]){ |_| !_.node.text.strip.empty? }.map{ |nav| nav.texts.first[0] }
|
65
|
+
grid = @browser.at_css("#content").grid
|
66
|
+
assert_equal 3*rows, grid.size
|
67
|
+
assert_equal [3]*rows, grid.rows.map(&:size)
|
68
|
+
assert_equal [rows]*3, grid.cols.map(&:size)
|
69
|
+
grid.each{ |n| n.to_h.values_at(:width, :height).each{ |_| assert_in_delta 250, _, 50 } }
|
70
|
+
end
|
27
71
|
end
|
28
72
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pagerecognizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Maslov aka Nakilon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: ferrum
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,19 +39,19 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: pcbr
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.4.2
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.4.2
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|