pagerecognizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 61a30cc5e39e171b8eabdf26490475ce3fb041b9
4
+ data.tar.gz: e64d2e3700730de8e9d3fe4d4de57f4743ca98b9
5
+ SHA512:
6
+ metadata.gz: cd4370f97135ac3df6376c2df1dcbb39cd25a35fac0ad172d29fa86764d8fb82d7ec28da7d4aaf0a547f1684cdda34f2d4de1b14de5c66ed784bb798643e96cd
7
+ data.tar.gz: e517075c5eb9d4efdb5bc851865136776df7dda1eacaecb96e3d89f211c63c6a5429b0ee4286280d150f7ea338662ceebdc0f9b0039d7df8a6a2778b349629f6
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Victor Maslov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,248 @@
1
+ module PageRecognizer
2
require "logger"

# Module-level logger used by the methods of this module. Assign another
# Logger through `logger=` to redirect or silence the output.
singleton_class.class_eval { attr_accessor :logger }
self.logger = Logger.new(STDOUT)
7
+
8
# Mixin for arrays of node-like structs (responding to #node, #top, #left,
# #width and #height) that renders them as HTML for visual inspection.
module Dumpable
  # Builds an HTML document with one absolutely positioned, half-transparent
  # <div> per element. Each <div> gets its own hue (spread evenly over the
  # list) and is labelled with the upper-cased tag name of the element's node.
  #
  # @return [String] the HTML markup
  def dump
    total = size
    boxes = each_with_index.map do |item, index|
      hue = 360 * index / total
      geometry = "top: #{item.top}; left: #{item.left}; width: #{item.width}; height: #{item.height}"
      "<div style='position: absolute; background-color: hsla(#{hue},100%,50%,0.5); #{geometry}'>#{item.node.tag_name.upcase}</div>"
    end
    "<html><body>#{boxes.join}</body></html>"
  end
end
21
+
22
# Rebuilds a node list from HTML such as the one produced by Dumpable#dump.
#
# Every <div> becomes a struct with #node, #top, #left, #width and #height.
# The geometry is read from the inline `style` attribute and converted with
# #to_f; the text of the <div> is exposed as `node.tag_name`.
#
# @param str [String] HTML markup to parse
# @return [Array] the structs, extended with Dumpable
def self.load str
  require "nokogiri"
  # Define the struct classes once per call rather than once per <div>, so
  # all returned items share one class and no throwaway classes are created.
  fake_node = Struct.new :tag_name
  item = Struct.new :node, :top, :left, :width, :height
  Nokogiri::HTML(str).css("div").map do |n|
    # A <div> without a style attribute yields zero geometry instead of
    # failing with NoMethodError on nil.
    style = n[:style].to_s.scan(/(\S+): ([^\;]+)/).to_h
    item.new fake_node.new(n.text), *style.values_at("top", "left", "width", "height").map(&:to_f)
  end.extend Dumpable
end
31
+
32
# Collects the innermost clickable elements under this node.
#
# A script is evaluated in the page (via `page.evaluate`) that walks this
# node and its descendant elements. Each element is scrolled into view, its
# bounding rectangle is converted to document coordinates, and it is marked
# clickable when `document.elementFromPoint` at the rectangle centre returns
# the element itself. For <svg> the children are hidden during that probe and
# are not descended into. The original scroll position is restored afterwards.
# The scan is repeated until two consecutive runs return the same number of
# elements.
#
# @return [Array<Struct>] structs with top, left, width, height, clickable and
#   node members, extended with Dumpable; only clickable elements whose
#   rectangle does not contain another clickable element's rectangle are kept
# @raise [RuntimeError] without Selenium, when the element count is still
#   changing after 5 seconds (with Selenium the wait is delegated to
#   Selenium::WebDriver::Wait)
def recognize
  logger = Module.nesting.first.logger

  nodes = []
  try = lambda do
    prev = nodes
    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        var child_nodes = Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 });
        var clickable;
        if (node.nodeName == 'svg') {
          var states = child_nodes.map( function(n){
            return _tap(n.style ? n.style.display : '', function(){ n.style.display = 'none' } );
          } );
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
          var _zip = function(a, b){ return a.map( function(e, i) { return [e, b[i]] } ) };
          _zip(child_nodes, states).forEach( function(_){ _[0].style.display = _[1] } );
        } else {
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
        };
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          rect.top, rect.left, rect.width, rect.height, clickable, node
        ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :top, :left, :width, :height, :clickable, :node
    nodes = page.evaluate(code, self).map{ |s| str.new(*s) }
    # stable when the element count equals the one from the previous run
    nodes.size == prev.size
  end

  if defined? Selenium::WebDriver::Wait
    Selenium::WebDriver::Wait.new(
      message: "number of DOM elements didn't stop to change"
    ).until &try
  else
    # Monotonic clock: the 5 second budget must not be affected by
    # wall-clock adjustments.
    deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 5
    until try.call
      fail "number of DOM elements didn't stop to change" if Process.clock_gettime(Process::CLOCK_MONOTONIC) > deadline
    end
  end
  logger.info "#{nodes.size} DOM nodes found"

  nodes.select! &:clickable
  # Drop every element whose rectangle contains the (different) rectangle of
  # another clickable element, leaving only the innermost ones.
  nodes.reject do |n|
    nodes.any? do |nn|
      cs = [
        nn.top <=> n.top,
        nn.left <=> n.left,
        n.left + n.width <=> nn.left + nn.width,
        n.top + n.height <=> nn.top + nn.height,
      ]
      cs.include?(1) && !cs.include?(-1)
    end
  end.extend Dumpable
end
94
+
95
# Collects this node and all of its descendant elements with their geometry.
#
# A script is evaluated in the page (via `page.evaluate`) that scrolls every
# element into view and reports it together with its bounding rectangle in
# document coordinates, serialised as a JSON array; the original scroll
# position is restored afterwards. The scan is repeated until two consecutive
# runs return the same number of elements.
#
# @return [Array<Struct>] structs with node, top, left, width and height
#   members; elements with zero width or zero height are removed
# @raise [RuntimeError] without Selenium, when the element count is still
#   changing after 10 seconds (with Selenium the wait is delegated to
#   Selenium::WebDriver::Wait)
private def recognize_more
  # JSON is used below; require it here instead of relying on another
  # library having loaded it.
  require "json"
  logger = Module.nesting.first.logger

  nodes = []
  try = lambda do
    prev = nodes
    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          node, JSON.stringify([rect.top, rect.left, rect.width, rect.height])
        ] ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :node, :top, :left, :width, :height
    # JSON.parse only builds plain data, unlike the more permissive JSON.load
    nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.parse(a) }
    # stable when the element count equals the one from the previous run
    nodes.size == prev.size
  end

  if defined? Selenium::WebDriver::Wait
    Selenium::WebDriver::Wait.new(
      message: "number of DOM elements didn't stop to change"
    ).until &try
  else
    # Monotonic clock: the 10 second budget must not be affected by
    # wall-clock adjustments.
    deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + 10
    until try.call
      fail "number of DOM elements didn't stop to change" if Process.clock_gettime(Process::CLOCK_MONOTONIC) > deadline
    end
  end
  logger.info "#{nodes.size} DOM nodes found"

  nodes.reject!{ |i| i.height.zero? || i.width.zero? }
  nodes
end
135
+
136
# Base class for the errors of this module: the message is logged at error
# level when the exception is created, and an HTML dump of every node list
# involved is kept for later inspection.
logging_error = Class.new RuntimeError do
  # @return [Hash] name => HTML dump of the array passed under that name
  attr_reader :dumps

  # @param msg [String] the error message
  # @param arrays [Hash] name => array of node structs; each array is extended
  #   with Dumpable and dumped
  def initialize msg, arrays
    Module.nesting.first.logger.error "#{self.class}: #{msg}"
    @dumps = arrays.each_with_object({}) do |(name, array), collected|
      collected[name] = array.extend(Dumpable).dump
    end
    super msg
  end
end

# Concrete error class built on the logging base class above.
class ErrorNotEnoughNodes < logging_error
end
145
+
146
# Finds the best set of non-overlapping container elements that splits this
# node along one axis. #rows and #cols are thin wrappers around it.
#
# @param heuristics [Array<Symbol>] criteria used to score a candidate set:
#   :SIZE   - size of the set being extended,
#   :AREA   - summed area of the candidate's nodes,
#   :WIDTH  - negated mean pairwise difference of node widths,
#   :HEIGHT - negated mean pairwise difference of node heights,
#   :MIDDLE - negated mean pairwise difference of node centres across the
#             split axis
# @param hh [Symbol] struct member holding the extent along the split axis
#   (:height for rows, :width for columns)
# @param ww [Symbol] struct member holding the extent across the split axis
# @param tt [Symbol] struct member holding the position along the split axis
# @param ll [Symbol] struct member holding the position across the split axis
# @return [Array<Struct>] the chosen nodes (node, top, left, width, height,
#   area), extended with Dumpable
# @raise [ErrorNotEnoughNodes] when no descendant lies inside this node, when
#   none of them may be a container, or when no candidate with more than one
#   node was found
private def split heuristics, hh, ww, tt, ll
  logger = Module.nesting.first.logger

  # With Selenium loaded, retry the given block for up to 10 seconds while it
  # raises StaleElementReferenceError; otherwise just call it.
  unstale = if defined? Selenium::WebDriver::Error::StaleElementReferenceError
    lambda do |&try|
      t = Time.now
      begin
        try.call
      rescue Selenium::WebDriver::Error::StaleElementReferenceError
        raise if Time.now > t + 10
        retry
      end
    end
  else
    ->(&b){ b.call }
  end
  all = unstale.call{ recognize_more }.sort_by(&tt)
  logger.info "all nodes: #{all.size}"
  rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
  inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
  raise ErrorNotEnoughNodes.new("no inside nodes", all: all, inside: inside) if inside.empty?
  logger.info "inside nodes: #{inside.size}"
  nodes = unstale.call{ inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } }.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
  logger.info "good nodes: #{nodes.size}"   # only those that might be containers
  # Without this guard an empty list would fail below with NoMethodError on
  # `large.first.members`.
  raise ErrorNotEnoughNodes.new("no good nodes", all: all, inside: inside, nodes: nodes) if nodes.empty?

  large = nodes#.select{ |i| i[ww] > nodes.map(&ww).max / 4 }
  logger.info "large enough and unique: #{large.size}"

  # two nodes interfere when their spans along the split axis overlap
  interfere = lambda do |a, b|
    a[tt] < b[tt] + b[hh] &&
    b[tt] < a[tt] + a[hh]
  end

  # Drop a node when another node spans it along the split axis and both
  # interfere with exactly the same nodes.
  rest = large.select.with_index do |a, i|
    large.each_with_index.none? do |b, j|
      next if i == j
      a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
      large.all?{ |c| interfere[a, c] == interfere[b, c] }
    end
  end
  logger.info "not nested: #{rest.size}"
  # rest = rest.sample 50

  # adding the :area field for faster upcoming computations
  struct = Struct.new *large.first.members, :area
  rest.map!{ |i| struct.new *i.values, i.width * i.height }

  require "pcbr"
  pcbr = PCBR.new
  is = []               # indices (into rest) of the candidate being extended
  max, past = 0, []     # best rank seen so far; bitmasks of expanded candidates
  prev = nil            # time of the latest improvement of max
  time = Time.now
  loop do
    # extend the current candidate by every node that does not interfere with it
    rest.each_with_index do |node, i|
      next if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
      sol = rest.values_at *is, i
      pcbr.store [*is, i].sort, [
        *( is.size if heuristics.include? :SIZE ),
        *( sol.map(&:area).inject(:+) if heuristics.include? :AREA ),
        *( -sol.product(sol).map{ |s1, s2| (s1.width - s2.width ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :WIDTH ),
        *( -sol.product(sol).map{ |s1, s2| (s1.height - s2.height ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
        *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
      ] unless pcbr.table.assoc [*is, i].sort
    end
    # Stop early once more than a second has passed and the time since the
    # latest improvement exceeds the time it took to reach it, provided the
    # best multi-node candidate is unique or 5 seconds have passed in total.
    if prev && Time.now - time > 1 && (Time.now - prev > (prev - time))
      m = pcbr.table.reject{ |i| i.first.size == 1 }.map(&:last).max
      break if 1 == pcbr.table.count{ |i| i.last == m } || Time.now - time > 5
    end
    # Continue from the stored candidate with the highest last table field
    # (presumably the rank computed by PCBR) that has not been expanded yet.
    break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.inject(:+) }.max_by(&:last)
    if t.last > max
      prev, max = Time.now, t.last
      logger.debug [Time.now - time, max, t.first]
    end
    past.push (is = t.first).map{ |i| 2**i }.inject(:+)
  end
  # TODO: if multiple with max score, take the max by area
  unless best = pcbr.table.reject{ |is,| is.size == 1 }.max_by(&:last)
    raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, inside: inside, nodes: nodes, large: large, rest: rest
  end
  rest.values_at(*best.first).extend(Dumpable)
end
228
+
229
# Splits this node along the vertical axis (by :top and :height).
#
# @param heuristics [Array<Symbol>] scoring criteria passed on to #split;
#   AREA, HEIGHT and WIDTH are used when none are given
# @return whatever #split returns
def rows *heuristics
  chosen = heuristics.empty? ? %i{ AREA HEIGHT WIDTH } : heuristics
  split chosen, :height, :width, :top, :left
end
233
# Splits this node along the horizontal axis (by :left and :width).
#
# @param heuristics [Array<Symbol>] scoring criteria passed on to #split;
#   AREA, HEIGHT and WIDTH are used when none are given
# @return whatever #split returns
def cols *heuristics
  chosen = heuristics.empty? ? %i{ AREA HEIGHT WIDTH } : heuristics
  split chosen, :width, :height, :left, :top
end
237
+
238
+
239
+ end
240
+
241
# Ferrum integration. It only takes effect when Ferrum::Frame::Runtime is
# already defined at the time this file is loaded.
if defined?(Ferrum::Frame::Runtime)
  Ferrum::Node.include(PageRecognizer)

  Ferrum::Frame::Runtime.module_eval do
    # Defines cyclic? (presumably overriding Ferrum's own implementation) to
    # call a JavaScript function on the given object that always returns false.
    def cyclic?(object_id)
      @page.command(
        "Runtime.callFunctionOn",
        objectId: object_id,
        returnByValue: true,
        functionDeclaration: "function(){return false}"
      )
    end
  end
end
@@ -0,0 +1,23 @@
1
# Gem specification of pagerecognizer.
Gem::Specification.new do |spec|
  spec.name         = "pagerecognizer"
  spec.version      = "0.0.1"
  spec.summary      = "visual HTML page structure recognizer"

  spec.author       = "Victor Maslov aka Nakilon"
  spec.email        = "nakilon@gmail.com"
  spec.license      = "MIT"
  spec.metadata     = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}

  # runtime dependencies
  %w{ nokogiri pcbr ferrum }.each{ |gem_name| spec.add_dependency gem_name }

  # development-only dependencies (testing, profiling, debugging)
  %w{ minitest ruby-prof byebug mll }.each{ |gem_name| spec.add_development_dependency gem_name }

  spec.require_path = "lib"
  spec.test_file    = "test.rb"
  spec.files        = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
end
data/test.rb ADDED
@@ -0,0 +1,28 @@
1
+ require "minitest/autorun"
2
+ require "ferrum"
3
+ require_relative "lib/pagerecognizer"
4
+ Ferrum::Node.include PageRecognizer
5
+
6
# Integration test: loads a saved search results page (google.htm, read from
# the current directory) into a headless browser, splits <body> into rows,
# keeps the rows of the most common width and checks the link and the start
# of the description found in each of them.
describe PageRecognizer do
  it "google" do
    browser = Ferrum::Browser.new **(ENV.has_key?("FERRUM_NO_SANDBOX") ? {browser_options: {"no-sandbox": nil}} : {})
    begin
      browser.goto "about:blank"
      browser.execute "document.write(#{File.read("google.htm").inspect})"
      results = browser.at_css("body").rows
      # the most frequent row width identifies the actual search results
      width = results.group_by(&:width).max_by{ |w, g| g.size }.first
      assert_equal [
        ["https://www.ruby-lang.org/ru/", "Ruby это... динамический язык программирования с о"],
        ["https://ru.wikibooks.org/wiki/Ruby", "Этот учебник намерен осветить все тонкости програм"],
        ["https://habr.com/ru/post/433672/", "19 дек. 2018 г. - Взрывной рост интереса к Ruby ос"],
        ["https://habr.com/ru/hub/ruby/", "Ruby (англ. Ruby — «Рубин») — динамический, рефлек"],
        ["https://web-creator.ru/articles/ruby", "Ruby разрабатывался на Linux, но работает на многи"],
        ["http://rusrails.ru/", "Ruby on Rails руководства, учебники, статьи на рус"],
        ["https://vc.ru/dev/72391-pochemu-my-vybiraem-ruby-d", "20 июн. 2019 г. - Ruby on Rails одним из первых на"],
        ["https://tproger.ru/tag/ruby/", "Django или Ruby on Rails: какой фреймворк выбрать?"],
        ["https://rubyrussia.club/", "Главная российская конференция о Ruby. Расширяем г"]
      ], results.select{ |r| r.width == width }.map(&:node).map(&:rows).map{ |link, desc| [
        link.node.at_css("a").property("href")[0,50],
        desc.node.text[0,50],
      ] }
    ensure
      # Always shut the browser process down, even when an assertion fails.
      browser.quit
    end
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pagerecognizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Victor Maslov aka Nakilon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pcbr
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: ferrum
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ruby-prof
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: byebug
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: mll
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description:
112
+ email: nakilon@gmail.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - LICENSE
118
+ - lib/pagerecognizer.rb
119
+ - pagerecognizer.gemspec
120
+ - test.rb
121
+ homepage:
122
+ licenses:
123
+ - MIT
124
+ metadata:
125
+ source_code_uri: https://github.com/nakilon/pagerecognizer
126
+ post_install_message:
127
+ rdoc_options: []
128
+ require_paths:
129
+ - lib
130
+ required_ruby_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ requirements: []
141
+ rubyforge_project:
142
+ rubygems_version: 2.5.2.3
143
+ signing_key:
144
+ specification_version: 4
145
+ summary: visual HTML page structure recognizer
146
+ test_files:
147
+ - test.rb