pagerecognizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 61a30cc5e39e171b8eabdf26490475ce3fb041b9
4
+ data.tar.gz: e64d2e3700730de8e9d3fe4d4de57f4743ca98b9
5
+ SHA512:
6
+ metadata.gz: cd4370f97135ac3df6376c2df1dcbb39cd25a35fac0ad172d29fa86764d8fb82d7ec28da7d4aaf0a547f1684cdda34f2d4de1b14de5c66ed784bb798643e96cd
7
+ data.tar.gz: e517075c5eb9d4efdb5bc851865136776df7dda1eacaecb96e3d89f211c63c6a5429b0ee4286280d150f7ea338662ceebdc0f9b0039d7df8a6a2778b349629f6
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Victor Maslov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,248 @@
1
# Recognizes the visual layout of a rendered HTML page by measuring DOM
# elements inside a live browser.
#
# The module is meant to be included into a browser node class (the file mixes
# it into Ferrum::Node when Ferrum is loaded). The host object has to provide:
#   * `page`     -- responds to `evaluate(js, node)` and returns plain Ruby data;
#   * `tag_name` -- used in error messages.
module PageRecognizer
  class << self
    # Logger receiving all progress/error messages; may be replaced by the user.
    attr_accessor :logger
  end
  require "logger"
  require "json"    # recognize_more decodes JSON produced by the in-page script
  self.logger = Logger.new STDOUT

  # Mixin for arrays of recognized nodes. Every item has to respond to
  # `node` (which responds to `tag_name`), `top`, `left`, `width` and `height`.
  module Dumpable
    # Renders the items as absolutely positioned, semi-transparent <div>s
    # (each with its own hue) labelled with the upcased tag name.
    # Returns the HTML document as a String; PageRecognizer.load reads it back.
    def dump
      boxes = map.with_index do |n, i|
        hue = 360 * i / size
        "<div style='position: absolute; background-color: hsla(#{hue},100%,50%,0.5); " \
        "top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>" \
        "#{n.node.tag_name.upcase}</div>"
      end
      "<html><body>#{boxes.join}</body></html>"
    end
  end

  # Parses HTML produced by Dumpable#dump back into an array of structs with
  # `node` (responding only to `tag_name`), `top`, `left`, `width`, `height`.
  # The coordinates are Floats. The returned array is extended with Dumpable.
  def self.load str
    require "nokogiri"    # loaded lazily: only this method needs it
    # the struct classes are created once instead of once per <div>
    tag = Struct.new :tag_name
    box = Struct.new :node, :top, :left, :width, :height
    Nokogiri::HTML(str).css("div").map do |div|
      style = div[:style].scan(/(\S+): ([^\;]+)/).to_h
      box.new tag.new(div.text), *style.values_at("top", "left", "width", "height").map(&:to_f)
    end.extend Dumpable
  end

  # Evaluates +code+ in the browser against this node until two consecutive
  # runs return the same number of rows, i.e. the DOM stopped growing/shrinking.
  # Every raw row is converted by the given block.
  #
  # With Selenium loaded the polling is delegated to Selenium::WebDriver::Wait
  # (its own default timeout applies); otherwise it fails with a RuntimeError
  # after +timeout+ seconds.
  #
  # Returns the converted rows of the last run.
  private def collect_stable_nodes code, timeout
    message = "number of DOM elements didn't stop to change"
    nodes = []
    attempt = lambda do
      previous = nodes
      nodes = page.evaluate(code, self).map{ |raw| yield raw }
      nodes.size == previous.size
    end

    if defined? Selenium::WebDriver::Wait
      Selenium::WebDriver::Wait.new(message: message).until(&attempt)
    else
      started = Time.now
      until attempt.call
        fail message if Time.now > started + timeout
      end
    end
    nodes
  end

  # Finds the innermost "clickable" elements under this node.
  #
  # An element counts as clickable when `document.elementFromPoint` at the
  # centre of its bounding rect returns the element itself. Children of <svg>
  # are temporarily hidden for that check and are not descended into.
  #
  # Returns an array (extended with Dumpable) of structs with document-relative
  # `top`/`left`, `width`, `height`, `clickable` and `node`.
  def recognize
    logger = PageRecognizer.logger

    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        var child_nodes = Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 });
        var clickable;
        if (node.nodeName == 'svg') {
          var states = child_nodes.map( function(n){
            return _tap(n.style ? n.style.display : '', function(){ n.style.display = 'none' } );
          } );
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
          var _zip = function(a, b){ return a.map( function(e, i) { return [e, b[i]] } ) };
          _zip(child_nodes, states).forEach( function(_){ _[0].style.display = _[1] } );
        } else {
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
        };
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          rect.top, rect.left, rect.width, rect.height, clickable, node
        ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :top, :left, :width, :height, :clickable, :node
    nodes = collect_stable_nodes(code, 5){ |row| str.new(*row) }
    logger.info "#{nodes.size} DOM nodes found"

    nodes.select!(&:clickable)
    # drop every node that fully contains another clickable node of a different
    # rect: all four comparisons are >= 0 and at least one of them is strict
    nodes.reject do |n|
      nodes.any? do |nn|
        cs = [
          nn.top <=> n.top,
          nn.left <=> n.left,
          n.left + n.width <=> nn.left + nn.width,
          n.top + n.height <=> nn.top + nn.height,
        ]
        cs.include?(1) && !cs.include?(-1)
      end
    end.extend Dumpable
  end

  # Measures this node and all element nodes below it.
  #
  # Returns an array of structs with `node` and document-relative `top`/`left`,
  # `width`, `height`; elements with zero width or height are dropped.
  private def recognize_more
    logger = PageRecognizer.logger

    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          node, JSON.stringify([rect.top, rect.left, rect.width, rect.height])
        ] ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :node, :top, :left, :width, :height
    # JSON.parse: the payload is a plain array of numbers, none of the extra
    # features of JSON.load are wanted for data coming from the page
    nodes = collect_stable_nodes(code, 10){ |node, a| str.new node, *JSON.parse(a) }
    logger.info "#{nodes.size} DOM nodes found"

    nodes.reject!{ |i| i.height.zero? || i.width.zero? }
    nodes
  end

  # Base for errors that are written to the logger at construction time and
  # carry HTML dumps (see Dumpable#dump) of the node arrays involved;
  # `dumps` maps the array name to its dump.
  logging_error = Class.new RuntimeError do
    attr_reader :dumps
    def initialize msg, arrays
      PageRecognizer.logger.error "#{self.class}: #{msg}"
      @dumps = arrays.map{ |name, array| [name, array.extend(Dumpable).dump] }.to_h
      super msg
    end
  end
  # Raised by #rows / #cols when the node can't be split into at least two parts.
  class ErrorNotEnoughNodes < logging_error ; end

  # Splits this node into the best set of descendants that don't overlap along
  # the main axis.
  #
  # heuristics -- Array of :SIZE, :AREA, :WIDTH, :HEIGHT, :MIDDLE selecting the
  #               criteria pushed into the score vector of a candidate set
  # hh, ww     -- struct members holding the size along / across the main axis
  # tt, ll     -- struct members holding the position along / across the main axis
  #
  # Returns an array (extended with Dumpable) of structs with `node`, `top`,
  # `left`, `width`, `height` and `area`.
  # Raises ErrorNotEnoughNodes when no descendant lies inside the node or when
  # no set of two or more nodes was found.
  private def split heuristics, hh, ww, tt, ll
    logger = PageRecognizer.logger

    # under Selenium the element references may go stale while being read,
    # so the wrapped block is retried for up to 10 seconds
    unstale = if defined? Selenium::WebDriver::Error::StaleElementReferenceError
      lambda do |&try|
        t = Time.now
        begin
          try.call
        rescue Selenium::WebDriver::Error::StaleElementReferenceError
          raise if Time.now > t + 10
          retry
        end
      end
    else
      ->(&b){ b.call }
    end

    all = unstale.call do recognize_more end.sort_by(&tt)
    logger.info "all nodes: #{all.size}"
    # recognize_more reports document-relative coordinates, so the rect of this
    # node has to be offset by the scroll position as well -- otherwise the
    # "inside" filter below is wrong whenever the page is scrolled
    rect = page.evaluate("( function(node) { var r = node.getBoundingClientRect(); return { left: r.left + scrollX, right: r.right + scrollX, top: r.top + scrollY, bottom: r.bottom + scrollY } } )(arguments[0])", self)
    inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
    raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
    logger.info "inside nodes: #{inside.size}"
    nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
    logger.info "good nodes: #{nodes.size}"   # only those that might be containers

    large = nodes#.select{ |i| i[ww] > nodes.map(&ww).max / 4 }
    logger.info "large enough and unique: #{large.size}"

    # true when two nodes overlap along the main axis
    interfere = lambda do |a, b|
      a[tt] < b[tt] + b[hh] &&
      b[tt] < a[tt] + a[hh]
    end

    # drop a node when another one spans it along the main axis and both
    # interfere with exactly the same nodes
    rest = large.select.with_index do |a, i|
      large.each_with_index.none? do |b, j|
        next if i == j
        a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
        large.all?{ |c| interfere[a, c] == interfere[b, c] }
      end
    end
    logger.info "not nested: #{rest.size}"
    # rest = rest.sample 50

    # adding the :area field for faster upcoming computations;
    # the members are taken from `all` (never empty here) because `large` is
    # empty when every inside node was filtered out by its tag name
    struct = Struct.new(*all.first.members, :area)
    rest.map!{ |i| struct.new(*i.values, i.width * i.height) }

    # negated mean absolute pairwise difference over all ordered pairs of +sol+
    spread = lambda do |sol, &diff|
      total = sol.product(sol).map{ |s1, s2| diff.call(s1, s2).abs }.inject(:+)
      -total / sol.size / sol.size
    end
    # index set => Integer bitmask, the key of already expanded sets
    mask = ->(ids){ ids.map{ |i| 2**i }.inject(:+) }

    require "pcbr"    # loaded lazily: only splitting needs it
    # rows of pcbr.table are used as [index set, ..., score]
    pcbr = PCBR.new
    is = []
    max, past = 0, {}
    prev = nil
    time = Time.now
    loop do
      # extend the current set `is` with every node that doesn't interfere with it
      rest.each_with_index do |node, i|
        next if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
        key = [*is, i].sort
        next if pcbr.table.assoc key
        sol = rest.values_at(*is, i)
        vector = []
        vector.push is.size                      if heuristics.include? :SIZE
        vector.push sol.map(&:area).inject(:+)   if heuristics.include? :AREA
        vector.push(spread.call(sol){ |s1, s2| s1.width  - s2.width  }) if heuristics.include? :WIDTH
        vector.push(spread.call(sol){ |s1, s2| s1.height - s2.height }) if heuristics.include? :HEIGHT
        vector.push(spread.call(sol){ |s1, s2| s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0 }) if heuristics.include? :MIDDLE
        pcbr.store key, vector
      end
      # give up searching once the best score hasn't improved for longer than it
      # took to reach it (and the leader is unique), or after 5 seconds
      if prev && Time.now - time > 1 && (Time.now - prev > (prev - time))
        m = pcbr.table.reject{ |i| i.first.size == 1 }.map(&:last).max
        break if 1 == pcbr.table.count{ |i| i.last == m } || Time.now - time > 5
      end
      # continue from the best set that wasn't expanded yet
      break unless t = pcbr.table.reject{ |ids,| past.key? mask[ids] }.max_by(&:last)
      if t.last > max
        prev, max = Time.now, t.last
        logger.debug [Time.now - time, max, t.first]
      end
      is = t.first
      past[mask[is]] = true
    end
    # TODO: if multiple with max score, take the max by area
    unless best = pcbr.table.reject{ |ids,| ids.size == 1 }.max_by(&:last)
      raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, inside: inside, nodes: nodes, large: large, rest: rest
    end
    rest.values_at(*best.first).extend(Dumpable)
  end

  # Splits the node into rows (parts that don't overlap vertically).
  # Heuristics default to AREA, HEIGHT and WIDTH. See #split for the result.
  def rows *heuristics
    heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
    split heuristics, :height, :width, :top, :left
  end
  # Splits the node into columns (parts that don't overlap horizontally).
  # Heuristics default to AREA, HEIGHT and WIDTH. See #split for the result.
  def cols *heuristics
    heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
    split heuristics, :width, :height, :left, :top
  end


end
240
+
241
# Ferrum integration, applied only when Ferrum is already loaded:
# every Ferrum::Node gets the PageRecognizer methods, and
# Ferrum::Frame::Runtime#cyclic? is replaced with a version that asks the
# browser to run a function that always returns false.
# NOTE(review): presumably this bypasses Ferrum's own cyclic-structure check
# for values returned by `evaluate` -- confirm against the Ferrum version in use.
if defined? Ferrum::Frame::Runtime
  Ferrum::Node.send :include, PageRecognizer
  Ferrum::Frame::Runtime.class_eval do
    def cyclic? object_id
      @page.command(
        "Runtime.callFunctionOn",
        objectId: object_id,
        returnByValue: true,
        functionDeclaration: "function(){return false}"
      )
    end
  end
end
@@ -0,0 +1,23 @@
1
# Gem specification of pagerecognizer.
Gem::Specification.new do |spec|
  spec.name         = "pagerecognizer"
  spec.version      = "0.0.1"
  spec.summary      = "visual HTML page structure recognizer"

  spec.author       = "Victor Maslov aka Nakilon"
  spec.email        = "nakilon@gmail.com"
  spec.license      = "MIT"
  spec.metadata     = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}

  # lib/pagerecognizer.rb uses `private def ...` and Array#to_h,
  # both of which need Ruby 2.1 or newer
  spec.required_ruby_version = ">= 2.1"

  # runtime dependencies
  spec.add_dependency "nokogiri"
  spec.add_dependency "pcbr"
  spec.add_dependency "ferrum"
  spec.add_development_dependency "minitest"

  # development-only tooling
  spec.add_development_dependency "ruby-prof"
  spec.add_development_dependency "byebug"
  spec.add_development_dependency "mll"

  spec.require_path = "lib"
  spec.test_file    = "test.rb"
  spec.files        = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
end
data/test.rb ADDED
@@ -0,0 +1,28 @@
1
# Integration test: renders a saved Google results page ("google.htm", read
# from the current directory) in a headless browser driven by Ferrum and checks
# that PageRecognizer splits it into the expected search results.
require "minitest/autorun"
require "ferrum"
require_relative "lib/pagerecognizer"
Ferrum::Node.include PageRecognizer

describe PageRecognizer do
  it "google" do
    # FERRUM_NO_SANDBOX in the environment starts the browser with "no-sandbox"
    browser = Ferrum::Browser.new **(ENV.has_key?("FERRUM_NO_SANDBOX") ? {browser_options: {"no-sandbox": nil}} : {})
    begin
      browser.goto "about:blank"
      browser.execute "document.write(#{File.read("google.htm").inspect})"
      results = browser.at_css("body").rows
      # the search results are the rows sharing the most common width
      width = results.group_by(&:width).max_by{ |w, g| g.size }.first
      assert_equal [
        ["https://www.ruby-lang.org/ru/", "Ruby это... динамический язык программирования с о"],
        ["https://ru.wikibooks.org/wiki/Ruby", "Этот учебник намерен осветить все тонкости програм"],
        ["https://habr.com/ru/post/433672/", "19 дек. 2018 г. - Взрывной рост интереса к Ruby ос"],
        ["https://habr.com/ru/hub/ruby/", "Ruby (англ. Ruby — «Рубин») — динамический, рефлек"],
        ["https://web-creator.ru/articles/ruby", "Ruby разрабатывался на Linux, но работает на многи"],
        ["http://rusrails.ru/", "Ruby on Rails руководства, учебники, статьи на рус"],
        ["https://vc.ru/dev/72391-pochemu-my-vybiraem-ruby-d", "20 июн. 2019 г. - Ruby on Rails одним из первых на"],
        ["https://tproger.ru/tag/ruby/", "Django или Ruby on Rails: какой фреймворк выбрать?"],
        ["https://rubyrussia.club/", "Главная российская конференция о Ruby. Расширяем г"]
      ], results.select{ |r| r.width == width }.map(&:node).map(&:rows).map{ |link, desc| [
        link.node.at_css("a").property("href")[0,50],
        desc.node.text[0,50],
      ] }
    ensure
      # always shut the browser process down, also when the assertion fails
      browser.quit
    end
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pagerecognizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Victor Maslov aka Nakilon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pcbr
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: ferrum
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ruby-prof
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: byebug
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: mll
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description:
112
+ email: nakilon@gmail.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - LICENSE
118
+ - lib/pagerecognizer.rb
119
+ - pagerecognizer.gemspec
120
+ - test.rb
121
+ homepage:
122
+ licenses:
123
+ - MIT
124
+ metadata:
125
+ source_code_uri: https://github.com/nakilon/pagerecognizer
126
+ post_install_message:
127
+ rdoc_options: []
128
+ require_paths:
129
+ - lib
130
+ required_ruby_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ requirements: []
141
+ rubyforge_project:
142
+ rubygems_version: 2.5.2.3
143
+ signing_key:
144
+ specification_version: 4
145
+ summary: visual HTML page structure recognizer
146
+ test_files:
147
+ - test.rb