pagerecognizer 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/pagerecognizer.rb +317 -125
 - data/pagerecognizer.gemspec +3 -4
 - data/test.rb +64 -20
 - metadata +9 -9
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 17ef706811d7513a3f7f6a109feacb59bbae91dc
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: e6890fcd6c6bfdd6d042513f02dea280e5c436ae
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 74fc2c48871a01192e4ebbd80f603a170b1f953f61252f8bf21627bc3abb8555beb12572ab786754f6f015870bdaa6cec54daab23d15068c7aa0152b70c612aa
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 3e6a080c6740075bab1c111127249ade7429ac844fcf1c0b99979c7a535cd5013cf9bd86629957ee7bcd4467828d023831a51df078037dff5cfc777c6a81be9e
         
     | 
    
        data/lib/pagerecognizer.rb
    CHANGED
    
    | 
         @@ -4,14 +4,16 @@ module PageRecognizer 
     | 
|
| 
       4 
4 
     | 
    
         
             
              end
         
     | 
| 
       5 
5 
     | 
    
         
             
              require "logger"
         
     | 
| 
       6 
6 
     | 
    
         
             
              self.logger = Logger.new STDOUT
         
     | 
| 
      
 7 
     | 
    
         
            +
              self.logger.formatter = ->(severity, datetime, progname, msg){ "#{datetime.strftime "%H%M%S"} #{severity.to_s[0]} #{msg}\n" }
         
     | 
| 
      
 8 
     | 
    
         
            +
              self.logger.level = ENV.fetch("LOGLEVEL_PageRecognizer", "FATAL").to_sym
         
     | 
| 
       7 
9 
     | 
    
         | 
| 
       8 
10 
     | 
    
         
             
              module Dumpable
         
     | 
| 
       9 
11 
     | 
    
         
             
                def dump
         
     | 
| 
       10 
     | 
    
         
            -
                  "<html><body>#{
         
     | 
| 
      
 12 
     | 
    
         
            +
                  "<html><body style='white-space: nowrap'>#{
         
     | 
| 
       11 
13 
     | 
    
         
             
                    map.with_index do |n, i|
         
     | 
| 
       12 
     | 
    
         
            -
                      "<div style='position: absolute; background-color: hsla(#{
         
     | 
| 
      
 14 
     | 
    
         
            +
                      "<div id='#{i}' style='position: absolute; background-color: hsla(#{
         
     | 
| 
       13 
15 
     | 
    
         
             
                        360 * i / size
         
     | 
| 
       14 
     | 
    
         
            -
                      },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{
         
     | 
| 
      
 16 
     | 
    
         
            +
                      },100%,50%,0.5); top: #{n.top}; left: #{n.left}; width: #{n.width}; height: #{n.height}'>#{i} #{
         
     | 
| 
       15 
17 
     | 
    
         
             
                        n.node.tag_name.upcase
         
     | 
| 
       16 
18 
     | 
    
         
             
                      }</div>"
         
     | 
| 
       17 
19 
     | 
    
         
             
                    end.join
         
     | 
| 
         @@ -29,92 +31,110 @@ module PageRecognizer 
     | 
|
| 
       29 
31 
     | 
    
         
             
                end.extend Dumpable
         
     | 
| 
       30 
32 
     | 
    
         
             
              end
         
     | 
| 
       31 
33 
     | 
    
         | 
| 
       32 
     | 
    
         
            -
              def  
     | 
| 
      
 34 
     | 
    
         
            +
              def self.rgb2hsv r, g, b   # [<256, <256, <256]
         
     | 
| 
      
 35 
     | 
    
         
            +
                # http://stackoverflow.com/q/41926874/322020
         
     | 
| 
      
 36 
     | 
    
         
            +
                r, g, b  = [r, g, b].map{ |_| _.fdiv 255 }
         
     | 
| 
      
 37 
     | 
    
         
            +
                min, max = [r, g, b].minmax
         
     | 
| 
      
 38 
     | 
    
         
            +
                chroma   = max - min
         
     | 
| 
      
 39 
     | 
    
         
            +
                [
         
     | 
| 
      
 40 
     | 
    
         
            +
                  60.0 * ( chroma.zero? ? 0 : case max
         
     | 
| 
      
 41 
     | 
    
         
            +
                    when r ; (g - b) / chroma
         
     | 
| 
      
 42 
     | 
    
         
            +
                    when g ; (b - r) / chroma + 2
         
     | 
| 
      
 43 
     | 
    
         
            +
                    when b ; (r - g) / chroma + 4
         
     | 
| 
      
 44 
     | 
    
         
            +
                    else 0
         
     | 
| 
      
 45 
     | 
    
         
            +
                  end % 6 ),
         
     | 
| 
      
 46 
     | 
    
         
            +
                  chroma.zero? ? 0.0 : chroma / max,
         
     | 
| 
      
 47 
     | 
    
         
            +
                  max,
         
     | 
| 
      
 48 
     | 
    
         
            +
                ]   # [<=360, <=1, <=1]
         
     | 
| 
      
 49 
     | 
    
         
            +
              end
         
     | 
| 
      
 50 
     | 
    
         
            +
              def self.dist h1, s1, v1, h2, s2, v2   # [<256, <256, <256]
         
     | 
| 
      
 51 
     | 
    
         
            +
                # https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
         
     | 
| 
      
 52 
     | 
    
         
            +
                c1, c2 = s1 * v1 / 256.0, s2 * v2 / 256.0   # chroma
         
     | 
| 
      
 53 
     | 
    
         
            +
                z1, z2 = v1 * (2 - c1 / 256), v2 * (2 - c2 / 256)
         
     | 
| 
      
 54 
     | 
    
         
            +
                a = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
         
     | 
| 
      
 55 
     | 
    
         
            +
                    x2 =     Math::sin(a) * c2
         
     | 
| 
      
 56 
     | 
    
         
            +
                y1, y2 = c1, Math::cos(a) * c2
         
     | 
| 
      
 57 
     | 
    
         
            +
                x2*x2 + (y1-y2)*(y1-y2) + (z1-z2)*(z1-z2)
         
     | 
| 
      
 58 
     | 
    
         
            +
              end
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
              private def recognize
         
     | 
| 
       33 
61 
     | 
    
         
             
                logger = Module.nesting.first.logger
         
     | 
| 
      
 62 
     | 
    
         
            +
                logger.info "method #{__method__}..."
         
     | 
| 
       34 
63 
     | 
    
         | 
| 
       35 
64 
     | 
    
         
             
                nodes = []
         
     | 
| 
       36 
65 
     | 
    
         
             
                try = lambda do
         
     | 
| 
       37 
     | 
    
         
            -
                   
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
       50 
     | 
    
         
            -
             
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
             
     | 
| 
       53 
     | 
    
         
            -
             
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
                       
     | 
| 
       57 
     | 
    
         
            -
                      rect.left += scrollX;
         
     | 
| 
       58 
     | 
    
         
            -
                      return [ [
         
     | 
| 
       59 
     | 
    
         
            -
                        rect.top, rect.left, rect.width, rect.height, clickable, node
         
     | 
| 
       60 
     | 
    
         
            -
                      ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
         
     | 
| 
       61 
     | 
    
         
            -
                    };
         
     | 
| 
       62 
     | 
    
         
            -
                    return _tap(f(node), function(){ scrollTo(x, y) });
         
     | 
| 
       63 
     | 
    
         
            -
                  } )(arguments[0])"
         
     | 
| 
       64 
     | 
    
         
            -
                  str = Struct.new :top, :left, :width, :height, :clickable, :node
         
     | 
| 
       65 
     | 
    
         
            -
                  nodes = page.evaluate(code, self).map{ |s| str.new *s }
         
     | 
| 
       66 
     | 
    
         
            -
                  nodes.size == prev.size
         
     | 
| 
       67 
     | 
    
         
            -
                end
         
     | 
| 
      
 66 
     | 
    
         
            +
                  str = Struct.new :node, :visible, :top, :left, :width, :height, :area do
         
     | 
| 
      
 67 
     | 
    
         
            +
                    def texts
         
     | 
| 
      
 68 
     | 
    
         
            +
                      node.page.evaluate(<<~HEREDOC, node).map(&JSON.method(:load)).map do |text, rect1, rect2, style|
         
     | 
| 
      
 69 
     | 
    
         
            +
                        (function(node){
         
     | 
| 
      
 70 
     | 
    
         
            +
                          let result = [], range = document.createRange();
         
     | 
| 
      
 71 
     | 
    
         
            +
                          for (
         
     | 
| 
      
 72 
     | 
    
         
            +
                            let iterator = document.evaluate('.//text()', node, null, XPathResult.ANY_TYPE, null);
         
     | 
| 
      
 73 
     | 
    
         
            +
                            text = iterator.iterateNext();
         
     | 
| 
      
 74 
     | 
    
         
            +
                          ) {
         
     | 
| 
      
 75 
     | 
    
         
            +
                            range.selectNode(text);
         
     | 
| 
      
 76 
     | 
    
         
            +
                            result.push(JSON.stringify( [
         
     | 
| 
      
 77 
     | 
    
         
            +
                              text.wholeText,
         
     | 
| 
      
 78 
     | 
    
         
            +
                              range.getBoundingClientRect(),
         
     | 
| 
      
 79 
     | 
    
         
            +
                              text.parentNode.getBoundingClientRect(),
         
     | 
| 
      
 80 
     | 
    
         
            +
                              getComputedStyle(text.parentNode),
         
     | 
| 
      
 81 
     | 
    
         
            +
                            ] ));
         
     | 
| 
      
 82 
     | 
    
         
            +
                          }
         
     | 
| 
      
 83 
     | 
    
         
            +
                          return result;
         
     | 
| 
      
 84 
     | 
    
         
            +
                        })(arguments[0])
         
     | 
| 
      
 85 
     | 
    
         
            +
                      HEREDOC
         
     | 
| 
       68 
86 
     | 
    
         | 
| 
       69 
     | 
    
         
            -
             
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
                  ).until &try
         
     | 
| 
       73 
     | 
    
         
            -
                else
         
     | 
| 
       74 
     | 
    
         
            -
                  t = Time.now
         
     | 
| 
       75 
     | 
    
         
            -
                  until try.call
         
     | 
| 
       76 
     | 
    
         
            -
                    fail "number of DOM elements didn't stop to change" if Time.now > t + 5
         
     | 
| 
       77 
     | 
    
         
            -
                  end
         
     | 
| 
       78 
     | 
    
         
            -
                end
         
     | 
| 
       79 
     | 
    
         
            -
                logger.info "#{nodes.size} DOM nodes found"
         
     | 
| 
      
 87 
     | 
    
         
            +
                        # google SERP has 1x1 nodes with text _<>
         
     | 
| 
      
 88 
     | 
    
         
            +
                        next if rect1["width"] < 2 || rect1["height"] < 2
         
     | 
| 
      
 89 
     | 
    
         
            +
                        next if rect2["width"] < 2 || rect2["height"] < 2
         
     | 
| 
       80 
90 
     | 
    
         | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
     | 
    
         
            -
             
     | 
| 
       83 
     | 
    
         
            -
             
     | 
| 
       84 
     | 
    
         
            -
             
     | 
| 
       85 
     | 
    
         
            -
             
     | 
| 
       86 
     | 
    
         
            -
             
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
             
     | 
| 
       90 
     | 
    
         
            -
             
     | 
| 
      
 91 
     | 
    
         
            +
                        color = style["color"]
         
     | 
| 
      
 92 
     | 
    
         
            +
                        fail color unless /\Argba?\((?<red>\d+), (?<green>\d+), (?<blue>\d+)(, 0(\.\d+)?)?\)\z/ =~ color
         
     | 
| 
      
 93 
     | 
    
         
            +
                        closest_color = {   # https://en.wikipedia.org/wiki/Web_colors#Basic_colors
         
     | 
| 
      
 94 
     | 
    
         
            +
                          white: [0, 0, 100],
         
     | 
| 
      
 95 
     | 
    
         
            +
                          silver: [0, 0, 75],
         
     | 
| 
      
 96 
     | 
    
         
            +
                          gray: [0, 0, 50],
         
     | 
| 
      
 97 
     | 
    
         
            +
                          black: [0, 0, 0],
         
     | 
| 
      
 98 
     | 
    
         
            +
                          red: [0, 100, 100],
         
     | 
| 
      
 99 
     | 
    
         
            +
                          maroon: [0, 100, 50],
         
     | 
| 
      
 100 
     | 
    
         
            +
                          yellow: [60, 100, 100],
         
     | 
| 
      
 101 
     | 
    
         
            +
                          olive: [60, 100, 50],
         
     | 
| 
      
 102 
     | 
    
         
            +
                          lime: [120, 100, 100],
         
     | 
| 
      
 103 
     | 
    
         
            +
                          green: [120, 100, 50],
         
     | 
| 
      
 104 
     | 
    
         
            +
                          aqua: [180, 100, 100],
         
     | 
| 
      
 105 
     | 
    
         
            +
                          teal: [180, 100, 50],
         
     | 
| 
      
 106 
     | 
    
         
            +
                          blue: [240, 100, 100],
         
     | 
| 
      
 107 
     | 
    
         
            +
                          navy: [240, 100, 50],
         
     | 
| 
      
 108 
     | 
    
         
            +
                          fuchsia: [300, 100, 100],
         
     | 
| 
      
 109 
     | 
    
         
            +
                          purple: [300, 100, 50],
         
     | 
| 
      
 110 
     | 
    
         
            +
                        }.to_a.min_by do |_, (h1, s1, v1)|
         
     | 
| 
      
 111 
     | 
    
         
            +
                          h2, s2, v2 = PageRecognizer.rgb2hsv(red.to_i, green.to_i, blue.to_i)
         
     | 
| 
      
 112 
     | 
    
         
            +
                          PageRecognizer.dist h1*255/360, s1*256/100, v1*256/100, h2*255/360, s2*255, v2*255
         
     | 
| 
      
 113 
     | 
    
         
            +
                        end.first
         
     | 
| 
      
 114 
     | 
    
         
            +
                        [text, style, closest_color, rect1]
         
     | 
| 
      
 115 
     | 
    
         
            +
                      end.compact
         
     | 
| 
      
 116 
     | 
    
         
            +
                    end
         
     | 
| 
       91 
117 
     | 
    
         
             
                  end
         
     | 
| 
       92 
     | 
    
         
            -
             
     | 
| 
       93 
     | 
    
         
            -
             
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
             
     | 
| 
       96 
     | 
    
         
            -
             
     | 
| 
       97 
     | 
    
         
            -
             
     | 
| 
       98 
     | 
    
         
            -
             
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
             
     | 
| 
       102 
     | 
    
         
            -
             
     | 
| 
       103 
     | 
    
         
            -
             
     | 
| 
       104 
     | 
    
         
            -
             
     | 
| 
       105 
     | 
    
         
            -
                       
     | 
| 
       106 
     | 
    
         
            -
                       
     | 
| 
       107 
     | 
    
         
            -
             
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
             
     | 
| 
       110 
     | 
    
         
            -
             
     | 
| 
       111 
     | 
    
         
            -
             
     | 
| 
       112 
     | 
    
         
            -
                    };
         
     | 
| 
       113 
     | 
    
         
            -
                    return _tap(f(node), function(){ scrollTo(x, y) });
         
     | 
| 
       114 
     | 
    
         
            -
                  } )(arguments[0])"
         
     | 
| 
       115 
     | 
    
         
            -
                  str = Struct.new :node, :top, :left, :width, :height
         
     | 
| 
       116 
     | 
    
         
            -
                  nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
         
     | 
| 
       117 
     | 
    
         
            -
                  nodes.size == prev.size
         
     | 
| 
      
 118 
     | 
    
         
            +
                  prev = nodes.size
         
     | 
| 
      
 119 
     | 
    
         
            +
                  t = page.evaluate(<<~HEREDOC, self)
         
     | 
| 
      
 120 
     | 
    
         
            +
                    ( function(node) {
         
     | 
| 
      
 121 
     | 
    
         
            +
                      var x = scrollX, y = scrollY;
         
     | 
| 
      
 122 
     | 
    
         
            +
                      var _tap = function(x, f){ f(); return x };
         
     | 
| 
      
 123 
     | 
    
         
            +
                      var f = function(node) {
         
     | 
| 
      
 124 
     | 
    
         
            +
                        node.scrollIntoView();
         
     | 
| 
      
 125 
     | 
    
         
            +
                        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
         
     | 
| 
      
 126 
     | 
    
         
            +
                        rect.top += scrollY;
         
     | 
| 
      
 127 
     | 
    
         
            +
                        rect.left += scrollX;
         
     | 
| 
      
 128 
     | 
    
         
            +
                        return [
         
     | 
| 
      
 129 
     | 
    
         
            +
                          node, JSON.stringify([rect.top, rect.left, rect.width, rect.height]), ("visible" == getComputedStyle(node).visibility)
         
     | 
| 
      
 130 
     | 
    
         
            +
                        ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
         
     | 
| 
      
 131 
     | 
    
         
            +
                      };
         
     | 
| 
      
 132 
     | 
    
         
            +
                      return _tap(f(node), function(){ scrollTo(x, y) });
         
     | 
| 
      
 133 
     | 
    
         
            +
                    } )(arguments[0])
         
     | 
| 
      
 134 
     | 
    
         
            +
                  HEREDOC
         
     | 
| 
      
 135 
     | 
    
         
            +
                  logger.debug [t.size / 3, prev]
         
     | 
| 
      
 136 
     | 
    
         
            +
                  nodes = t.each_slice(3).map{ |node, rect, visible| str.new(node, visible, *JSON.load(rect)).tap{ |_| _.area = _.width * _.height } }
         
     | 
| 
      
 137 
     | 
    
         
            +
                  nodes.size == prev
         
     | 
| 
       118 
138 
     | 
    
         
             
                end
         
     | 
| 
       119 
139 
     | 
    
         | 
| 
       120 
140 
     | 
    
         
             
                if defined? Selenium::WebDriver::Wait
         
     | 
| 
         @@ -128,9 +148,9 @@ module PageRecognizer 
     | 
|
| 
       128 
148 
     | 
    
         
             
                  end
         
     | 
| 
       129 
149 
     | 
    
         
             
                end
         
     | 
| 
       130 
150 
     | 
    
         
             
                logger.info "#{nodes.size} DOM nodes found"
         
     | 
| 
       131 
     | 
    
         
            -
             
     | 
| 
       132 
     | 
    
         
            -
                 
     | 
| 
       133 
     | 
    
         
            -
                nodes
         
     | 
| 
      
 151 
     | 
    
         
            +
                nodes.reject!{ |_| _.height.zero? || _.width.zero? || !_.visible }
         
     | 
| 
      
 152 
     | 
    
         
            +
                logger.info "visible nodes: #{nodes.size}"
         
     | 
| 
      
 153 
     | 
    
         
            +
                nodes.extend Dumpable
         
     | 
| 
       134 
154 
     | 
    
         
             
              end
         
     | 
| 
       135 
155 
     | 
    
         | 
| 
       136 
156 
     | 
    
         
             
              logging_error = Class.new RuntimeError do
         
     | 
| 
         @@ -143,8 +163,9 @@ module PageRecognizer 
     | 
|
| 
       143 
163 
     | 
    
         
             
              end
         
     | 
| 
       144 
164 
     | 
    
         
             
              class ErrorNotEnoughNodes < logging_error ; end
         
     | 
| 
       145 
165 
     | 
    
         | 
| 
       146 
     | 
    
         
            -
              private def split  
     | 
| 
      
 166 
     | 
    
         
            +
              private def split hh, ww, tt, ll, heuristics, try_min, dump, &filter
         
     | 
| 
       147 
167 
     | 
    
         
             
                logger = Module.nesting.first.logger
         
     | 
| 
      
 168 
     | 
    
         
            +
                logger.info heuristics
         
     | 
| 
       148 
169 
     | 
    
         | 
| 
       149 
170 
     | 
    
         
             
                unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
         
     | 
| 
       150 
171 
     | 
    
         
             
                  ->(&b){ b.call }
         
     | 
| 
         @@ -159,82 +180,253 @@ module PageRecognizer 
     | 
|
| 
       159 
180 
     | 
    
         
             
                    end
         
     | 
| 
       160 
181 
     | 
    
         
             
                  end
         
     | 
| 
       161 
182 
     | 
    
         
             
                end
         
     | 
| 
       162 
     | 
    
         
            -
                all = unstale.call do recognize_more end.sort_by(&tt)
         
     | 
| 
       163 
     | 
    
         
            -
                logger.info "all nodes: #{all.size}"
         
     | 
| 
       164 
     | 
    
         
            -
                rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
         
     | 
| 
       165 
     | 
    
         
            -
                inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
         
     | 
| 
       166 
     | 
    
         
            -
                raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
         
     | 
| 
       167 
     | 
    
         
            -
                logger.info "inside nodes: #{inside.size}"
         
     | 
| 
       168 
     | 
    
         
            -
                nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
         
     | 
| 
       169 
     | 
    
         
            -
                logger.info "good nodes: #{nodes.size}"   # only those that might be containers
         
     | 
| 
       170 
183 
     | 
    
         | 
| 
       171 
     | 
    
         
            -
                 
     | 
| 
       172 
     | 
    
         
            -
                 
     | 
| 
      
 184 
     | 
    
         
            +
                nodes = unstale.call do recognize end.sort_by{ |_| [_[tt], _[ll]] }
         
     | 
| 
      
 185 
     | 
    
         
            +
                File.write "#{dump}.all.htm", nodes.extend(Dumpable).dump if dump
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
      
 188 
     | 
    
         
            +
                nodes = unstale.call do nodes.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name } end.uniq{ |_| [_[hh], _[ww], _[tt], _[ll]] }
         
     | 
| 
      
 189 
     | 
    
         
            +
                logger.info "good and unique: #{nodes.size}"   # only those that might be containers
         
     | 
| 
      
 190 
     | 
    
         
            +
                File.write "#{dump}.nodes.htm", nodes.extend(Dumpable).dump if dump
         
     | 
| 
       173 
191 
     | 
    
         | 
| 
       174 
192 
     | 
    
         
             
                interfere = lambda do |a, b|
         
     | 
| 
       175 
193 
     | 
    
         
             
                  a[tt] < b[tt] + b[hh] &&
         
     | 
| 
       176 
194 
     | 
    
         
             
                  b[tt] < a[tt] + a[hh]
         
     | 
| 
       177 
195 
     | 
    
         
             
                end
         
     | 
| 
       178 
196 
     | 
    
         | 
| 
       179 
     | 
    
         
            -
             
     | 
| 
       180 
     | 
    
         
            -
             
     | 
| 
      
 197 
     | 
    
         
            +
             
     | 
| 
      
 198 
     | 
    
         
            +
                rest = nodes.select.with_index do |a, i|
         
     | 
| 
      
 199 
     | 
    
         
            +
                  nodes.each_with_index.none? do |b, j|
         
     | 
| 
       181 
200 
     | 
    
         
             
                    next if i == j
         
     | 
| 
       182 
201 
     | 
    
         
             
                    a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
         
     | 
| 
       183 
     | 
    
         
            -
                     
     | 
| 
      
 202 
     | 
    
         
            +
                    a[ll] >= b[ll] && a[ll] + a[ww] <= b[ll] + b[ww] &&
         
     | 
| 
      
 203 
     | 
    
         
            +
                    nodes.all?{ |c| interfere[a, c] == interfere[b, c] }
         
     | 
| 
       184 
204 
     | 
    
         
             
                  end
         
     | 
| 
       185 
205 
     | 
    
         
             
                end
         
     | 
| 
       186 
206 
     | 
    
         
             
                logger.info "not nested: #{rest.size}"
         
     | 
| 
       187 
     | 
    
         
            -
                # rest  
     | 
| 
      
 207 
     | 
    
         
            +
                File.write "#{dump}.rest1.htm", rest.extend(Dumpable).dump if dump
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
                # 8 = max_results - 1, 3 = (from row size diff heuristic)
         
     | 
| 
      
 210 
     | 
    
         
            +
                if try_min
         
     | 
| 
      
 211 
     | 
    
         
            +
                  rest = rest.reject{ |_| _[hh] + _[hh]/3*(try_min - 1) > (rest.map{ |_| _[tt] + _[hh] }.max - rest.map(&tt).min) }
         
     | 
| 
      
 212 
     | 
    
         
            +
                  logger.info "small enough: #{rest.size}"
         
     | 
| 
      
 213 
     | 
    
         
            +
                end
         
     | 
| 
      
 214 
     | 
    
         
            +
                File.write "#{dump}.rest2.htm", rest.extend(Dumpable).dump if dump
         
     | 
| 
       188 
215 
     | 
    
         | 
| 
       189 
     | 
    
         
            -
                 
     | 
| 
       190 
     | 
    
         
            -
                 
     | 
| 
       191 
     | 
    
         
            -
                 
     | 
| 
      
 216 
     | 
    
         
            +
                rest.select! &filter
         
     | 
| 
      
 217 
     | 
    
         
            +
                logger.info "filtered: #{rest.size}"
         
     | 
| 
      
 218 
     | 
    
         
            +
                File.write "#{dump}.filtered.htm", rest.extend(Dumpable).dump if dump
         
     | 
| 
      
 219 
     | 
    
         
            +
             
     | 
| 
      
 220 
     | 
    
         
            +
                rest.sort_by!(&:area).reverse!
         
     | 
| 
      
 221 
     | 
    
         
            +
                File.write "#{dump}.sorted.htm", rest.extend(Dumpable).dump if dump
         
     | 
| 
       192 
222 
     | 
    
         | 
| 
       193 
223 
     | 
    
         
             
                require "pcbr"
         
     | 
| 
       194 
224 
     | 
    
         
             
                pcbr = PCBR.new
         
     | 
| 
       195 
225 
     | 
    
         
             
                is = []
         
     | 
| 
       196 
     | 
    
         
            -
                max, past = 0,  
     | 
| 
      
 226 
     | 
    
         
            +
                max, past = 0, Set.new
         
     | 
| 
       197 
227 
     | 
    
         
             
                prev = nil
         
     | 
| 
       198 
228 
     | 
    
         
             
                time = Time.now
         
     | 
| 
       199 
229 
     | 
    
         
             
                loop do
         
     | 
| 
       200 
     | 
    
         
            -
                  rest. 
     | 
| 
       201 
     | 
    
         
            -
                     
     | 
| 
      
 230 
     | 
    
         
            +
                  si = (0...rest.size).reject do |i|
         
     | 
| 
      
 231 
     | 
    
         
            +
                    # I don't shrink pcbr so this should be a safe optimization
         
     | 
| 
      
 232 
     | 
    
         
            +
                    next true if is.last > i unless is.empty?
         
     | 
| 
      
 233 
     | 
    
         
            +
                    # also we've sorted from large to small so it does not get stuck with the half of the page below the largest node
         
     | 
| 
      
 234 
     | 
    
         
            +
             
     | 
| 
      
 235 
     | 
    
         
            +
                    next (logger.debug [i, 2]; true) if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
         
     | 
| 
      
 236 
     | 
    
         
            +
                    next (logger.debug [i, 3]; true) if is.any?{ |j| rest[i][ww] > rest[j][ww] * 2 } if heuristics.include? :WIDTH
         
     | 
| 
      
 237 
     | 
    
         
            +
                    next (logger.debug [i, 4]; true) if is.any?{ |j| rest[j][ww] > rest[i][ww] * 2 } if heuristics.include? :WIDTH
         
     | 
| 
      
 238 
     | 
    
         
            +
                    next (logger.debug [i, 5]; true) if is.any?{ |j| rest[i][hh] > rest[j][hh] * 3 }
         
     | 
| 
      
 239 
     | 
    
         
            +
                    next (logger.debug [i, 6]; true) if is.any?{ |j| rest[j][hh] > rest[i][hh] * 3 }
         
     | 
| 
      
 240 
     | 
    
         
            +
                  end
         
     | 
| 
      
 241 
     | 
    
         
            +
                  logger.debug [is, si]
         
     | 
| 
      
 242 
     | 
    
         
            +
                  si.each do |i|
         
     | 
| 
       202 
243 
     | 
    
         
             
                    sol = rest.values_at *is, i
         
     | 
| 
      
 244 
     | 
    
         
            +
                    unless pcbr.set.include? [*is, i].sort
         
     | 
| 
      
 245 
     | 
    
         
            +
                    logger.debug [is, i, sol.map(&:area).reduce(:+)]
         
     | 
| 
       203 
246 
     | 
    
         
             
                    pcbr.store [*is, i].sort, [
         
     | 
| 
       204 
247 
     | 
    
         
             
                      *( is.size                                                                                                                if heuristics.include? :SIZE   ),
         
     | 
| 
       205 
     | 
    
         
            -
                      *( sol.map(&:area). 
     | 
| 
       206 
     | 
    
         
            -
                       
     | 
| 
       207 
     | 
    
         
            -
                      *( -sol.product(sol).map{ |s1, s2| (s1.height             - s2.height            ).abs }. 
     | 
| 
       208 
     | 
    
         
            -
                      *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }. 
     | 
| 
       209 
     | 
    
         
            -
                    ] 
     | 
| 
      
 248 
     | 
    
         
            +
                      *( sol.map(&:area).reduce(:+)                                                                                             if heuristics.include? :AREA   ),
         
     | 
| 
      
 249 
     | 
    
         
            +
                      # https://en.wikipedia.org/wiki/Mean_absolute_difference
         
     | 
| 
      
 250 
     | 
    
         
            +
                      *( -sol.product(sol).map{ |s1, s2| (s1.height             - s2.height            ).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
         
     | 
| 
      
 251 
     | 
    
         
            +
                      *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.reduce(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
         
     | 
| 
      
 252 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 253 
     | 
    
         
            +
                      logger.debug "pcbr.table.size: #{pcbr.table.size}"
         
     | 
| 
      
 254 
     | 
    
         
            +
                      if si.none? do |j|
         
     | 
| 
      
 255 
     | 
    
         
            +
                        next if j <= i
         
     | 
| 
      
 256 
     | 
    
         
            +
                        next true if interfere[rest[i], rest[j]]
         
     | 
| 
      
 257 
     | 
    
         
            +
                        next true if rest[i][ww] > rest[j][ww] * 2 if heuristics.include? :WIDTH
         
     | 
| 
      
 258 
     | 
    
         
            +
                        next true if rest[j][ww] > rest[i][ww] * 2 if heuristics.include? :WIDTH
         
     | 
| 
      
 259 
     | 
    
         
            +
                        next true if rest[i][hh] > rest[j][hh] * 3
         
     | 
| 
      
 260 
     | 
    
         
            +
                        next true if rest[j][hh] > rest[i][hh] * 3
         
     | 
| 
      
 261 
     | 
    
         
            +
                      end
         
     | 
| 
      
 262 
     | 
    
         
            +
                        logger.debug "forced"
         
     | 
| 
      
 263 
     | 
    
         
            +
                        break
         
     | 
| 
      
 264 
     | 
    
         
            +
                      end
         
     | 
| 
      
 265 
     | 
    
         
            +
                    end
         
     | 
| 
       210 
266 
     | 
    
         
             
                  end
         
     | 
| 
       211 
     | 
    
         
            -
                  if prev && Time.now - time >  
     | 
| 
       212 
     | 
    
         
            -
                     
     | 
| 
       213 
     | 
    
         
            -
                    break  
     | 
| 
      
 267 
     | 
    
         
            +
                  if prev && Time.now - time > 5
         
     | 
| 
      
 268 
     | 
    
         
            +
                    logger.debug "check"
         
     | 
| 
      
 269 
     | 
    
         
            +
                    break logger.info "break 0" if Time.now - time > 30
         
     | 
| 
      
 270 
     | 
    
         
            +
                    break logger.info "break 1" if Time.now - prev > 10
         
     | 
| 
      
 271 
     | 
    
         
            +
                    m = pcbr.table.reject{ |i| i.first.size < 2 }.map(&:last).max
         
     | 
| 
      
 272 
     | 
    
         
            +
                    break logger.info "break 2" if Time.now - prev > (prev - time) && 1 == pcbr.table.count{ |i| i.last == m }
         
     | 
| 
       214 
273 
     | 
    
         
             
                  end
         
     | 
| 
       215 
     | 
    
         
            -
                  break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }. 
     | 
| 
      
 274 
     | 
    
         
            +
                  break logger.info "done" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
         
     | 
| 
      
 275 
     | 
    
         
            +
                  logger.debug "next: #{t}"
         
     | 
| 
      
 276 
     | 
    
         
            +
                  past.add (is = t.first).map{ |i| 2**i }.reduce(:+)
         
     | 
| 
       216 
277 
     | 
    
         
             
                  if t.last > max
         
     | 
| 
       217 
278 
     | 
    
         
             
                    prev, max = Time.now, t.last
         
     | 
| 
      
 279 
     | 
    
         
            +
                    logger.debug "new max: #{max}"
         
     | 
| 
       218 
280 
     | 
    
         
             
                    logger.debug [Time.now - time, max, t.first]
         
     | 
| 
       219 
281 
     | 
    
         
             
                  end
         
     | 
| 
       220 
     | 
    
         
            -
                  past.push (is = t.first).map{ |i| 2**i }.inject(:+)
         
     | 
| 
       221 
282 
     | 
    
         
             
                end
         
     | 
| 
       222 
283 
     | 
    
         
             
                # TODO: if multiple with max score, take the max by area
         
     | 
| 
       223 
     | 
    
         
            -
                unless best = pcbr.table.reject{ |is,| is.size  
     | 
| 
       224 
     | 
    
         
            -
                  raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all,  
     | 
| 
      
 284 
     | 
    
         
            +
                unless best = pcbr.table.reject{ |is,| is.size < 2 }.max_by(&:last)
         
     | 
| 
      
 285 
     | 
    
         
            +
                  raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, nodes: nodes, rest: rest
         
     | 
| 
       225 
286 
     | 
    
         
             
                end
         
     | 
| 
       226 
     | 
    
         
            -
                 
     | 
| 
      
 287 
     | 
    
         
            +
                pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
         
     | 
| 
      
 288 
     | 
    
         
            +
                logger.info best
         
     | 
| 
      
 289 
     | 
    
         
            +
                logger.info "splitted in #{best.first.size}"
         
     | 
| 
      
 290 
     | 
    
         
            +
                rest.values_at(*best.first).sort_by(&tt).extend Dumpable
         
     | 
| 
       227 
291 
     | 
    
         
             
              end
         
     | 
| 
       228 
292 
     | 
    
         | 
| 
       229 
     | 
    
         
            -
              def rows  
     | 
| 
       230 
     | 
    
         
            -
                 
     | 
| 
       231 
     | 
    
         
            -
                split heuristics, :height, :width, :top, :left
         
     | 
| 
      
 293 
     | 
    
         
            +
              # Splits along the vertical axis by forwarding to +split+.
              #
              # The accessor names are passed in the order :height, :width, :top, :left,
              # followed by +heuristics+, +try_min+ and +dump+; the given block is
              # forwarded unchanged.
              # NOTE(review): the meaning of each positional argument is defined by
              # +split+, whose signature is not visible here -- confirm there.
              def rows(heuristics, try_min: nil, dump: nil, &b)
                split(:height, :width, :top, :left, heuristics, try_min, dump, &b)
              end
         
     | 
| 
       233 
     | 
    
         
            -
              def cols  
     | 
| 
       234 
     | 
    
         
            -
                 
     | 
| 
       235 
     | 
    
         
            -
                split heuristics, :width, :height, :left, :top
         
     | 
| 
      
 296 
     | 
    
         
            +
              # Splits along the horizontal axis by forwarding to +split+.
              #
              # Mirror image of +rows+: the accessor names are passed in the order
              # :width, :height, :left, :top, followed by +heuristics+, +try_min+ and
              # +dump+; the given block is forwarded unchanged.
              # NOTE(review): the meaning of each positional argument is defined by
              # +split+, whose signature is not visible here -- confirm there.
              def cols(heuristics, try_min: nil, dump: nil, &b)
                split(:width, :height, :left, :top, heuristics, try_min, dump, &b)
              end
         
     | 
| 
       237 
299 
     | 
    
         | 
| 
      
 300 
     | 
    
         
            +
              # Groups one-dimensional segments into "piles" of chained overlapping segments.
              #
              # +z+ is an array of [start, length] pairs. The segments are visited in
              # ascending order (ties are broken by the original index). A segment joins
              # the current pile when it starts strictly before the furthest end reached
              # by that pile so far; otherwise it opens a new pile. Segments that merely
              # touch (start == previous end) therefore land in different piles.
              #
              # Returns an array of piles, each one an array of indices into +z+.
              # For an empty +z+ the result is a single empty pile: [[]].
              def self.piles z
                reach = nil   # furthest end coordinate seen within the current pile
                groups = [[]]
                z.each_with_index.sort.each do |segment, index|
                  start = segment[0]
                  finish = start + segment[1]
                  if reach.nil? || reach > start
                    groups.last.push index
                    reach = finish if reach.nil? || reach < finish
                  else
                    groups.push [index]
                    reach = finish
                  end
                end
                groups
              end
         
     | 
| 
      
 314 
     | 
    
         
            +
             
     | 
| 
      
 315 
     | 
    
         
            +
              # Mixin that regroups a collection of nodes into rows or columns.
              # The receiver must respond to +map+ and +values_at+, and its elements
              # to +top+/+height+ (for rows) or +left+/+width+ (for cols).
              module Gridable
                # Piles the nodes by their vertical extent ([top, height]) and returns
                # one Dumpable-extended array of nodes per pile.
                def rows
                  owner = Module.nesting[1]   # the module lexically enclosing Gridable
                  spans = map{ |n| [n.top, n.height] }
                  owner.piles(spans).map do |indices|
                    values_at(*indices).extend owner::Dumpable
                  end
                end
                # Piles the nodes by their horizontal extent ([left, width]) and returns
                # one Dumpable-extended array of nodes per pile.
                def cols
                  owner = Module.nesting[1]   # the module lexically enclosing Gridable
                  spans = map{ |n| [n.left, n.width] }
                  owner.piles(spans).map do |indices|
                    values_at(*indices).extend owner::Dumpable
                  end
                end
              end
         
     | 
| 
      
 323 
     | 
    
         
            +
             
     | 
| 
      
 324 
     | 
    
         
            +
              # Searches the recognized nodes for the subset that best forms a grid.
              #
              # Steps, in order:
              # 1. take all nodes from +recognize+ and append +midx+/+midy+ fields;
              # 2. keep only nodes lying fully inside this element's bounding client rect;
              # 3. drop nodes with the listed tag names and nodes with duplicate geometry;
              # 4. drop nodes nested in another node that interferes with the same nodes;
              # 5. repeatedly drop nodes that lack both a non-interfering neighbour with
              #    >0.9 horizontal overlap and one with >0.9 vertical overlap;
              # 6. grow candidate index sets one node at a time, scoring each with PCBR,
              #    always continuing from the best not-yet-expanded candidate, until the
              #    time limits hit or no candidate is left.
              #
              # dump:: optional path prefix; when given, intermediate node sets are
              #        written to "<dump>.<stage>.htm" files.
              #
              # Returns the nodes of the best scored candidate, extended with Dumpable
              # and Gridable. Raises ErrorNotEnoughNodes when no node lies inside the rect.
              def grid dump = nil
                logger = Module.nesting.first.logger

                all = recognize
                logger.info "all nodes: #{all.size}"
                File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump

                # adding the fields for faster upcoming computations
                struct = Struct.new *all.first.members, :midx, :midy
                # midx/midy are the centre of the node: origin plus half of the extent
                all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top + i.height / 2.0 }
                all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse

                rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
                inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
                raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
                logger.info "inside nodes: #{inside.size}"
                File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
                good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
                logger.info "good and unique: #{good.size}"   # only those that might be containers
                File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump

                # large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
                # logger.info "large enough: #{large.size}"

                # true when the two rectangles overlap on both axes
                interfere = lambda do |a, b|
                  a.top < b.top + b.height &&
                  b.top < a.top + a.height &&
                  a.left < b.left + b.width &&
                  b.left < a.left + a.width
                end

                # drop a node when some other node contains it and both interfere with exactly the same nodes
                rest = good.select.with_index do |a, i|
                  good.each_with_index.none? do |b, j|
                    next if i == j
                    a.top >= b.top && a.top + a.height <= b.top + b.height &&
                    a.left >= b.left && a.left + a.width <= b.left + b.width &&
                    good.all?{ |c| interfere[a, c] == interfere[b, c] }
                  end
                end
                logger.info "not nested: #{rest.size}"
                File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
                # keep only nodes that have a non-interfering neighbour well aligned horizontally (cw)
                # and one well aligned vertically (ch); repeat until the set stops shrinking
                begin
                  prev = rest.size
                  rest.select!.with_index do |a, i|
                    rest.each_with_index.any? do |b, j|
                      cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
                      i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
                    end and
                    rest.each_with_index.any? do |b, j|
                      ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
                      i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
                    end
                  end
                end until prev == rest.size
                logger.info "gridable: #{rest.size}"
                File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump

                require "pcbr"
                pcbr = PCBR.new
                # past holds the keys (sum of 2**index) of the candidates that were already expanded
                max, past = 0, []
                prev = nil       # time when the current best score was found
                prev_max = nil   # index set that produced the current best score
                time = Time.now
                heuristics = %i{ SIZE AREA }
                # overlap of segments [a1, a1 + a2] and [b1, b1 + b2] as a product of the fractions of each segment covered
                inter = lambda do |a1, a2, b1, b2|
                  c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
                  [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
                end
                # expands the candidate +is+ (ascending indices into rest) by every larger index,
                # then recurses into the best candidate that was not expanded yet
                lp = lambda do |is|
                  past.push is.map{ |i| 2**i }.reduce(:+)
                  rest.size.times do |ij|
                    next if ij <= is.last unless is.empty?
                    sorted = is + [ij]
                    next if pcbr.set.include? sorted
                    next if is.any?{ |j| interfere[rest[ij], rest[j]] }
                    sol = rest.values_at *sorted
                    xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
                    yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
                    # with 4+ nodes, no two of them may share both the column pile and the row pile
                    next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
                    # score: total area, then the mean pairwise overlap inside column piles and inside row piles
                    pcbr.store sorted, [
                      *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
                      xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
                      yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
                    ]
                    # time-based stop conditions, checked only once a best score exists and 3 seconds have passed
                    if prev && Time.now - time > 3
                      logger.debug "check"
                      break logger.info "break 0" if Time.now - time > 30
                      break logger.info "break 1" if Time.now - prev > 10
                      m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
                      break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
                    end

                    break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
                    logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
                    if t.last > max && t.first != prev_max
                      prev, max, prev_max = Time.now, t.last, t.first
                      logger.debug [pcbr.table.size, max, t.first]
                    end
                    lp.call t.first
                  end
                end
                lp.call []
                # TODO: if multiple with max score, take the max by area
                pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
                rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
              end
         
     | 
| 
       238 
430 
     | 
    
         | 
| 
       239 
431 
     | 
    
         
             
            end
         
     | 
| 
       240 
432 
     | 
    
         | 
    
        data/pagerecognizer.gemspec
    CHANGED
    
    | 
         @@ -1,6 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            Gem::Specification.new do |spec|
         
     | 
| 
       2 
2 
     | 
    
         
             
              spec.name         = "pagerecognizer"
         
     | 
| 
       3 
     | 
    
         
            -
              spec.version      = "0.0.1"
     | 
| 
      
 3 
     | 
    
         
            +
              spec.version      = "0.1.0"
         
     | 
| 
       4 
4 
     | 
    
         
             
              spec.summary      = "visual HTML page structure recognizer"
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
              spec.author       = "Victor Maslov aka Nakilon"
         
     | 
| 
         @@ -8,16 +8,15 @@ Gem::Specification.new do |spec| 
     | 
|
| 
       8 
8 
     | 
    
         
             
              spec.license      = "MIT"
         
     | 
| 
       9 
9 
     | 
    
         
             
              spec.metadata     = {"source_code_uri" => "https://github.com/nakilon/pagerecognizer"}
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
     | 
    
         
            -
              spec.add_dependency "nokogiri"
         
     | 
| 
       12 
     | 
    
         
            -
              spec.add_dependency "pcbr"
         
     | 
| 
       13 
11 
     | 
    
         
             
              spec.add_dependency "ferrum"
         
     | 
| 
      
 12 
     | 
    
         
            +
              spec.add_dependency "nokogiri"
         
     | 
| 
      
 13 
     | 
    
         
            +
              spec.add_dependency "pcbr", "~>0.4.2"
         
     | 
| 
       14 
14 
     | 
    
         
             
              spec.add_development_dependency "minitest"
         
     | 
| 
       15 
15 
     | 
    
         | 
| 
       16 
16 
     | 
    
         
             
              spec.add_development_dependency "ruby-prof"
         
     | 
| 
       17 
17 
     | 
    
         
             
              spec.add_development_dependency "byebug"
         
     | 
| 
       18 
18 
     | 
    
         
             
              spec.add_development_dependency "mll"
         
     | 
| 
       19 
19 
     | 
    
         | 
| 
       20 
     | 
    
         
            -
              spec.require_path = "lib"
         
     | 
| 
       21 
20 
     | 
    
         
             
              spec.test_file    = "test.rb"
         
     | 
| 
       22 
21 
     | 
    
         
             
              spec.files        = %w{ LICENSE pagerecognizer.gemspec lib/pagerecognizer.rb }
         
     | 
| 
       23 
22 
     | 
    
         
             
            end
         
     | 
    
        data/test.rb
    CHANGED
    
    | 
         @@ -1,28 +1,72 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require "minitest/autorun"
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
       2 
3 
     | 
    
         
             
            require "ferrum"
         
     | 
| 
       3 
4 
     | 
    
         
             
            require_relative "lib/pagerecognizer"
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
      
 5 
     | 
    
         
            +
            PageRecognizer.logger.level = :INFO
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         
             
            describe PageRecognizer do
         
     | 
| 
       7 
     | 
    
         
            -
               
     | 
| 
       8 
     | 
    
         
            -
                 
     | 
| 
       9 
     | 
    
         
            -
                 
     | 
| 
       10 
     | 
    
         
            -
                 
     | 
| 
       11 
     | 
    
         
            -
                 
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
                  [" 
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
      
 8 
     | 
    
         
            +
              before do
         
     | 
| 
      
 9 
     | 
    
         
            +
                options = {}
         
     | 
| 
      
 10 
     | 
    
         
            +
                options[:browser_options] = {"no-sandbox": nil} if ENV.has_key? "FERRUM_NO_SANDBOX"
         
     | 
| 
      
 11 
     | 
    
         
            +
                options[:headless] = false if ENV.has_key? "HEADFULL"
         
     | 
| 
      
 12 
     | 
    
         
            +
                @browser = Ferrum::Browser.new **options
         
     | 
| 
      
 13 
     | 
    
         
            +
              end
         
     | 
| 
      
 14 
     | 
    
         
            +
              after do
         
     | 
| 
      
 15 
     | 
    
         
            +
                @browser&.quit
         
     | 
| 
      
 16 
     | 
    
         
            +
              end
         
     | 
| 
      
 17 
     | 
    
         
            +
              [
         
     | 
| 
      
 18 
     | 
    
         
            +
                  ["google1.htm", [
         
     | 
| 
      
 19 
     | 
    
         
            +
                    ["https://ru.wikipedia.org/wiki/Ruby#:~:te", "Ruby — Википедия"],
         
     | 
| 
      
 20 
     | 
    
         
            +
                    ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
         
     | 
| 
      
 21 
     | 
    
         
            +
                    ["https://ru.wikibooks.org/wiki/Ruby", "Ruby — Викиучебник"],
         
     | 
| 
      
 22 
     | 
    
         
            +
                    ["https://habr.com/ru/post/433672/", "Пацаны, так Ruby умер или нет? / Хабр - Habr"],
         
     | 
| 
      
 23 
     | 
    
         
            +
                    ["https://habr.com/ru/hub/ruby/", "Ruby – Динамический высокоуровневый язык..."],
         
     | 
| 
      
 24 
     | 
    
         
            +
                    ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
         
     | 
| 
      
 25 
     | 
    
         
            +
                    ["http://rusrails.ru/", "Rusrails: Ruby on Rails по-русски"],
         
     | 
| 
      
 26 
     | 
    
         
            +
                    ["https://vc.ru/dev/72391-pochemu-my-vybir", "Почему мы выбираем Ruby для наших проектов..."],
         
     | 
| 
      
 27 
     | 
    
         
            +
                    ["https://tproger.ru/tag/ruby/", "Ruby — всё по этой теме для программистов..."],
         
     | 
| 
      
 28 
     | 
    
         
            +
                    ["https://rubyrussia.club/", "RubyRussia"],
         
     | 
| 
      
 29 
     | 
    
         
            +
                  ] ],
         
     | 
| 
      
 30 
     | 
    
         
            +
                  ["google2.mht", [
         
     | 
| 
      
 31 
     | 
    
         
            +
                    ["https://www.ruby-lang.org/ru/", "Язык программирования Ruby"],
         
     | 
| 
      
 32 
     | 
    
         
            +
                    ["https://ru.wikipedia.org/wiki/Ruby", "Ruby - Википедия"],
         
     | 
| 
      
 33 
     | 
    
         
            +
                    ["https://evrone.ru/why-ruby", "5 причин, почему мы выбираем Ruby - evrone.ru"],
         
     | 
| 
      
 34 
     | 
    
         
            +
                    ["https://habr.com/ru/hub/ruby/", "Ruby — Динамический высокоуровневый язык..."],
         
     | 
| 
      
 35 
     | 
    
         
            +
                    ["https://ru.wikibooks.org/wiki/Ruby", "Ruby - Викиучебник"],
         
     | 
| 
      
 36 
     | 
    
         
            +
                    ["https://context.reverso.net/%D0%BF%D0%B5", "ruby - Перевод на русский - примеры английский..."],
         
     | 
| 
      
 37 
     | 
    
         
            +
                    ["https://web-creator.ru/articles/ruby", "Язык программирования Ruby - Веб Креатор"],
         
     | 
| 
      
 38 
     | 
    
         
            +
                    ["https://ru.hexlet.io/courses/ruby", "Введение в Ruby - Хекслет"],
         
     | 
| 
      
 39 
     | 
    
         
            +
                    ["https://rubyrush.ru/articles/what-is-rub", "Что такое Ruby on Rails?"],
         
     | 
| 
      
 40 
     | 
    
         
            +
                  ] ],
         
     | 
| 
      
 41 
     | 
    
         
            +
              ].each do |filename, expectation|
         
     | 
| 
      
 42 
     | 
    
         
            +
                it "google rows #{filename}" do
         
     | 
| 
      
 43 
     | 
    
         
            +
                @browser.goto "file://#{File.expand_path filename}"
         
     | 
| 
      
 44 
     | 
    
         
            +
                results = @browser.at_css("body").rows([:AREA, :SIZE], try_min: 9) do |node|
         
     | 
| 
      
 45 
     | 
    
         
            +
                  texts = node.texts
         
     | 
| 
      
 46 
     | 
    
         
            +
                  next if texts.none?{ |_, _, color, | :black == color }
         
     | 
| 
      
 47 
     | 
    
         
            +
                  _, group = texts.group_by{ |_, style, | style["fontSize"].to_i }.to_a.max_by(&:first)
         
     | 
| 
      
 48 
     | 
    
         
            +
                  next unless group
         
     | 
| 
      
 49 
     | 
    
         
            +
                  next unless group.size == 1 && %i{ blue navy }.include?(group[0][2])
         
     | 
| 
      
 50 
     | 
    
         
            +
                  true
         
     | 
| 
      
 51 
     | 
    
         
            +
                end
         
     | 
| 
      
 52 
     | 
    
         
            +
                assert_equal expectation, results.reject{ |_| _.node.at_css "img" }.map{ |result| [
         
     | 
| 
      
 53 
     | 
    
         
            +
                  result.node.at_css("a").property("href")[0,40],
         
     | 
| 
      
 54 
     | 
    
         
            +
                  result.texts.max_by{ |_, style, | style["fontStyle"].to_i }[0].sub(/(.{40}) .+/, "\\1..."),
         
     | 
| 
       26 
55 
     | 
    
         
             
                ] }
         
     | 
| 
      
 56 
     | 
    
         
            +
                end
         
     | 
| 
      
 57 
     | 
    
         
            +
              end
         
     | 
| 
      
 58 
     | 
    
         
            +
              [
         
     | 
| 
      
 59 
     | 
    
         
            +
                  ["youtube.htm", %w{ Главная В\ тренде Подписки Библиотека История }, 8],
         
     | 
| 
      
 60 
     | 
    
         
            +
                  ["youtube2.mht", %w{ Главная Навигатор Shorts Подписки Библиотека История }, 10],
         
     | 
| 
      
 61 
     | 
    
         
            +
              ].each do |filename, expected_navigation, rows|
         
     | 
| 
      
 62 
     | 
    
         
            +
                it "youtube rows grid #{filename}" do
         
     | 
| 
      
 63 
     | 
    
         
            +
                  @browser.goto "file://#{File.expand_path filename}"
         
     | 
| 
      
 64 
     | 
    
         
            +
                  assert_equal expected_navigation, @browser.at_css("ytd-mini-guide-renderer").rows([:AREA, :SIZE]){ |_| !_.node.text.strip.empty? }.map{ |nav| nav.texts.first[0] }
         
     | 
| 
      
 65 
     | 
    
         
            +
                  grid = @browser.at_css("#content").grid
         
     | 
| 
      
 66 
     | 
    
         
            +
                  assert_equal 3*rows, grid.size
         
     | 
| 
      
 67 
     | 
    
         
            +
                  assert_equal [3]*rows, grid.rows.map(&:size)
         
     | 
| 
      
 68 
     | 
    
         
            +
                  assert_equal [rows]*3, grid.cols.map(&:size)
         
     | 
| 
      
 69 
     | 
    
         
            +
                  grid.each{ |n| n.to_h.values_at(:width, :height).each{ |_| assert_in_delta 250, _, 50 } }
         
     | 
| 
      
 70 
     | 
    
         
            +
                end
         
     | 
| 
       27 
71 
     | 
    
         
             
              end
         
     | 
| 
       28 
72 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,17 +1,17 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: pagerecognizer
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.1
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Victor Maslov aka Nakilon
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2022-05-06 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
     | 
    
         
            -
              name: nokogiri
     | 
| 
      
 14 
     | 
    
         
            +
              name: ferrum
         
     | 
| 
       15 
15 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       16 
16 
     | 
    
         
             
                requirements:
         
     | 
| 
       17 
17 
     | 
    
         
             
                - - ">="
         
     | 
| 
         @@ -25,7 +25,7 @@ dependencies: 
     | 
|
| 
       25 
25 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       26 
26 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       27 
27 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       28 
     | 
    
         
            -
              name: pcbr
     | 
| 
      
 28 
     | 
    
         
            +
              name: nokogiri
         
     | 
| 
       29 
29 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       30 
30 
     | 
    
         
             
                requirements:
         
     | 
| 
       31 
31 
     | 
    
         
             
                - - ">="
         
     | 
| 
         @@ -39,19 +39,19 @@ dependencies: 
     | 
|
| 
       39 
39 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       40 
40 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       41 
41 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       42 
     | 
    
         
            -
              name: ferrum
     | 
| 
      
 42 
     | 
    
         
            +
              name: pcbr
         
     | 
| 
       43 
43 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       44 
44 
     | 
    
         
             
                requirements:
         
     | 
| 
       45 
     | 
    
         
            -
                - - ">="
     | 
| 
      
 45 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
       46 
46 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       47 
     | 
    
         
            -
                    version: '0'
     | 
| 
      
 47 
     | 
    
         
            +
                    version: 0.4.2
         
     | 
| 
       48 
48 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       49 
49 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       50 
50 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       51 
51 
     | 
    
         
             
                requirements:
         
     | 
| 
       52 
     | 
    
         
            -
                - - ">="
     | 
| 
      
 52 
     | 
    
         
            +
                - - "~>"
         
     | 
| 
       53 
53 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       54 
     | 
    
         
            -
                    version: '0'
     | 
| 
      
 54 
     | 
    
         
            +
                    version: 0.4.2
         
     | 
| 
       55 
55 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       56 
56 
     | 
    
         
             
              name: minitest
         
     | 
| 
       57 
57 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     |