formscraper_helper 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - checksums.yaml.gz.sig +0 -0
 - data/lib/formscraper_helper.rb +74 -31
 - data.tar.gz.sig +0 -0
 - metadata +3 -3
 - metadata.gz.sig +0 -0
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 402c3ab5b633cce3e4852deba197a388a30d267e81595829cbc05deb6509f07e
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: c3867d152abfa6d910efd7f24b119c413c8ca8368229290583704bba1018108d
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: a3c5c2217823d8734d7e273a07c166012191f0196a3c039cbb7bb55a0ee2ab3593295b9e985c049ebcfdddaa768696091fa49352cb8af3c7782ac56a25d86dd2
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 15b49ce36dd5517099e04436153f3e63c1f9138a383ddb22033e4896eb9b0203c32bcca28ec18ab1883bc452a175182e21700cbca4cd812cdb618ad0b571abcd
         
     | 
    
        checksums.yaml.gz.sig
    CHANGED
    
    | 
         Binary file 
     | 
    
        data/lib/formscraper_helper.rb
    CHANGED
    
    | 
         @@ -4,17 +4,21 @@ 
     | 
|
| 
       4 
4 
     | 
    
         | 
| 
       5 
5 
     | 
    
         
             
            require 'ferrum'
         
     | 
| 
       6 
6 
     | 
    
         
             
            require 'nokorexi'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'clipboard'
         
     | 
| 
       7 
8 
     | 
    
         | 
| 
       8 
9 
     | 
    
         | 
| 
       9 
10 
     | 
    
         
             
            class FormScraperHelper
         
     | 
| 
       10 
11 
     | 
    
         | 
| 
       11 
12 
     | 
    
         
             
              attr_reader :browser
         
     | 
| 
       12 
13 
     | 
    
         | 
| 
       13 
     | 
    
         
            -
               
     | 
| 
      
 14 
     | 
    
         
            +
              # note: fd corresponds to FakeDataGenerator22 which is optional
         
     | 
| 
      
 15 
     | 
    
         
            +
              #
         
     | 
| 
      
 16 
     | 
    
         
            +
              def initialize(url, headless: false, clipb: true, fd: nil, debug: false)
         
     | 
| 
       14 
17 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
                @url, @debug = url, debug
         
     | 
| 
      
 18 
     | 
    
         
            +
                @url, @clipb, @fd, @debug = url, clipb, fd, debug
         
     | 
| 
       16 
19 
     | 
    
         
             
                @browser = Ferrum::Browser.new  headless: headless
         
     | 
| 
       17 
20 
     | 
    
         
             
                @browser.goto(url)
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
       18 
22 
     | 
    
         
             
                sleep 2
         
     | 
| 
       19 
23 
     | 
    
         
             
                scrape()
         
     | 
| 
       20 
24 
     | 
    
         | 
| 
         @@ -40,6 +44,7 @@ doc = Nokorexi.new(browser.body).to_doc 
     | 
|
| 
       40 
44 
     | 
    
         | 
| 
       41 
45 
     | 
    
         
             
            # load the YAML document containing the inputs
         
     | 
| 
       42 
46 
     | 
    
         
             
            #filepath = ''
         
     | 
| 
      
 47 
     | 
    
         
            +
            filepath = '/tmp/tmp.yaml'
         
     | 
| 
       43 
48 
     | 
    
         
             
            h = YAML.load(File.read(filepath))
         
     | 
| 
       44 
49 
     | 
    
         
             
            EOF
         
     | 
| 
       45 
50 
     | 
    
         | 
| 
         @@ -51,29 +56,26 @@ EOF 
     | 
|
| 
       51 
56 
     | 
    
         | 
| 
       52 
57 
     | 
    
         
             
                  if h[:type] == 'text' or h[:type] == 'password' then
         
     | 
| 
       53 
58 
     | 
    
         | 
| 
       54 
     | 
    
         
            -
                    var1 =  
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
                    else 
         
     | 
| 
       57 
     | 
    
         
            -
                      key.downcase
         
     | 
| 
       58 
     | 
    
         
            -
                    end
         
     | 
| 
      
 59 
     | 
    
         
            +
                    var1, s2 = format_var1(h[:title], key)
         
     | 
| 
      
 60 
     | 
    
         
            +
                    s += s2
         
     | 
| 
       59 
61 
     | 
    
         
             
                    s += var1 + " = h['#{var1}']\n"
         
     | 
| 
       60 
     | 
    
         
            -
                    s += "r.focus.type #{var1}\n 
     | 
| 
      
 62 
     | 
    
         
            +
                    s += "r.focus.type #{var1}\n"
         
     | 
| 
      
 63 
     | 
    
         
            +
                    s += "sleep 0.5\n\n"
         
     | 
| 
       61 
64 
     | 
    
         | 
| 
       62 
65 
     | 
    
         
             
                  elsif h[:type] == 'select'
         
     | 
| 
       63 
66 
     | 
    
         | 
| 
       64 
     | 
    
         
            -
                    var1 =  
     | 
| 
       65 
     | 
    
         
            -
             
     | 
| 
       66 
     | 
    
         
            -
                    else 
         
     | 
| 
       67 
     | 
    
         
            -
                      key.downcase
         
     | 
| 
       68 
     | 
    
         
            -
                    end
         
     | 
| 
      
 67 
     | 
    
         
            +
                    var1, s2 = format_var1(h[:title], key)
         
     | 
| 
      
 68 
     | 
    
         
            +
                    s += s2
         
     | 
| 
       69 
69 
     | 
    
         | 
| 
       70 
70 
     | 
    
         
             
                    s += "# options: #{h[:options].join(', ')}\n"
         
     | 
| 
       71 
71 
     | 
    
         
             
                    s += "#{var1} = h['#{var1}']\n"
         
     | 
| 
       72 
     | 
    
         
            -
                    s += ' 
     | 
| 
       73 
     | 
    
         
            -
                    s +=  
     | 
| 
      
 72 
     | 
    
         
            +
                    s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
         
     | 
| 
      
 73 
     | 
    
         
            +
                    s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
         
     | 
| 
      
 74 
     | 
    
         
            +
                    s += "n = titles.index(found.first) + 1\n"
         
     | 
| 
       74 
75 
     | 
    
         
             
                    s += "r.focus\n"
         
     | 
| 
       75 
76 
     | 
    
         
             
                    s += "n.times { r.type(:down); sleep 1}\n"
         
     | 
| 
       76 
     | 
    
         
            -
                    s += "r.click\n 
     | 
| 
      
 77 
     | 
    
         
            +
                    s += "r.click\n"
         
     | 
| 
      
 78 
     | 
    
         
            +
                    s += "sleep 0.5\n\n"
         
     | 
| 
       77 
79 
     | 
    
         | 
| 
       78 
80 
     | 
    
         
             
                  elsif h[:type] == 'checkbox'
         
     | 
| 
       79 
81 
     | 
    
         
             
                    s += "r.focus.click\n\n"
         
     | 
| 
         @@ -81,6 +83,9 @@ EOF 
     | 
|
| 
       81 
83 
     | 
    
         | 
| 
       82 
84 
     | 
    
         
             
                end
         
     | 
| 
       83 
85 
     | 
    
         | 
| 
      
 86 
     | 
    
         
            +
                Clipboard.copy s if @clipb
         
     | 
| 
      
 87 
     | 
    
         
            +
                puts 'generated code copied to clipboard'
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
       84 
89 
     | 
    
         
             
                return s
         
     | 
| 
       85 
90 
     | 
    
         | 
| 
       86 
91 
     | 
    
         
             
              end
         
     | 
| 
         @@ -97,22 +102,30 @@ EOF 
     | 
|
| 
       97 
102 
     | 
    
         | 
| 
       98 
103 
     | 
    
         
             
                  if h[:type] == 'text' or h[:type] == 'password' then
         
     | 
| 
       99 
104 
     | 
    
         | 
| 
       100 
     | 
    
         
            -
                    var1 =  
     | 
| 
       101 
     | 
    
         
            -
             
     | 
| 
       102 
     | 
    
         
            -
                     
     | 
| 
       103 
     | 
    
         
            -
             
     | 
| 
      
 105 
     | 
    
         
            +
                    var1, s2 = format_var1(h[:title], key)
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
                    s += s2
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
                    if h[:type] == 'password' then
         
     | 
| 
      
 110 
     | 
    
         
            +
                      @pwd ||= @fd ? @fd.password : 'xxx'
         
     | 
| 
      
 111 
     | 
    
         
            +
                      s += var1 + ": #{@pwd}\n"
         
     | 
| 
      
 112 
     | 
    
         
            +
                    elsif @fd
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
                      found = @fd.lookup var1
         
     | 
| 
      
 115 
     | 
    
         
            +
                      val = found.is_a?(String) ? found : 'xxx'
         
     | 
| 
      
 116 
     | 
    
         
            +
                      s += var1 + ": #{val}\n"
         
     | 
| 
      
 117 
     | 
    
         
            +
                    else
         
     | 
| 
      
 118 
     | 
    
         
            +
                      s += var1 + ": xxx\n"
         
     | 
| 
       104 
119 
     | 
    
         
             
                    end
         
     | 
| 
       105 
     | 
    
         
            -
                    s += var1 + ": xxx\n"
         
     | 
| 
       106 
120 
     | 
    
         | 
| 
       107 
121 
     | 
    
         
             
                  elsif h[:type] == 'select'
         
     | 
| 
       108 
122 
     | 
    
         | 
| 
       109 
     | 
    
         
            -
                    var1 =  
     | 
| 
       110 
     | 
    
         
            -
                      h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
         
     | 
| 
       111 
     | 
    
         
            -
                    else 
         
     | 
| 
       112 
     | 
    
         
            -
                      key.downcase
         
     | 
| 
       113 
     | 
    
         
            -
                    end
         
     | 
| 
      
 123 
     | 
    
         
            +
                    var1, s2 = format_var1(h[:title], key)
         
     | 
| 
       114 
124 
     | 
    
         | 
| 
       115 
     | 
    
         
            -
                    s +=  
     | 
| 
      
 125 
     | 
    
         
            +
                    s += s2
         
     | 
| 
      
 126 
     | 
    
         
            +
                    s += "# options: #{h[:options].join(', ')}\n"
         
     | 
| 
      
 127 
     | 
    
         
            +
                    val = h[:options][1..-1].sample
         
     | 
| 
      
 128 
     | 
    
         
            +
                    s += "#{var1}: #{val}\n"
         
     | 
| 
       116 
129 
     | 
    
         | 
| 
       117 
130 
     | 
    
         
             
                  elsif h[:type] == 'checkbox'
         
     | 
| 
       118 
131 
     | 
    
         | 
| 
         @@ -120,20 +133,49 @@ EOF 
     | 
|
| 
       120 
133 
     | 
    
         | 
| 
       121 
134 
     | 
    
         
             
                end
         
     | 
| 
       122 
135 
     | 
    
         | 
| 
      
 136 
     | 
    
         
            +
                Clipboard.copy s if @clipb
         
     | 
| 
      
 137 
     | 
    
         
            +
                puts 'generated YAML copied to clipboard'
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
       123 
139 
     | 
    
         
             
                return s
         
     | 
| 
       124 
140 
     | 
    
         | 
| 
       125 
141 
     | 
    
         
             
              end
         
     | 
| 
       126 
142 
     | 
    
         | 
| 
       127 
143 
     | 
    
         
             
              private
         
     | 
| 
       128 
144 
     | 
    
         | 
| 
       129 
     | 
    
         
            -
               
     | 
| 
      
 145 
     | 
    
         
            +
              # returns var1 using arguments rawtitle or key
         
     | 
| 
      
 146 
     | 
    
         
            +
              # note: argument s is passed by reference
         
     | 
| 
      
 147 
     | 
    
         
            +
              #
         
     | 
| 
      
 148 
     | 
    
         
            +
              def format_var1(rawtitle, key)
         
     | 
| 
      
 149 
     | 
    
         
            +
             
     | 
| 
      
 150 
     | 
    
         
            +
                var1 = if rawtitle.length > 1 then
         
     | 
| 
      
 151 
     | 
    
         
            +
             
     | 
| 
      
 152 
     | 
    
         
            +
                  s = "\n# " + rawtitle + "\n"
         
     | 
| 
      
 153 
     | 
    
         
            +
                  title = rawtitle.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/,'')
         
     | 
| 
      
 154 
     | 
    
         
            +
                  words = title.downcase.scan(/\w+/)
         
     | 
| 
      
 155 
     | 
    
         
            +
             
     | 
| 
      
 156 
     | 
    
         
            +
                  if words.count > 2 then
         
     | 
| 
      
 157 
     | 
    
         
            +
                    words.take(5).map {|x| x[0]}.join
         
     | 
| 
      
 158 
     | 
    
         
            +
                  else
         
     | 
| 
      
 159 
     | 
    
         
            +
                    title.downcase.gsub(/ +/,'_')
         
     | 
| 
      
 160 
     | 
    
         
            +
                  end
         
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
      
 162 
     | 
    
         
            +
                else
         
     | 
| 
      
 163 
     | 
    
         
            +
                  newtitle = key.scan(/[A-Z][^A-Z]+/).join(' ')
         
     | 
| 
      
 164 
     | 
    
         
            +
                  s = "\n# " + newtitle + "\n"
         
     | 
| 
      
 165 
     | 
    
         
            +
                  newtitle.gsub(/[^\w ]/,'').downcase\
         
     | 
| 
      
 166 
     | 
    
         
            +
                      .gsub(/ +/,'_')
         
     | 
| 
      
 167 
     | 
    
         
            +
                end
         
     | 
| 
      
 168 
     | 
    
         
            +
             
     | 
| 
      
 169 
     | 
    
         
            +
                [var1, s]
         
     | 
| 
      
 170 
     | 
    
         
            +
             
     | 
| 
      
 171 
     | 
    
         
            +
              end
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
              def scrape()
         
     | 
| 
       130 
174 
     | 
    
         | 
| 
       131 
175 
     | 
    
         
             
                doc = Nokorexi.new(@browser.body).to_doc
         
     | 
| 
       132 
176 
     | 
    
         | 
| 
       133 
177 
     | 
    
         
             
                #a = doc.root.xpath('//input|//select')
         
     | 
| 
       134 
     | 
    
         
            -
                a = doc.root.xpath('//*').select  
     | 
| 
       135 
     | 
    
         
            -
                  x.name == 'input' or x.name == 'select'
         
     | 
| 
       136 
     | 
    
         
            -
                end
         
     | 
| 
      
 178 
     | 
    
         
            +
                a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
         
     | 
| 
       137 
179 
     | 
    
         
             
                a.reject! do |x|
         
     | 
| 
       138 
180 
     | 
    
         
             
                  x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
         
     | 
| 
       139 
181 
     | 
    
         
             
                end
         
     | 
| 
         @@ -160,3 +202,4 @@ EOF 
     | 
|
| 
       160 
202 
     | 
    
         | 
| 
       161 
203 
     | 
    
         | 
| 
       162 
204 
     | 
    
         
             
            end
         
     | 
| 
      
 205 
     | 
    
         
            +
             
     | 
    
        data.tar.gz.sig
    CHANGED
    
    | 
         Binary file 
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: formscraper_helper
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - James Robertson
         
     | 
| 
         @@ -35,7 +35,7 @@ cert_chain: 
     | 
|
| 
       35 
35 
     | 
    
         
             
              hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
         
     | 
| 
       36 
36 
     | 
    
         
             
              3v0q89HVKLBtQzj+Dii/vHeI
         
     | 
| 
       37 
37 
     | 
    
         
             
              -----END CERTIFICATE-----
         
     | 
| 
       38 
     | 
    
         
            -
            date: 2022-05- 
     | 
| 
      
 38 
     | 
    
         
            +
            date: 2022-05-28 00:00:00.000000000 Z
         
     | 
| 
       39 
39 
     | 
    
         
             
            dependencies:
         
     | 
| 
       40 
40 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       41 
41 
     | 
    
         
             
              name: ferrum
         
     | 
| 
         @@ -106,5 +106,5 @@ requirements: [] 
     | 
|
| 
       106 
106 
     | 
    
         
             
            rubygems_version: 3.2.22
         
     | 
| 
       107 
107 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       108 
108 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       109 
     | 
    
         
            -
            summary: Attempts to scrape the inputs required to  
     | 
| 
      
 109 
     | 
    
         
            +
            summary: Attempts to scrape the inputs required to complete a 1 page online form.
         
     | 
| 
       110 
110 
     | 
    
         
             
            test_files: []
         
     | 
    
        metadata.gz.sig
    CHANGED
    
    | 
         Binary file 
     |