formscraper_helper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 5b4f9be47b4c46d2e2823161ac508b4748afab4f2f6a03e2617bb91ae6ababe9
4
+ data.tar.gz: bea40e151206763162b372bb2163010f65ce021fbd795b82757780eba32ccde2
5
+ SHA512:
6
+ metadata.gz: 8c7809564deb15b0b6a1ce109a676ceed9f77211dcf30fe14894a638ad75609421de74cc6863064d7c4fdb428b99fdffbb9ae777598bd3c32c99e0ca542f4860
7
+ data.tar.gz: a61eaf497175a3bb4aac7eed0e3e54038640697a90484c05adc11fa8342f1c668f94255488d1a6a6e932c8a42e0143b821af1bd98f4256605e86da92a9c4f218
checksums.yaml.gz.sig ADDED
Binary file
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # file: formscraper_helper.rb
4
+
5
+ require 'ferrum'
6
+ require 'nokorexi'
7
+
8
+
9
# Opens a web page in a Ferrum-driven browser, collects the visible
# <input> and <select> elements and can emit:
#
# * a Hash describing each field (#to_h),
# * a YAML template listing the values a user has to supply (#to_yaml),
# * Ruby source text that fills the form in from that YAML (#to_code).
#
class FormScraperHelper

  # The Ferrum::Browser instance created by #initialize. It is never closed
  # by this class, so the caller can keep driving (or quit) it.
  attr_reader :browser

  # url      - address of the page containing the form.
  # headless - passed straight to Ferrum::Browser.new.
  # debug    - when true, #to_code and #to_yaml print each field key.
  #
  # Navigates to the page, waits 2 seconds, then scrapes the fields.
  #
  def initialize(url, headless: false, debug: false)

    @url, @debug = url, debug
    @browser = Ferrum::Browser.new headless: headless
    @browser.goto(url)
    sleep 2
    scrape

  end

  # Returns the scraped fields as a Hash of
  # name attribute => {type:, xpath:, title:, options: (select only)}.
  #
  def to_h
    @h
  end

  # Returns Ruby source text (a String) which opens the page and fills in
  # every scraped field from a YAML document.
  #
  # The generated script has its `filepath` assignment commented out, so it
  # must be edited before it can run. Values are read from the YAML using
  # the same keys that #to_yaml emits.
  #
  def to_code

    code = code_preamble.dup

    @h.each do |key, h|

      puts 'key: ' + key.inspect if @debug

      code << "r = browser.at_xpath('#{h[:xpath]}')\n"

      case h[:type].to_s
      when 'text', 'password' then code << text_field_code(key, h)
      when 'select'           then code << select_field_code(key, h)
      when 'checkbox'         then code << "r.focus.click\n\n"
      end

    end

    code

  end

  # creates a YAML document for the inputs
  #
  # Every text, password and select field gets a `key: xxx` placeholder
  # line; checkboxes and any other field types are omitted.
  #
  def to_yaml

    yaml = String.new("---\n")

    @h.each do |key, h|

      puts 'key: ' + key.inspect if @debug

      next unless %w[text password select].include?(h[:type].to_s)

      yaml << "#{field_key(key, h)}: xxx\n"

    end

    yaml

  end

  private

  # Fixed opening section of the script generated by #to_code.
  def code_preamble
    <<~EOF
      require 'yaml'
      require 'ferrum'
      require 'nokorexi'

      browser = Ferrum::Browser.new headless: false
      url = '#{@url}'
      browser.goto(url)
      sleep 2

      doc = Nokorexi.new(browser.body).to_doc

      # load the YAML document containing the inputs
      #filepath = ''
      h = YAML.load(File.read(filepath))
    EOF
  end

  # Generated statements that type the YAML value into a text/password field.
  def text_field_code(key, h)
    yaml_key = field_key(key, h)
    var = local_name(yaml_key)
    "#{var} = h['#{yaml_key}']\nr.focus.type #{var}\n\n"
  end

  # Generated statements that pick an option of a select field by pressing
  # the down key. The option texts are written into the script as `titles`,
  # and the match is kept in `found` so that `r` still refers to the node.
  def select_field_code(key, h)

    yaml_key = field_key(key, h)
    var = local_name(yaml_key)
    options = Array(h[:options])

    code = String.new
    code << "# options: #{options.join(', ')}\n"
    code << "titles = #{options.inspect}\n"
    code << "#{var} = h['#{yaml_key}']\n"
    code << 'found = titles.grep(/#{' << var << "}/i)\n"
    # NOTE(review): the "+ 1" is carried over from the original generated
    # code; confirm it matches how the target browser moves the selection.
    code << "n = titles.index(found.first) + 1\n"
    code << "r.focus\n"
    code << "n.times { r.type(:down); sleep 1}\n"
    code << "r.click\n\n"

  end

  # Key used for a field in the YAML document: the title attribute
  # (lower-cased, runs of spaces turned into "_") when it is longer than one
  # character, otherwise the lower-cased name attribute. Select titles also
  # have non-word characters removed. A missing title or name is treated as
  # an empty string instead of raising NoMethodError.
  def field_key(key, h)

    title = h[:title].to_s
    return key.to_s.downcase unless title.length > 1

    name = title.downcase.gsub(/ +/, '_')
    h[:type].to_s == 'select' ? name.gsub(/\W/, '') : name

  end

  # Turns a YAML key into something usable as a Ruby local variable name in
  # the generated script.
  def local_name(yaml_key)

    name = yaml_key.gsub(/\W/, '')
    name = 'field' if name.empty?
    # a local variable cannot start with a digit or an upper-case letter
    name = "_#{name}" unless name.match?(/\A[a-z_]/)
    name

  end

  # Reads the current page body and fills @h with one entry per <input> or
  # <select> element, skipping type="hidden" elements and those whose style
  # attribute contains display:none (with or without space after the colon).
  def scrape

    doc = Nokorexi.new(@browser.body).to_doc

    #a = doc.root.xpath('//input|//select')
    fields = doc.root.xpath('//*').select do |x|
      x.name == 'input' || x.name == 'select'
    end

    fields.reject! do |x|
      x.attributes[:type] == 'hidden' ||
        x.attributes[:style] =~ /display:\s*none/
    end

    @h = fields.map do |x|

      key = x.attributes[:name]
      tag = x.name

      h = {}
      h[:type] = x.attributes[:type] || tag
      h[:xpath] = format('//%s[@name="%s"]', tag, key)
      h[:title] = x.attributes[:title]

      if tag == 'select'
        h[:options] = x.xpath('option').map { |option| option.text.to_s }
      end

      [key, h]

    end.to_h

  end

end
data.tar.gz.sig ADDED
Binary file
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: formscraper_helper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - James Robertson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwNTI1MjA0NzIxWhcN
15
+ MjMwNTI1MjA0NzIxWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxEJVm
17
+ 2J0b5OWsG7dGVaATl+OOZOM3vvtID8nZdmkI750Z/5dsKrUDXRID1/tN7OIXsD4C
18
+ kf/gQTu1ZhDs0d2ZAFF++HhGBHsRpWbJFodCUXlxlSrsOouwsb9QU9wwrjSf2ROh
19
+ CewjBtBu5elGVxg88eCpHm5WJdTt1niiMCnl8ci2QzUkjqyoD7mujUTnJm0vbR2G
20
+ cAoU6A6/xraZb5HyVj+S3iU3tCcvZ7GsyfG5U60XLlee073tEbrhA+1Veu6jD2q+
21
+ 445lGG247SYCEtWmu+S1ia7xWoTtmoHqoc0GsThK8FmCILTO0VouQwI1Em0erLM0
22
+ UWeUxzOIL3HiaxElI39RZwAo0AjsQs9btmIWnE6Qa0UlCc4aIZY4FPPeeOg58LXD
23
+ P3U1zu2dm9BFzJA9fanaKPIFWu9JaLi0h6VfRQ9HxRF0HUxyJz2vc91xhgg4cVv2
24
+ wIE2htNKt3KJrRoz4FF7abK371dYheDVPtgPrTZDRx3ftlIu5gscIwpCohsCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUYBF7Fed1
26
+ r+l7945CzwRkfTiERfIwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
+ c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
+ BgkqhkiG9w0BAQsFAAOCAYEARmjroNgUuAeDY25EA0uXOWArJdBq+0Z95PHZweIa
29
+ Et38VT81ul/VnoEk7ehvRZNUU42XjhJkRXfnfSJdViEZYfuXFmr1GzN+Ib8Og1O/
30
+ 3uC8eiDax+7RpFOeqIjAZ7/6lHrW6/h0oQ+66yLAc7kq4SbPZJETHkAj5JFPJijm
31
+ YanatYhSZ2P0/k42k38PLHBn8w8YBq7kcnntMyh/DAq7cr1G/5fLfZKLx4+Aim6S
32
+ wp+1pU+SnrUdzobFQ3Tq1N76CJp27iN3XpNsu5wlSUQDKbDT2hWSPZKO9XsV9/ch
33
+ rlJMGcoOwGdXlgipiQcc71tXQTxPgxZVw8nYlvHtoBexEupPxcPK54BS4LI88KFr
34
+ UTqj+Ktsfri/arY458I/Th+KLrzwetk+Z8xNOa1Pw8pTXyynbiVEPfhe80TYH/Zg
35
+ hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
36
+ 3v0q89HVKLBtQzj+Dii/vHeI
37
+ -----END CERTIFICATE-----
38
+ date: 2022-05-25 00:00:00.000000000 Z
39
+ dependencies:
40
+ - !ruby/object:Gem::Dependency
41
+ name: ferrum
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '0.11'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0.11'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '0.11'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '0.11'
60
+ - !ruby/object:Gem::Dependency
61
+ name: nokorexi
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '0.7'
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 0.7.0
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '0.7'
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: 0.7.0
80
+ description:
81
+ email: digital.robertson@gmail.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - lib/formscraper_helper.rb
87
+ homepage: https://github.com/jrobertson/formscraper_helper
88
+ licenses:
89
+ - MIT
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubygems_version: 3.2.22
107
+ signing_key:
108
+ specification_version: 4
109
+ summary: Attempts to scrape the inputs required to complete a 1-page online form.
110
+ test_files: []
metadata.gz.sig ADDED
Binary file