formscraper_helper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +162 -0
- data.tar.gz.sig +0 -0
- metadata +110 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5b4f9be47b4c46d2e2823161ac508b4748afab4f2f6a03e2617bb91ae6ababe9
|
4
|
+
data.tar.gz: bea40e151206763162b372bb2163010f65ce021fbd795b82757780eba32ccde2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8c7809564deb15b0b6a1ce109a676ceed9f77211dcf30fe14894a638ad75609421de74cc6863064d7c4fdb428b99fdffbb9ae777598bd3c32c99e0ca542f4860
|
7
|
+
data.tar.gz: a61eaf497175a3bb4aac7eed0e3e54038640697a90484c05adc11fa8342f1c668f94255488d1a6a6e932c8a42e0143b821af1bd98f4256605e86da92a9c4f218
|
checksums.yaml.gz.sig
ADDED
Binary file
|
@@ -0,0 +1,162 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# file: formscraper_helper.rb
|
4
|
+
|
5
|
+
require 'ferrum'
|
6
|
+
require 'nokorexi'
|
7
|
+
|
8
|
+
|
9
|
+
# Opens a web page in a Ferrum-driven browser, collects its visible <input>
# and <select> elements, and can emit either a YAML template for the values
# to enter (#to_yaml) or a Ruby script that fills the form in (#to_code).
#
class FormScraperHelper

  # The Ferrum::Browser instance created in #initialize.
  attr_reader :browser

  # @param url [String] address of the page containing the form
  # @param headless [Boolean] forwarded to Ferrum::Browser.new
  # @param debug [Boolean] when true, #to_code and #to_yaml print each
  #   field key to stdout while they iterate
  def initialize(url, headless: false, debug: false)

    @url = url
    @debug = debug
    @browser = Ferrum::Browser.new headless: headless
    @browser.goto(url)
    sleep 2 # fixed pause before the page body is read (presumably to let it render)
    scrape

  end

  # @return [Hash] scraped fields keyed by the element's name attribute; each
  #   value holds :type, :xpath, :title and, for selects, :options
  def to_h
    @h
  end

  # Builds the source of a Ruby script which opens the same URL and fills in
  # each scraped field from a YAML document of inputs.
  #
  # The generated script still needs `filepath` set by hand (the assignment
  # is emitted commented out).
  #
  # @return [String] Ruby source code
  def to_code

    code = String.new(<<~RUBY)
      require 'yaml'
      require 'ferrum'
      require 'nokorexi'

      browser = Ferrum::Browser.new headless: false
      url = '#{@url}'
      browser.goto(url)
      sleep 2

      doc = Nokorexi.new(browser.body).to_doc

      # load the YAML document containing the inputs
      #filepath = ''
      h = YAML.load(File.read(filepath))
    RUBY

    @h.each do |key, field|

      puts 'key: ' + key.inspect if @debug

      code << "r = browser.at_xpath('#{field[:xpath]}')\n"

      if text_like?(field[:type])

        var = field_var(key, field)
        code << "#{var} = h['#{var}']\n"
        code << "r.focus.type #{var}\n\n"

      elsif field[:type] == 'select'

        var = field_var(key, field)
        options = Array(field[:options])

        code << "# options: #{options.join(', ')}\n"
        # The generated script previously referenced an undefined `titles`
        # and overwrote `r` (the node) with the grep result before calling
        # r.focus. NOTE(review): assumes `titles` was meant to be the option
        # texts listed above -- confirm.
        code << "titles = #{options.inspect}\n"
        code << "#{var} = h['#{var}']\n"
        code << "found = titles.grep(/\#{#{var}}/i)\n"
        code << "n = titles.index(found.first) + 1\n"
        code << "r.focus\n"
        code << "n.times { r.type(:down); sleep 1}\n"
        code << "r.click\n\n"

      elsif field[:type] == 'checkbox'
        code << "r.focus.click\n\n"
      end

    end

    code

  end

  # creates a YAML document for the inputs
  #
  # One "name: xxx" placeholder line is written per text, password and
  # select field; other field types (e.g. checkbox) produce no entry.
  #
  # @return [String] YAML text starting with "---"
  def to_yaml

    yaml = String.new("---\n")

    @h.each do |key, field|

      puts 'key: ' + key.inspect if @debug

      next unless text_like?(field[:type]) || field[:type] == 'select'

      yaml << "#{field_var(key, field)}: xxx\n"

    end

    yaml

  end

  private

  # True for the field types that are filled in by typing.
  def text_like?(type)
    type == 'text' || type == 'password'
  end

  # Derives the name used both as the YAML key and as the local variable in
  # the generated script: the title when it is longer than one character,
  # otherwise the field's key. Spaces become underscores and remaining
  # non-word characters are dropped so the result is usable as a Ruby local.
  # A missing title or key is treated as an empty string instead of raising.
  def field_var(key, field)

    title = field[:title].to_s
    source = title.length > 1 ? title : key.to_s
    source.downcase.gsub(/ +/, '_').gsub(/\W/, '')

  end

  # Reads the current page body and stores the visible input/select elements
  # in @h, keyed by their name attribute.
  def scrape

    doc = Nokorexi.new(@browser.body).to_doc

    #a = doc.root.xpath('//input|//select')
    fields = doc.root.xpath('//*').select do |x|
      x.name == 'input' || x.name == 'select'
    end

    # skip fields the user cannot see; tolerate "display: none" with a space
    fields.reject! do |x|
      x.attributes[:type] == 'hidden' ||
        x.attributes[:style] =~ /display:\s*none/
    end

    @h = fields.map do |x|

      key = x.attributes[:name]
      tag = x.name

      field = {}
      field[:type] = x.attributes[:type] || tag # <select> has no type attribute
      field[:xpath] = "//%s[@name=\"%s\"]" % [tag, key]
      field[:title] = x.attributes[:title]

      if tag == 'select'
        field[:options] = x.xpath('option').map { |opt| opt.text.to_s }
      end

      [key, field]

    end.to_h

  end

end
|
data.tar.gz.sig
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: formscraper_helper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- James Robertson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwNTI1MjA0NzIxWhcN
|
15
|
+
MjMwNTI1MjA0NzIxWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxEJVm
|
17
|
+
2J0b5OWsG7dGVaATl+OOZOM3vvtID8nZdmkI750Z/5dsKrUDXRID1/tN7OIXsD4C
|
18
|
+
kf/gQTu1ZhDs0d2ZAFF++HhGBHsRpWbJFodCUXlxlSrsOouwsb9QU9wwrjSf2ROh
|
19
|
+
CewjBtBu5elGVxg88eCpHm5WJdTt1niiMCnl8ci2QzUkjqyoD7mujUTnJm0vbR2G
|
20
|
+
cAoU6A6/xraZb5HyVj+S3iU3tCcvZ7GsyfG5U60XLlee073tEbrhA+1Veu6jD2q+
|
21
|
+
445lGG247SYCEtWmu+S1ia7xWoTtmoHqoc0GsThK8FmCILTO0VouQwI1Em0erLM0
|
22
|
+
UWeUxzOIL3HiaxElI39RZwAo0AjsQs9btmIWnE6Qa0UlCc4aIZY4FPPeeOg58LXD
|
23
|
+
P3U1zu2dm9BFzJA9fanaKPIFWu9JaLi0h6VfRQ9HxRF0HUxyJz2vc91xhgg4cVv2
|
24
|
+
wIE2htNKt3KJrRoz4FF7abK371dYheDVPtgPrTZDRx3ftlIu5gscIwpCohsCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUYBF7Fed1
|
26
|
+
r+l7945CzwRkfTiERfIwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
|
+
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEARmjroNgUuAeDY25EA0uXOWArJdBq+0Z95PHZweIa
|
29
|
+
Et38VT81ul/VnoEk7ehvRZNUU42XjhJkRXfnfSJdViEZYfuXFmr1GzN+Ib8Og1O/
|
30
|
+
3uC8eiDax+7RpFOeqIjAZ7/6lHrW6/h0oQ+66yLAc7kq4SbPZJETHkAj5JFPJijm
|
31
|
+
YanatYhSZ2P0/k42k38PLHBn8w8YBq7kcnntMyh/DAq7cr1G/5fLfZKLx4+Aim6S
|
32
|
+
wp+1pU+SnrUdzobFQ3Tq1N76CJp27iN3XpNsu5wlSUQDKbDT2hWSPZKO9XsV9/ch
|
33
|
+
rlJMGcoOwGdXlgipiQcc71tXQTxPgxZVw8nYlvHtoBexEupPxcPK54BS4LI88KFr
|
34
|
+
UTqj+Ktsfri/arY458I/Th+KLrzwetk+Z8xNOa1Pw8pTXyynbiVEPfhe80TYH/Zg
|
35
|
+
hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
|
36
|
+
3v0q89HVKLBtQzj+Dii/vHeI
|
37
|
+
-----END CERTIFICATE-----
|
38
|
+
date: 2022-05-25 00:00:00.000000000 Z
|
39
|
+
dependencies:
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: ferrum
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0.11'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0.11'
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0.11'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0.11'
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: nokorexi
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - "~>"
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0.7'
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.7.0
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0.7'
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.7.0
|
80
|
+
description:
|
81
|
+
email: digital.robertson@gmail.com
|
82
|
+
executables: []
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- lib/formscraper_helper.rb
|
87
|
+
homepage: https://github.com/jrobertson/formscraper_helper
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubygems_version: 3.2.22
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: Attempts to scrape the inputs required to complete a 1 page online form.
|
110
|
+
test_files: []
|
metadata.gz.sig
ADDED
Binary file
|