formscraper_helper 0.1.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +148 -57
- data.tar.gz.sig +0 -0
- metadata +28 -8
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fc2eba27c47cbfae52b020e76bf1771953bf4944ae5da17216bd513eff74067
|
4
|
+
data.tar.gz: 5200c30d48828c8d29dd3794e26d3779237ddc9da8da5afda854bc15968c33c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 40d8f123a3000ff0072ba5d3bf71ccb0b983363c6298281edc94326af842c81688c67f25624662e5c02d4383b602392be9379ada75136a4205decebe1d843619
|
7
|
+
data.tar.gz: 516f582b35e1da445631d7f8f1daf1a22a86e62b4c5c4b4861d71f16a5918222701bb935d0e65547b1c4d12baf3a170993b761a0bc071ba85f78086893134668
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/formscraper_helper.rb
CHANGED
@@ -2,21 +2,70 @@
|
|
2
2
|
|
3
3
|
# file: formscraper_helper.rb
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'clipboard'
|
8
|
+
require 'fdg22'
|
7
9
|
|
8
10
|
|
9
11
|
class FormScraperHelper
|
10
12
|
|
11
13
|
attr_reader :browser
|
12
14
|
|
13
|
-
|
15
|
+
# note: fd corresponds to FakeDataGenerator22 which is optional
|
16
|
+
#
|
17
|
+
def initialize(url=nil, browser: nil, headless: false, clipb: true,
|
18
|
+
fd: nil, debug: false)
|
19
|
+
|
20
|
+
@url, @clipb, @fd, @debug = url, clipb, fd, debug
|
21
|
+
|
22
|
+
@browser = browser ? browser : FerrumWizard.new(url, headless: headless)
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
def scrape(body=@browser.body)
|
27
|
+
puts 'body: ' + body.inspect if @debug
|
28
|
+
doc = Nokorexi.new(body).to_doc
|
29
|
+
|
30
|
+
#a = doc.root.xpath('//input|//select')
|
31
|
+
a = doc.root.xpath('//*').select do |x|
|
32
|
+
x.name == 'input' or x.name == 'select' or \
|
33
|
+
(x.name == 'button' and x.attributes[:type] == 'submit')
|
34
|
+
end
|
35
|
+
a.reject! do |x|
|
36
|
+
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
37
|
+
end
|
38
|
+
|
39
|
+
a2 = a.map do |x|
|
40
|
+
|
41
|
+
key = x.attributes[:name]
|
42
|
+
name = x.name
|
43
|
+
|
44
|
+
h = {}
|
45
|
+
h[:type] = x.attributes[:type] || name
|
46
|
+
|
47
|
+
if key then
|
48
|
+
h[:xpath] = "//%s[@name=\"%s\"]" % [name, key]
|
49
|
+
else
|
50
|
+
h[:xpath] = "//%s[@type=\"%s\"]" % [name, h[:type]]
|
51
|
+
end
|
52
|
+
|
53
|
+
h[:title] = x.attributes[:title]
|
54
|
+
|
55
|
+
if name == 'select' then
|
56
|
+
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
57
|
+
end
|
58
|
+
|
59
|
+
[key || h[:type], h]
|
14
60
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
61
|
+
end
|
62
|
+
|
63
|
+
# ensure submit appears at the end
|
64
|
+
submit = a2.assoc 'submit'
|
65
|
+
a2.delete submit
|
66
|
+
a2 << submit
|
67
|
+
|
68
|
+
@h = a2.to_h
|
20
69
|
|
21
70
|
end
|
22
71
|
|
@@ -31,15 +80,13 @@ require 'yaml'
|
|
31
80
|
require 'ferrum'
|
32
81
|
require 'nokorexi'
|
33
82
|
|
34
|
-
browser = Ferrum::Browser.new headless: false
|
35
83
|
url = '#{@url}'
|
36
|
-
browser.
|
37
|
-
sleep 2
|
38
|
-
|
84
|
+
browser = FerrumWizard.new(url, headless: false)
|
39
85
|
doc = Nokorexi.new(browser.body).to_doc
|
40
86
|
|
41
87
|
# load the YAML document containing the inputs
|
42
88
|
#filepath = ''
|
89
|
+
filepath = '/tmp/data.yaml'
|
43
90
|
h = YAML.load(File.read(filepath))
|
44
91
|
EOF
|
45
92
|
|
@@ -51,36 +98,45 @@ EOF
|
|
51
98
|
|
52
99
|
if h[:type] == 'text' or h[:type] == 'password' then
|
53
100
|
|
54
|
-
var1 =
|
55
|
-
|
56
|
-
else
|
57
|
-
key.downcase
|
58
|
-
end
|
101
|
+
var1, s2 = format_var1(h[:title], key)
|
102
|
+
s += s2
|
59
103
|
s += var1 + " = h['#{var1}']\n"
|
60
|
-
s += "r.focus.type #{var1}\n
|
104
|
+
s += "r.focus.type #{var1}\n"
|
105
|
+
s += "sleep 0.5\n\n"
|
61
106
|
|
62
107
|
elsif h[:type] == 'select'
|
63
108
|
|
64
|
-
var1 =
|
65
|
-
|
66
|
-
else
|
67
|
-
key.downcase
|
68
|
-
end
|
109
|
+
var1, s2 = format_var1(h[:title], key)
|
110
|
+
s += s2
|
69
111
|
|
70
112
|
s += "# options: #{h[:options].join(', ')}\n"
|
71
113
|
s += "#{var1} = h['#{var1}']\n"
|
72
|
-
s += '
|
73
|
-
s +=
|
114
|
+
s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
|
115
|
+
s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
|
116
|
+
s += "n = titles.index(found.first) + 1\n"
|
74
117
|
s += "r.focus\n"
|
75
118
|
s += "n.times { r.type(:down); sleep 1}\n"
|
76
|
-
s += "r.click\n
|
119
|
+
s += "r.click\n"
|
120
|
+
s += "sleep 0.5\n\n"
|
77
121
|
|
78
122
|
elsif h[:type] == 'checkbox'
|
79
|
-
|
123
|
+
|
124
|
+
s += "r.focus.click\n"
|
125
|
+
s += "sleep 0.5\n\n"
|
126
|
+
|
127
|
+
elsif h[:type] == 'submit'
|
128
|
+
|
129
|
+
s += "r.focus.click\n"
|
130
|
+
s += "sleep 4\n"
|
131
|
+
s += "browser.save_cookies('/tmp/cookies.yaml')\n"
|
132
|
+
|
80
133
|
end
|
81
134
|
|
82
135
|
end
|
83
136
|
|
137
|
+
Clipboard.copy s if @clipb
|
138
|
+
puts 'generated code copied to clipboard'
|
139
|
+
|
84
140
|
return s
|
85
141
|
|
86
142
|
end
|
@@ -97,22 +153,30 @@ EOF
|
|
97
153
|
|
98
154
|
if h[:type] == 'text' or h[:type] == 'password' then
|
99
155
|
|
100
|
-
var1 =
|
101
|
-
|
102
|
-
|
103
|
-
|
156
|
+
var1, s2 = format_var1(h[:title], key)
|
157
|
+
|
158
|
+
s += s2
|
159
|
+
|
160
|
+
if h[:type] == 'password' then
|
161
|
+
@pwd ||= @fd ? @fd.password : 'xxx'
|
162
|
+
s += var1 + ": #{@pwd}\n"
|
163
|
+
elsif @fd
|
164
|
+
|
165
|
+
found = @fd.lookup var1
|
166
|
+
val = found.is_a?(String) ? found : 'xxx'
|
167
|
+
s += var1 + ": '#{val}'\n"
|
168
|
+
else
|
169
|
+
s += var1 + ": xxx\n"
|
104
170
|
end
|
105
|
-
s += var1 + ": xxx\n"
|
106
171
|
|
107
172
|
elsif h[:type] == 'select'
|
108
173
|
|
109
|
-
var1 =
|
110
|
-
h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
|
111
|
-
else
|
112
|
-
key.downcase
|
113
|
-
end
|
174
|
+
var1, s2 = format_var1(h[:title], key)
|
114
175
|
|
115
|
-
s +=
|
176
|
+
s += s2
|
177
|
+
s += "# options: #{h[:options].join(', ')}\n"
|
178
|
+
val = h[:options][1..-1].sample
|
179
|
+
s += "#{var1}: '#{val}'\n"
|
116
180
|
|
117
181
|
elsif h[:type] == 'checkbox'
|
118
182
|
|
@@ -120,43 +184,70 @@ EOF
|
|
120
184
|
|
121
185
|
end
|
122
186
|
|
187
|
+
Clipboard.copy s if @clipb
|
188
|
+
puts 'generated YAML copied to clipboard'
|
189
|
+
|
123
190
|
return s
|
124
191
|
|
125
192
|
end
|
126
193
|
|
127
194
|
private
|
128
195
|
|
129
|
-
|
196
|
+
# returns var1 using arguments rawtitle or key
|
197
|
+
#
|
198
|
+
def format_var1(rawtitle, key)
|
199
|
+
|
200
|
+
var1 = if rawtitle.length > 1 then
|
130
201
|
|
131
|
-
|
202
|
+
s = "\n# " + rawtitle + "\n"
|
203
|
+
title = rawtitle.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/,'')
|
204
|
+
words = title.downcase.scan(/\w+/)
|
132
205
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
206
|
+
if words.count > 2 then
|
207
|
+
words.take(5).map {|x| x[0]}.join
|
208
|
+
else
|
209
|
+
title.downcase.gsub(/ +/,'_')
|
210
|
+
end
|
211
|
+
|
212
|
+
else
|
213
|
+
newtitle = key.scan(/[A-Z][^A-Z]+/).join(' ')
|
214
|
+
s = "\n# " + newtitle + "\n"
|
215
|
+
newtitle.gsub(/[^\w ]/,'').downcase\
|
216
|
+
.gsub(/ +/,'_')
|
139
217
|
end
|
140
218
|
|
141
|
-
|
219
|
+
[var1, s]
|
142
220
|
|
143
|
-
|
144
|
-
type = x.name
|
221
|
+
end
|
145
222
|
|
146
|
-
|
147
|
-
h[:type] = x.attributes[:type] || type
|
148
|
-
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
149
|
-
h[:title] = x.attributes[:title]
|
223
|
+
end
|
150
224
|
|
151
|
-
|
152
|
-
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
153
|
-
end
|
225
|
+
class FormDataTool
|
154
226
|
|
155
|
-
|
227
|
+
def initialize(fd: nil)
|
228
|
+
|
229
|
+
@fd = fd
|
156
230
|
|
157
|
-
end.to_h
|
158
231
|
|
159
232
|
end
|
160
233
|
|
234
|
+
def regen(yml='/tmp/data.yaml')
|
235
|
+
|
236
|
+
s = File.read(yml)
|
237
|
+
h = YAML.load(s)
|
238
|
+
|
239
|
+
h2 = h.map do |key, value|
|
240
|
+
v = @fd.lookup key
|
241
|
+
[key, (v || value)]
|
242
|
+
end.to_h
|
243
|
+
|
244
|
+
h2.each do |key, value|
|
245
|
+
puts 'scanning key: ' + key.inspect
|
246
|
+
s.sub!(/#{key}: [^\n]+/, "%s: '%s'" % [key, value])
|
247
|
+
end
|
248
|
+
|
249
|
+
return s
|
250
|
+
|
251
|
+
end
|
161
252
|
|
162
253
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: formscraper_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,28 +35,28 @@ cert_chain:
|
|
35
35
|
hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
|
36
36
|
3v0q89HVKLBtQzj+Dii/vHeI
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-05-
|
38
|
+
date: 2022-05-29 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: ferrumwizard
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 0.3.3
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
59
|
+
version: 0.3.3
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: nokorexi
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.7.0
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: fdg22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
@@ -106,5 +126,5 @@ requirements: []
|
|
106
126
|
rubygems_version: 3.2.22
|
107
127
|
signing_key:
|
108
128
|
specification_version: 4
|
109
|
-
summary: Attempts to scrape the inputs required to
|
129
|
+
summary: Attempts to scrape the inputs required to complete a 1 page online form.
|
110
130
|
test_files: []
|
metadata.gz.sig
CHANGED
Binary file
|