formscraper_helper 0.1.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +148 -57
- data.tar.gz.sig +0 -0
- metadata +28 -8
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fc2eba27c47cbfae52b020e76bf1771953bf4944ae5da17216bd513eff74067
|
4
|
+
data.tar.gz: 5200c30d48828c8d29dd3794e26d3779237ddc9da8da5afda854bc15968c33c1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 40d8f123a3000ff0072ba5d3bf71ccb0b983363c6298281edc94326af842c81688c67f25624662e5c02d4383b602392be9379ada75136a4205decebe1d843619
|
7
|
+
data.tar.gz: 516f582b35e1da445631d7f8f1daf1a22a86e62b4c5c4b4861d71f16a5918222701bb935d0e65547b1c4d12baf3a170993b761a0bc071ba85f78086893134668
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/formscraper_helper.rb
CHANGED
@@ -2,21 +2,70 @@
|
|
2
2
|
|
3
3
|
# file: formscraper_helper.rb
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'clipboard'
|
8
|
+
require 'fdg22'
|
7
9
|
|
8
10
|
|
9
11
|
class FormScraperHelper
|
10
12
|
|
11
13
|
attr_reader :browser
|
12
14
|
|
13
|
-
|
15
|
+
# note: fd is an optional FakeDataGenerator22 object (defaults to nil)
|
16
|
+
#
|
17
|
+
def initialize(url=nil, browser: nil, headless: false, clipb: true,
|
18
|
+
fd: nil, debug: false)
|
19
|
+
|
20
|
+
@url, @clipb, @fd, @debug = url, clipb, fd, debug
|
21
|
+
|
22
|
+
@browser = browser ? browser : FerrumWizard.new(url, headless: headless)
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
def scrape(body=@browser.body)
|
27
|
+
puts 'body: ' + body.inspect if @debug
|
28
|
+
doc = Nokorexi.new(body).to_doc
|
29
|
+
|
30
|
+
#a = doc.root.xpath('//input|//select')
|
31
|
+
a = doc.root.xpath('//*').select do |x|
|
32
|
+
x.name == 'input' or x.name == 'select' or \
|
33
|
+
(x.name == 'button' and x.attributes[:type] == 'submit')
|
34
|
+
end
|
35
|
+
a.reject! do |x|
|
36
|
+
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
37
|
+
end
|
38
|
+
|
39
|
+
a2 = a.map do |x|
|
40
|
+
|
41
|
+
key = x.attributes[:name]
|
42
|
+
name = x.name
|
43
|
+
|
44
|
+
h = {}
|
45
|
+
h[:type] = x.attributes[:type] || name
|
46
|
+
|
47
|
+
if key then
|
48
|
+
h[:xpath] = "//%s[@name=\"%s\"]" % [name, key]
|
49
|
+
else
|
50
|
+
h[:xpath] = "//%s[@type=\"%s\"]" % [name, h[:type]]
|
51
|
+
end
|
52
|
+
|
53
|
+
h[:title] = x.attributes[:title]
|
54
|
+
|
55
|
+
if name == 'select' then
|
56
|
+
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
57
|
+
end
|
58
|
+
|
59
|
+
[key || h[:type], h]
|
14
60
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
61
|
+
end
|
62
|
+
|
63
|
+
# ensure submit appears at the end
|
64
|
+
submit = a2.assoc 'submit'
|
65
|
+
a2.delete submit
|
66
|
+
a2 << submit
|
67
|
+
|
68
|
+
@h = a2.to_h
|
20
69
|
|
21
70
|
end
|
22
71
|
|
@@ -31,15 +80,13 @@ require 'yaml'
|
|
31
80
|
require 'ferrum'
|
32
81
|
require 'nokorexi'
|
33
82
|
|
34
|
-
browser = Ferrum::Browser.new headless: false
|
35
83
|
url = '#{@url}'
|
36
|
-
browser.
|
37
|
-
sleep 2
|
38
|
-
|
84
|
+
browser = FerrumWizard.new(url, headless: false)
|
39
85
|
doc = Nokorexi.new(browser.body).to_doc
|
40
86
|
|
41
87
|
# load the YAML document containing the inputs
|
42
88
|
#filepath = ''
|
89
|
+
filepath = '/tmp/data.yaml'
|
43
90
|
h = YAML.load(File.read(filepath))
|
44
91
|
EOF
|
45
92
|
|
@@ -51,36 +98,45 @@ EOF
|
|
51
98
|
|
52
99
|
if h[:type] == 'text' or h[:type] == 'password' then
|
53
100
|
|
54
|
-
var1 =
|
55
|
-
|
56
|
-
else
|
57
|
-
key.downcase
|
58
|
-
end
|
101
|
+
var1, s2 = format_var1(h[:title], key)
|
102
|
+
s += s2
|
59
103
|
s += var1 + " = h['#{var1}']\n"
|
60
|
-
s += "r.focus.type #{var1}\n
|
104
|
+
s += "r.focus.type #{var1}\n"
|
105
|
+
s += "sleep 0.5\n\n"
|
61
106
|
|
62
107
|
elsif h[:type] == 'select'
|
63
108
|
|
64
|
-
var1 =
|
65
|
-
|
66
|
-
else
|
67
|
-
key.downcase
|
68
|
-
end
|
109
|
+
var1, s2 = format_var1(h[:title], key)
|
110
|
+
s += s2
|
69
111
|
|
70
112
|
s += "# options: #{h[:options].join(', ')}\n"
|
71
113
|
s += "#{var1} = h['#{var1}']\n"
|
72
|
-
s += '
|
73
|
-
s +=
|
114
|
+
s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
|
115
|
+
s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
|
116
|
+
s += "n = titles.index(found.first) + 1\n"
|
74
117
|
s += "r.focus\n"
|
75
118
|
s += "n.times { r.type(:down); sleep 1}\n"
|
76
|
-
s += "r.click\n
|
119
|
+
s += "r.click\n"
|
120
|
+
s += "sleep 0.5\n\n"
|
77
121
|
|
78
122
|
elsif h[:type] == 'checkbox'
|
79
|
-
|
123
|
+
|
124
|
+
s += "r.focus.click\n"
|
125
|
+
s += "sleep 0.5\n\n"
|
126
|
+
|
127
|
+
elsif h[:type] == 'submit'
|
128
|
+
|
129
|
+
s += "r.focus.click\n"
|
130
|
+
s += "sleep 4\n"
|
131
|
+
s += "browser.save_cookies('/tmp/cookies.yaml')\n"
|
132
|
+
|
80
133
|
end
|
81
134
|
|
82
135
|
end
|
83
136
|
|
137
|
+
Clipboard.copy s if @clipb
|
138
|
+
puts 'generated code copied to clipboard'
|
139
|
+
|
84
140
|
return s
|
85
141
|
|
86
142
|
end
|
@@ -97,22 +153,30 @@ EOF
|
|
97
153
|
|
98
154
|
if h[:type] == 'text' or h[:type] == 'password' then
|
99
155
|
|
100
|
-
var1 =
|
101
|
-
|
102
|
-
|
103
|
-
|
156
|
+
var1, s2 = format_var1(h[:title], key)
|
157
|
+
|
158
|
+
s += s2
|
159
|
+
|
160
|
+
if h[:type] == 'password' then
|
161
|
+
@pwd ||= @fd ? @fd.password : 'xxx'
|
162
|
+
s += var1 + ": #{@pwd}\n"
|
163
|
+
elsif @fd
|
164
|
+
|
165
|
+
found = @fd.lookup var1
|
166
|
+
val = found.is_a?(String) ? found : 'xxx'
|
167
|
+
s += var1 + ": '#{val}'\n"
|
168
|
+
else
|
169
|
+
s += var1 + ": xxx\n"
|
104
170
|
end
|
105
|
-
s += var1 + ": xxx\n"
|
106
171
|
|
107
172
|
elsif h[:type] == 'select'
|
108
173
|
|
109
|
-
var1 =
|
110
|
-
h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
|
111
|
-
else
|
112
|
-
key.downcase
|
113
|
-
end
|
174
|
+
var1, s2 = format_var1(h[:title], key)
|
114
175
|
|
115
|
-
s +=
|
176
|
+
s += s2
|
177
|
+
s += "# options: #{h[:options].join(', ')}\n"
|
178
|
+
val = h[:options][1..-1].sample
|
179
|
+
s += "#{var1}: '#{val}'\n"
|
116
180
|
|
117
181
|
elsif h[:type] == 'checkbox'
|
118
182
|
|
@@ -120,43 +184,70 @@ EOF
|
|
120
184
|
|
121
185
|
end
|
122
186
|
|
187
|
+
Clipboard.copy s if @clipb
|
188
|
+
puts 'generated YAML copied to clipboard'
|
189
|
+
|
123
190
|
return s
|
124
191
|
|
125
192
|
end
|
126
193
|
|
127
194
|
private
|
128
195
|
|
129
|
-
|
196
|
+
# returns [var1, s]: a variable name built from rawtitle (or from key when rawtitle is not longer than 1 character) plus a comment heading string
|
197
|
+
#
|
198
|
+
def format_var1(rawtitle, key)
|
199
|
+
|
200
|
+
var1 = if rawtitle.length > 1 then
|
130
201
|
|
131
|
-
|
202
|
+
s = "\n# " + rawtitle + "\n"
|
203
|
+
title = rawtitle.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/,'')
|
204
|
+
words = title.downcase.scan(/\w+/)
|
132
205
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
206
|
+
if words.count > 2 then
|
207
|
+
words.take(5).map {|x| x[0]}.join
|
208
|
+
else
|
209
|
+
title.downcase.gsub(/ +/,'_')
|
210
|
+
end
|
211
|
+
|
212
|
+
else
|
213
|
+
newtitle = key.scan(/[A-Z][^A-Z]+/).join(' ')
|
214
|
+
s = "\n# " + newtitle + "\n"
|
215
|
+
newtitle.gsub(/[^\w ]/,'').downcase\
|
216
|
+
.gsub(/ +/,'_')
|
139
217
|
end
|
140
218
|
|
141
|
-
|
219
|
+
[var1, s]
|
142
220
|
|
143
|
-
|
144
|
-
type = x.name
|
221
|
+
end
|
145
222
|
|
146
|
-
|
147
|
-
h[:type] = x.attributes[:type] || type
|
148
|
-
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
149
|
-
h[:title] = x.attributes[:title]
|
223
|
+
end
|
150
224
|
|
151
|
-
|
152
|
-
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
153
|
-
end
|
225
|
+
class FormDataTool
|
154
226
|
|
155
|
-
|
227
|
+
def initialize(fd: nil)
|
228
|
+
|
229
|
+
@fd = fd
|
156
230
|
|
157
|
-
end.to_h
|
158
231
|
|
159
232
|
end
|
160
233
|
|
234
|
+
def regen(yml='/tmp/data.yaml')
|
235
|
+
|
236
|
+
s = File.read(yml)
|
237
|
+
h = YAML.load(s)
|
238
|
+
|
239
|
+
h2 = h.map do |key, value|
|
240
|
+
v = @fd.lookup key
|
241
|
+
[key, (v || value)]
|
242
|
+
end.to_h
|
243
|
+
|
244
|
+
h2.each do |key, value|
|
245
|
+
puts 'scanning key: ' + key.inspect
|
246
|
+
s.sub!(/#{key}: [^\n]+/, "%s: '%s'" % [key, value])
|
247
|
+
end
|
248
|
+
|
249
|
+
return s
|
250
|
+
|
251
|
+
end
|
161
252
|
|
162
253
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: formscraper_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,28 +35,28 @@ cert_chain:
|
|
35
35
|
hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
|
36
36
|
3v0q89HVKLBtQzj+Dii/vHeI
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-05-
|
38
|
+
date: 2022-05-29 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name:
|
41
|
+
name: ferrumwizard
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 0.3.3
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
59
|
+
version: 0.3.3
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: nokorexi
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.7.0
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: fdg22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
@@ -106,5 +126,5 @@ requirements: []
|
|
106
126
|
rubygems_version: 3.2.22
|
107
127
|
signing_key:
|
108
128
|
specification_version: 4
|
109
|
-
summary: Attempts to scrape the inputs required to
|
129
|
+
summary: Attempts to scrape the inputs required to complete a 1 page online form.
|
110
130
|
test_files: []
|
metadata.gz.sig
CHANGED
Binary file
|