formscraper_helper 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +74 -31
- data.tar.gz.sig +0 -0
- metadata +3 -3
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 402c3ab5b633cce3e4852deba197a388a30d267e81595829cbc05deb6509f07e
|
4
|
+
data.tar.gz: c3867d152abfa6d910efd7f24b119c413c8ca8368229290583704bba1018108d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a3c5c2217823d8734d7e273a07c166012191f0196a3c039cbb7bb55a0ee2ab3593295b9e985c049ebcfdddaa768696091fa49352cb8af3c7782ac56a25d86dd2
|
7
|
+
data.tar.gz: 15b49ce36dd5517099e04436153f3e63c1f9138a383ddb22033e4896eb9b0203c32bcca28ec18ab1883bc452a175182e21700cbca4cd812cdb618ad0b571abcd
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/formscraper_helper.rb
CHANGED
@@ -4,17 +4,21 @@
|
|
4
4
|
|
5
5
|
require 'ferrum'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'clipboard'
|
7
8
|
|
8
9
|
|
9
10
|
class FormScraperHelper
|
10
11
|
|
11
12
|
attr_reader :browser
|
12
13
|
|
13
|
-
|
14
|
+
# note: fd (optional) corresponds to FakeDataGenerator22; when supplied it provides the password and looked-up field values written to the generated YAML
|
15
|
+
#
|
16
|
+
def initialize(url, headless: false, clipb: true, fd: nil, debug: false)
|
14
17
|
|
15
|
-
@url, @debug = url, debug
|
18
|
+
@url, @clipb, @fd, @debug = url, clipb, fd, debug
|
16
19
|
@browser = Ferrum::Browser.new headless: headless
|
17
20
|
@browser.goto(url)
|
21
|
+
|
18
22
|
sleep 2
|
19
23
|
scrape()
|
20
24
|
|
@@ -40,6 +44,7 @@ doc = Nokorexi.new(browser.body).to_doc
|
|
40
44
|
|
41
45
|
# load the YAML document containing the inputs
|
42
46
|
#filepath = ''
|
47
|
+
filepath = '/tmp/tmp.yaml'
|
43
48
|
h = YAML.load(File.read(filepath))
|
44
49
|
EOF
|
45
50
|
|
@@ -51,29 +56,26 @@ EOF
|
|
51
56
|
|
52
57
|
if h[:type] == 'text' or h[:type] == 'password' then
|
53
58
|
|
54
|
-
var1 =
|
55
|
-
|
56
|
-
else
|
57
|
-
key.downcase
|
58
|
-
end
|
59
|
+
var1, s2 = format_var1(h[:title], key)
|
60
|
+
s += s2
|
59
61
|
s += var1 + " = h['#{var1}']\n"
|
60
|
-
s += "r.focus.type #{var1}\n
|
62
|
+
s += "r.focus.type #{var1}\n"
|
63
|
+
s += "sleep 0.5\n\n"
|
61
64
|
|
62
65
|
elsif h[:type] == 'select'
|
63
66
|
|
64
|
-
var1 =
|
65
|
-
|
66
|
-
else
|
67
|
-
key.downcase
|
68
|
-
end
|
67
|
+
var1, s2 = format_var1(h[:title], key)
|
68
|
+
s += s2
|
69
69
|
|
70
70
|
s += "# options: #{h[:options].join(', ')}\n"
|
71
71
|
s += "#{var1} = h['#{var1}']\n"
|
72
|
-
s += '
|
73
|
-
s +=
|
72
|
+
s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
|
73
|
+
s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
|
74
|
+
s += "n = titles.index(found.first) + 1\n"
|
74
75
|
s += "r.focus\n"
|
75
76
|
s += "n.times { r.type(:down); sleep 1}\n"
|
76
|
-
s += "r.click\n
|
77
|
+
s += "r.click\n"
|
78
|
+
s += "sleep 0.5\n\n"
|
77
79
|
|
78
80
|
elsif h[:type] == 'checkbox'
|
79
81
|
s += "r.focus.click\n\n"
|
@@ -81,6 +83,9 @@ EOF
|
|
81
83
|
|
82
84
|
end
|
83
85
|
|
86
|
+
Clipboard.copy s if @clipb
|
87
|
+
puts 'generated code copied to clipboard'
|
88
|
+
|
84
89
|
return s
|
85
90
|
|
86
91
|
end
|
@@ -97,22 +102,30 @@ EOF
|
|
97
102
|
|
98
103
|
if h[:type] == 'text' or h[:type] == 'password' then
|
99
104
|
|
100
|
-
var1 =
|
101
|
-
|
102
|
-
|
103
|
-
|
105
|
+
var1, s2 = format_var1(h[:title], key)
|
106
|
+
|
107
|
+
s += s2
|
108
|
+
|
109
|
+
if h[:type] == 'password' then
|
110
|
+
@pwd ||= @fd ? @fd.password : 'xxx'
|
111
|
+
s += var1 + ": #{@pwd}\n"
|
112
|
+
elsif @fd
|
113
|
+
|
114
|
+
found = @fd.lookup var1
|
115
|
+
val = found.is_a?(String) ? found : 'xxx'
|
116
|
+
s += var1 + ": #{val}\n"
|
117
|
+
else
|
118
|
+
s += var1 + ": xxx\n"
|
104
119
|
end
|
105
|
-
s += var1 + ": xxx\n"
|
106
120
|
|
107
121
|
elsif h[:type] == 'select'
|
108
122
|
|
109
|
-
var1 =
|
110
|
-
h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
|
111
|
-
else
|
112
|
-
key.downcase
|
113
|
-
end
|
123
|
+
var1, s2 = format_var1(h[:title], key)
|
114
124
|
|
115
|
-
s +=
|
125
|
+
s += s2
|
126
|
+
s += "# options: #{h[:options].join(', ')}\n"
|
127
|
+
val = h[:options][1..-1].sample
|
128
|
+
s += "#{var1}: #{val}\n"
|
116
129
|
|
117
130
|
elsif h[:type] == 'checkbox'
|
118
131
|
|
@@ -120,20 +133,49 @@ EOF
|
|
120
133
|
|
121
134
|
end
|
122
135
|
|
136
|
+
Clipboard.copy s if @clipb
|
137
|
+
puts 'generated YAML copied to clipboard'
|
138
|
+
|
123
139
|
return s
|
124
140
|
|
125
141
|
end
|
126
142
|
|
127
143
|
private
|
128
144
|
|
129
|
-
|
145
|
+
# Derives a variable name for a form field, plus the comment text that
# introduces it in the generated output.
#
# rawtitle - the field's title text; used whenever it is longer than 1 character.
#            nil is treated as an empty title.
# key      - the field's key; used when the title is missing or too short.
#            It is split into words at each capital letter.
#
# Returns a 2 element Array: [var1, comment]
#   var1    - the title or key in lower case with words joined by
#             underscores; titles of more than 2 words are abbreviated to
#             the initials of (at most) the first 5 words.
#   comment - "\n# <title>\n", ready to be appended to the output string.
#
def format_var1(rawtitle, key)

  label = rawtitle.to_s   # tolerate a nil title
  name = key.to_s

  if label.length > 1

    comment = "\n# " + label + "\n"

    # keep only the words which begin with a capital letter
    title = label.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/, '')
    words = title.downcase.scan(/\w+/)

    var1 = if words.count > 2
      words.take(5).map { |word| word[0] }.join
    else
      title.downcase.gsub(/ +/, '_')
    end

  else

    # split a CamelCase key into separate words
    newtitle = key.to_s.scan(/[A-Z][^A-Z]+/).join(' ')

    # a key without any capitalised word would otherwise leave the comment blank
    comment = "\n# " + (newtitle.empty? ? name : newtitle) + "\n"
    var1 = newtitle.gsub(/[^\w ]/, '').downcase.gsub(/ +/, '_')

  end

  # The capital letter scan finds nothing in text such as 'email address'
  # or 'email', which would leave the variable name empty. In that case
  # fall back to the plain text with non-word characters squashed into
  # underscores, trying the preferred source first.
  if var1.empty?

    sources = label.length > 1 ? [label, name] : [name, label]

    var1 = sources.map { |text|
      text.downcase.gsub(/\W+/, '_').gsub(/\A_+|_+\z/, '')
    }.find { |text| !text.empty? } || ''

  end

  [var1, comment]

end
|
172
|
+
|
173
|
+
def scrape()
|
130
174
|
|
131
175
|
doc = Nokorexi.new(@browser.body).to_doc
|
132
176
|
|
133
177
|
#a = doc.root.xpath('//input|//select')
|
134
|
-
a = doc.root.xpath('//*').select
|
135
|
-
x.name == 'input' or x.name == 'select'
|
136
|
-
end
|
178
|
+
a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
|
137
179
|
a.reject! do |x|
|
138
180
|
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
139
181
|
end
|
@@ -160,3 +202,4 @@ EOF
|
|
160
202
|
|
161
203
|
|
162
204
|
end
|
205
|
+
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: formscraper_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
|
36
36
|
3v0q89HVKLBtQzj+Dii/vHeI
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-05-
|
38
|
+
date: 2022-05-28 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: ferrum
|
@@ -106,5 +106,5 @@ requirements: []
|
|
106
106
|
rubygems_version: 3.2.22
|
107
107
|
signing_key:
|
108
108
|
specification_version: 4
|
109
|
-
summary: Attempts to scrape the inputs required to
|
109
|
+
summary: Attempts to scrape the inputs required to complete a 1 page online form.
|
110
110
|
test_files: []
|
metadata.gz.sig
CHANGED
Binary file
|