formscraper_helper 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b4f9be47b4c46d2e2823161ac508b4748afab4f2f6a03e2617bb91ae6ababe9
4
- data.tar.gz: bea40e151206763162b372bb2163010f65ce021fbd795b82757780eba32ccde2
3
+ metadata.gz: 7fc2eba27c47cbfae52b020e76bf1771953bf4944ae5da17216bd513eff74067
4
+ data.tar.gz: 5200c30d48828c8d29dd3794e26d3779237ddc9da8da5afda854bc15968c33c1
5
5
  SHA512:
6
- metadata.gz: 8c7809564deb15b0b6a1ce109a676ceed9f77211dcf30fe14894a638ad75609421de74cc6863064d7c4fdb428b99fdffbb9ae777598bd3c32c99e0ca542f4860
7
- data.tar.gz: a61eaf497175a3bb4aac7eed0e3e54038640697a90484c05adc11fa8342f1c668f94255488d1a6a6e932c8a42e0143b821af1bd98f4256605e86da92a9c4f218
6
+ metadata.gz: 40d8f123a3000ff0072ba5d3bf71ccb0b983363c6298281edc94326af842c81688c67f25624662e5c02d4383b602392be9379ada75136a4205decebe1d843619
7
+ data.tar.gz: 516f582b35e1da445631d7f8f1daf1a22a86e62b4c5c4b4861d71f16a5918222701bb935d0e65547b1c4d12baf3a170993b761a0bc071ba85f78086893134668
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,21 +2,70 @@
2
2
 
3
3
  # file: formscraper_helper.rb
4
4
 
5
- require 'ferrum'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'clipboard'
8
+ require 'fdg22'
7
9
 
8
10
 
9
11
  class FormScraperHelper
10
12
 
11
13
  attr_reader :browser
12
14
 
13
- def initialize(url, headless: false, debug: false)
15
+ # note: fd corresponds to FakeDataGenerator22 which is optional
16
+ #
17
+ def initialize(url=nil, browser: nil, headless: false, clipb: true,
18
+ fd: nil, debug: false)
19
+
20
+ @url, @clipb, @fd, @debug = url, clipb, fd, debug
21
+
22
+ @browser = browser ? browser : FerrumWizard.new(url, headless: headless)
23
+
24
+ end
25
+
26
+ def scrape(body=@browser.body)
27
+ puts 'body: ' + body.inspect if @debug
28
+ doc = Nokorexi.new(body).to_doc
29
+
30
+ #a = doc.root.xpath('//input|//select')
31
+ a = doc.root.xpath('//*').select do |x|
32
+ x.name == 'input' or x.name == 'select' or \
33
+ (x.name == 'button' and x.attributes[:type] == 'submit')
34
+ end
35
+ a.reject! do |x|
36
+ x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
37
+ end
38
+
39
+ a2 = a.map do |x|
40
+
41
+ key = x.attributes[:name]
42
+ name = x.name
43
+
44
+ h = {}
45
+ h[:type] = x.attributes[:type] || name
46
+
47
+ if key then
48
+ h[:xpath] = "//%s[@name=\"%s\"]" % [name, key]
49
+ else
50
+ h[:xpath] = "//%s[@type=\"%s\"]" % [name, h[:type]]
51
+ end
52
+
53
+ h[:title] = x.attributes[:title]
54
+
55
+ if name == 'select' then
56
+ h[:options] = x.xpath('option').map {|x| x.text.to_s}
57
+ end
58
+
59
+ [key || h[:type], h]
14
60
 
15
- @url, @debug = url, debug
16
- @browser = Ferrum::Browser.new headless: headless
17
- @browser.goto(url)
18
- sleep 2
19
- scrape()
61
+ end
62
+
63
+ # ensure submit appears at the end
64
+ submit = a2.assoc 'submit'
65
+ a2.delete submit
66
+ a2 << submit
67
+
68
+ @h = a2.to_h
20
69
 
21
70
  end
22
71
 
@@ -31,15 +80,13 @@ require 'yaml'
31
80
  require 'ferrum'
32
81
  require 'nokorexi'
33
82
 
34
- browser = Ferrum::Browser.new headless: false
35
83
  url = '#{@url}'
36
- browser.goto(url)
37
- sleep 2
38
-
84
+ browser = FerrumWizard.new(url, headless: false)
39
85
  doc = Nokorexi.new(browser.body).to_doc
40
86
 
41
87
  # load the YAML document containing the inputs
42
88
  #filepath = ''
89
+ filepath = '/tmp/data.yaml'
43
90
  h = YAML.load(File.read(filepath))
44
91
  EOF
45
92
 
@@ -51,36 +98,45 @@ EOF
51
98
 
52
99
  if h[:type] == 'text' or h[:type] == 'password' then
53
100
 
54
- var1 = if h[:title].length > 1 then
55
- h[:title].downcase.gsub(/ +/,'_')
56
- else
57
- key.downcase
58
- end
101
+ var1, s2 = format_var1(h[:title], key)
102
+ s += s2
59
103
  s += var1 + " = h['#{var1}']\n"
60
- s += "r.focus.type #{var1}\n\n"
104
+ s += "r.focus.type #{var1}\n"
105
+ s += "sleep 0.5\n\n"
61
106
 
62
107
  elsif h[:type] == 'select'
63
108
 
64
- var1 = if h[:title].length > 1 then
65
- h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
66
- else
67
- key.downcase
68
- end
109
+ var1, s2 = format_var1(h[:title], key)
110
+ s += s2
69
111
 
70
112
  s += "# options: #{h[:options].join(', ')}\n"
71
113
  s += "#{var1} = h['#{var1}']\n"
72
- s += 'r = titles.grep /#{' + var1 + '}/i' + "\n"
73
- s += "n = titles.index(r.first) + 1\n"
114
+ s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
115
+ s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
116
+ s += "n = titles.index(found.first) + 1\n"
74
117
  s += "r.focus\n"
75
118
  s += "n.times { r.type(:down); sleep 1}\n"
76
- s += "r.click\n\n"
119
+ s += "r.click\n"
120
+ s += "sleep 0.5\n\n"
77
121
 
78
122
  elsif h[:type] == 'checkbox'
79
- s += "r.focus.click\n\n"
123
+
124
+ s += "r.focus.click\n"
125
+ s += "sleep 0.5\n\n"
126
+
127
+ elsif h[:type] == 'submit'
128
+
129
+ s += "r.focus.click\n"
130
+ s += "sleep 4\n"
131
+ s += "browser.save_cookies('/tmp/cookies.yaml')\n"
132
+
80
133
  end
81
134
 
82
135
  end
83
136
 
137
+ Clipboard.copy s if @clipb
138
+ puts 'generated code copied to clipboard'
139
+
84
140
  return s
85
141
 
86
142
  end
@@ -97,22 +153,30 @@ EOF
97
153
 
98
154
  if h[:type] == 'text' or h[:type] == 'password' then
99
155
 
100
- var1 = if h[:title].length > 1 then
101
- h[:title].downcase.gsub(/ +/,'_')
102
- else
103
- key.downcase
156
+ var1, s2 = format_var1(h[:title], key)
157
+
158
+ s += s2
159
+
160
+ if h[:type] == 'password' then
161
+ @pwd ||= @fd ? @fd.password : 'xxx'
162
+ s += var1 + ": #{@pwd}\n"
163
+ elsif @fd
164
+
165
+ found = @fd.lookup var1
166
+ val = found.is_a?(String) ? found : 'xxx'
167
+ s += var1 + ": '#{val}'\n"
168
+ else
169
+ s += var1 + ": xxx\n"
104
170
  end
105
- s += var1 + ": xxx\n"
106
171
 
107
172
  elsif h[:type] == 'select'
108
173
 
109
- var1 = if h[:title].length > 1 then
110
- h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
111
- else
112
- key.downcase
113
- end
174
+ var1, s2 = format_var1(h[:title], key)
114
175
 
115
- s += "#{var1}: xxx\n"
176
+ s += s2
177
+ s += "# options: #{h[:options].join(', ')}\n"
178
+ val = h[:options][1..-1].sample
179
+ s += "#{var1}: '#{val}'\n"
116
180
 
117
181
  elsif h[:type] == 'checkbox'
118
182
 
@@ -120,43 +184,70 @@ EOF
120
184
 
121
185
  end
122
186
 
187
+ Clipboard.copy s if @clipb
188
+ puts 'generated YAML copied to clipboard'
189
+
123
190
  return s
124
191
 
125
192
  end
126
193
 
127
194
  private
128
195
 
129
- def scrape()
196
+ # returns var1 using arguments rawtitle or key
197
+ #
198
+ def format_var1(rawtitle, key)
199
+
200
+ var1 = if rawtitle.length > 1 then
130
201
 
131
- doc = Nokorexi.new(@browser.body).to_doc
202
+ s = "\n# " + rawtitle + "\n"
203
+ title = rawtitle.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/,'')
204
+ words = title.downcase.scan(/\w+/)
132
205
 
133
- #a = doc.root.xpath('//input|//select')
134
- a = doc.root.xpath('//*').select do |x|
135
- x.name == 'input' or x.name == 'select'
136
- end
137
- a.reject! do |x|
138
- x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
206
+ if words.count > 2 then
207
+ words.take(5).map {|x| x[0]}.join
208
+ else
209
+ title.downcase.gsub(/ +/,'_')
210
+ end
211
+
212
+ else
213
+ newtitle = key.scan(/[A-Z][^A-Z]+/).join(' ')
214
+ s = "\n# " + newtitle + "\n"
215
+ newtitle.gsub(/[^\w ]/,'').downcase\
216
+ .gsub(/ +/,'_')
139
217
  end
140
218
 
141
- @h = a.map do |x|
219
+ [var1, s]
142
220
 
143
- key = x.attributes[:name]
144
- type = x.name
221
+ end
145
222
 
146
- h = {}
147
- h[:type] = x.attributes[:type] || type
148
- h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
149
- h[:title] = x.attributes[:title]
223
+ end
150
224
 
151
- if type == 'select' then
152
- h[:options] = x.xpath('option').map {|x| x.text.to_s}
153
- end
225
+ class FormDataTool
154
226
 
155
- [key, h]
227
+ def initialize(fd: nil)
228
+
229
+ @fd = fd
156
230
 
157
- end.to_h
158
231
 
159
232
  end
160
233
 
234
+ def regen(yml='/tmp/data.yaml')
235
+
236
+ s = File.read(yml)
237
+ h = YAML.load(s)
238
+
239
+ h2 = h.map do |key, value|
240
+ v = @fd.lookup key
241
+ [key, (v || value)]
242
+ end.to_h
243
+
244
+ h2.each do |key, value|
245
+ puts 'scanning key: ' + key.inspect
246
+ s.sub!(/#{key}: [^\n]+/, "%s: '%s'" % [key, value])
247
+ end
248
+
249
+ return s
250
+
251
+ end
161
252
 
162
253
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: formscraper_helper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,28 +35,28 @@ cert_chain:
35
35
  hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
36
36
  3v0q89HVKLBtQzj+Dii/vHeI
37
37
  -----END CERTIFICATE-----
38
- date: 2022-05-25 00:00:00.000000000 Z
38
+ date: 2022-05-29 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
- name: ferrum
41
+ name: ferrumwizard
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.11'
46
+ version: '0.3'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: '0.11'
49
+ version: 0.3.3
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.11'
56
+ version: '0.3'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: '0.11'
59
+ version: 0.3.3
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: nokorexi
62
62
  requirement: !ruby/object:Gem::Requirement
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.7.0
80
+ - !ruby/object:Gem::Dependency
81
+ name: fdg22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
@@ -106,5 +126,5 @@ requirements: []
106
126
  rubygems_version: 3.2.22
107
127
  signing_key:
108
128
  specification_version: 4
109
- summary: Attempts to scrape the inputs required to complate a 1 page online form.
129
+ summary: Attempts to scrape the inputs required to complete a 1 page online form.
110
130
  test_files: []
metadata.gz.sig CHANGED
Binary file