formscraper_helper 0.1.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b4f9be47b4c46d2e2823161ac508b4748afab4f2f6a03e2617bb91ae6ababe9
4
- data.tar.gz: bea40e151206763162b372bb2163010f65ce021fbd795b82757780eba32ccde2
3
+ metadata.gz: 7fc2eba27c47cbfae52b020e76bf1771953bf4944ae5da17216bd513eff74067
4
+ data.tar.gz: 5200c30d48828c8d29dd3794e26d3779237ddc9da8da5afda854bc15968c33c1
5
5
  SHA512:
6
- metadata.gz: 8c7809564deb15b0b6a1ce109a676ceed9f77211dcf30fe14894a638ad75609421de74cc6863064d7c4fdb428b99fdffbb9ae777598bd3c32c99e0ca542f4860
7
- data.tar.gz: a61eaf497175a3bb4aac7eed0e3e54038640697a90484c05adc11fa8342f1c668f94255488d1a6a6e932c8a42e0143b821af1bd98f4256605e86da92a9c4f218
6
+ metadata.gz: 40d8f123a3000ff0072ba5d3bf71ccb0b983363c6298281edc94326af842c81688c67f25624662e5c02d4383b602392be9379ada75136a4205decebe1d843619
7
+ data.tar.gz: 516f582b35e1da445631d7f8f1daf1a22a86e62b4c5c4b4861d71f16a5918222701bb935d0e65547b1c4d12baf3a170993b761a0bc071ba85f78086893134668
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,21 +2,70 @@
2
2
 
3
3
  # file: formscraper_helper.rb
4
4
 
5
- require 'ferrum'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'clipboard'
8
+ require 'fdg22'
7
9
 
8
10
 
9
11
  class FormScraperHelper
10
12
 
11
13
  attr_reader :browser
12
14
 
13
- def initialize(url, headless: false, debug: false)
15
+ # note: fd corresponds to FakeDataGenerator22 which is optional
16
+ #
17
+ def initialize(url=nil, browser: nil, headless: false, clipb: true,
18
+ fd: nil, debug: false)
19
+
20
+ @url, @clipb, @fd, @debug = url, clipb, fd, debug
21
+
22
+ @browser = browser ? browser : FerrumWizard.new(url, headless: headless)
23
+
24
+ end
25
+
26
+ def scrape(body=@browser.body)
27
+ puts 'body: ' + body.inspect if @debug
28
+ doc = Nokorexi.new(body).to_doc
29
+
30
+ #a = doc.root.xpath('//input|//select')
31
+ a = doc.root.xpath('//*').select do |x|
32
+ x.name == 'input' or x.name == 'select' or \
33
+ (x.name == 'button' and x.attributes[:type] == 'submit')
34
+ end
35
+ a.reject! do |x|
36
+ x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
37
+ end
38
+
39
+ a2 = a.map do |x|
40
+
41
+ key = x.attributes[:name]
42
+ name = x.name
43
+
44
+ h = {}
45
+ h[:type] = x.attributes[:type] || name
46
+
47
+ if key then
48
+ h[:xpath] = "//%s[@name=\"%s\"]" % [name, key]
49
+ else
50
+ h[:xpath] = "//%s[@type=\"%s\"]" % [name, h[:type]]
51
+ end
52
+
53
+ h[:title] = x.attributes[:title]
54
+
55
+ if name == 'select' then
56
+ h[:options] = x.xpath('option').map {|x| x.text.to_s}
57
+ end
58
+
59
+ [key || h[:type], h]
14
60
 
15
- @url, @debug = url, debug
16
- @browser = Ferrum::Browser.new headless: headless
17
- @browser.goto(url)
18
- sleep 2
19
- scrape()
61
+ end
62
+
63
+ # ensure submit appears at the end
64
+ submit = a2.assoc 'submit'
65
+ a2.delete submit
66
+ a2 << submit
67
+
68
+ @h = a2.to_h
20
69
 
21
70
  end
22
71
 
@@ -31,15 +80,13 @@ require 'yaml'
31
80
  require 'ferrum'
32
81
  require 'nokorexi'
33
82
 
34
- browser = Ferrum::Browser.new headless: false
35
83
  url = '#{@url}'
36
- browser.goto(url)
37
- sleep 2
38
-
84
+ browser = FerrumWizard.new(url, headless: false)
39
85
  doc = Nokorexi.new(browser.body).to_doc
40
86
 
41
87
  # load the YAML document containing the inputs
42
88
  #filepath = ''
89
+ filepath = '/tmp/data.yaml'
43
90
  h = YAML.load(File.read(filepath))
44
91
  EOF
45
92
 
@@ -51,36 +98,45 @@ EOF
51
98
 
52
99
  if h[:type] == 'text' or h[:type] == 'password' then
53
100
 
54
- var1 = if h[:title].length > 1 then
55
- h[:title].downcase.gsub(/ +/,'_')
56
- else
57
- key.downcase
58
- end
101
+ var1, s2 = format_var1(h[:title], key)
102
+ s += s2
59
103
  s += var1 + " = h['#{var1}']\n"
60
- s += "r.focus.type #{var1}\n\n"
104
+ s += "r.focus.type #{var1}\n"
105
+ s += "sleep 0.5\n\n"
61
106
 
62
107
  elsif h[:type] == 'select'
63
108
 
64
- var1 = if h[:title].length > 1 then
65
- h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
66
- else
67
- key.downcase
68
- end
109
+ var1, s2 = format_var1(h[:title], key)
110
+ s += s2
69
111
 
70
112
  s += "# options: #{h[:options].join(', ')}\n"
71
113
  s += "#{var1} = h['#{var1}']\n"
72
- s += 'r = titles.grep /#{' + var1 + '}/i' + "\n"
73
- s += "n = titles.index(r.first) + 1\n"
114
+ s += 'titles = %w(' + h[:options].join(' ') + ')' + "\n"
115
+ s += 'found = titles.grep /#{' + var1 + '}/i' + "\n"
116
+ s += "n = titles.index(found.first) + 1\n"
74
117
  s += "r.focus\n"
75
118
  s += "n.times { r.type(:down); sleep 1}\n"
76
- s += "r.click\n\n"
119
+ s += "r.click\n"
120
+ s += "sleep 0.5\n\n"
77
121
 
78
122
  elsif h[:type] == 'checkbox'
79
- s += "r.focus.click\n\n"
123
+
124
+ s += "r.focus.click\n"
125
+ s += "sleep 0.5\n\n"
126
+
127
+ elsif h[:type] == 'submit'
128
+
129
+ s += "r.focus.click\n"
130
+ s += "sleep 4\n"
131
+ s += "browser.save_cookies('/tmp/cookies.yaml')\n"
132
+
80
133
  end
81
134
 
82
135
  end
83
136
 
137
+ Clipboard.copy s if @clipb
138
+ puts 'generated code copied to clipboard'
139
+
84
140
  return s
85
141
 
86
142
  end
@@ -97,22 +153,30 @@ EOF
97
153
 
98
154
  if h[:type] == 'text' or h[:type] == 'password' then
99
155
 
100
- var1 = if h[:title].length > 1 then
101
- h[:title].downcase.gsub(/ +/,'_')
102
- else
103
- key.downcase
156
+ var1, s2 = format_var1(h[:title], key)
157
+
158
+ s += s2
159
+
160
+ if h[:type] == 'password' then
161
+ @pwd ||= @fd ? @fd.password : 'xxx'
162
+ s += var1 + ": #{@pwd}\n"
163
+ elsif @fd
164
+
165
+ found = @fd.lookup var1
166
+ val = found.is_a?(String) ? found : 'xxx'
167
+ s += var1 + ": '#{val}'\n"
168
+ else
169
+ s += var1 + ": xxx\n"
104
170
  end
105
- s += var1 + ": xxx\n"
106
171
 
107
172
  elsif h[:type] == 'select'
108
173
 
109
- var1 = if h[:title].length > 1 then
110
- h[:title].downcase.gsub(/ +/,'_').gsub(/\W/,'')
111
- else
112
- key.downcase
113
- end
174
+ var1, s2 = format_var1(h[:title], key)
114
175
 
115
- s += "#{var1}: xxx\n"
176
+ s += s2
177
+ s += "# options: #{h[:options].join(', ')}\n"
178
+ val = h[:options][1..-1].sample
179
+ s += "#{var1}: '#{val}'\n"
116
180
 
117
181
  elsif h[:type] == 'checkbox'
118
182
 
@@ -120,43 +184,70 @@ EOF
120
184
 
121
185
  end
122
186
 
187
+ Clipboard.copy s if @clipb
188
+ puts 'generated YAML copied to clipboard'
189
+
123
190
  return s
124
191
 
125
192
  end
126
193
 
127
194
  private
128
195
 
129
- def scrape()
196
+ # returns var1 using arguments rawtitle or key
197
+ #
198
+ def format_var1(rawtitle, key)
199
+
200
+ var1 = if rawtitle.length > 1 then
130
201
 
131
- doc = Nokorexi.new(@browser.body).to_doc
202
+ s = "\n# " + rawtitle + "\n"
203
+ title = rawtitle.scan(/[A-Z][^A-Z]+/).join(' ').gsub(/[^\w ]/,'')
204
+ words = title.downcase.scan(/\w+/)
132
205
 
133
- #a = doc.root.xpath('//input|//select')
134
- a = doc.root.xpath('//*').select do |x|
135
- x.name == 'input' or x.name == 'select'
136
- end
137
- a.reject! do |x|
138
- x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
206
+ if words.count > 2 then
207
+ words.take(5).map {|x| x[0]}.join
208
+ else
209
+ title.downcase.gsub(/ +/,'_')
210
+ end
211
+
212
+ else
213
+ newtitle = key.scan(/[A-Z][^A-Z]+/).join(' ')
214
+ s = "\n# " + newtitle + "\n"
215
+ newtitle.gsub(/[^\w ]/,'').downcase\
216
+ .gsub(/ +/,'_')
139
217
  end
140
218
 
141
- @h = a.map do |x|
219
+ [var1, s]
142
220
 
143
- key = x.attributes[:name]
144
- type = x.name
221
+ end
145
222
 
146
- h = {}
147
- h[:type] = x.attributes[:type] || type
148
- h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
149
- h[:title] = x.attributes[:title]
223
+ end
150
224
 
151
- if type == 'select' then
152
- h[:options] = x.xpath('option').map {|x| x.text.to_s}
153
- end
225
+ class FormDataTool
154
226
 
155
- [key, h]
227
+ def initialize(fd: nil)
228
+
229
+ @fd = fd
156
230
 
157
- end.to_h
158
231
 
159
232
  end
160
233
 
234
+ def regen(yml='/tmp/data.yaml')
235
+
236
+ s = File.read(yml)
237
+ h = YAML.load(s)
238
+
239
+ h2 = h.map do |key, value|
240
+ v = @fd.lookup key
241
+ [key, (v || value)]
242
+ end.to_h
243
+
244
+ h2.each do |key, value|
245
+ puts 'scanning key: ' + key.inspect
246
+ s.sub!(/#{key}: [^\n]+/, "%s: '%s'" % [key, value])
247
+ end
248
+
249
+ return s
250
+
251
+ end
161
252
 
162
253
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: formscraper_helper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,28 +35,28 @@ cert_chain:
35
35
  hBw+4Vg30COBUGSGYs46Cy3vhis61poJJeWm/pLTMOH4lcl/Jz5fR//QP9ovEu3k
36
36
  3v0q89HVKLBtQzj+Dii/vHeI
37
37
  -----END CERTIFICATE-----
38
- date: 2022-05-25 00:00:00.000000000 Z
38
+ date: 2022-05-29 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
- name: ferrum
41
+ name: ferrumwizard
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.11'
46
+ version: '0.3'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: '0.11'
49
+ version: 0.3.3
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.11'
56
+ version: '0.3'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: '0.11'
59
+ version: 0.3.3
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: nokorexi
62
62
  requirement: !ruby/object:Gem::Requirement
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.7.0
80
+ - !ruby/object:Gem::Dependency
81
+ name: fdg22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
@@ -106,5 +126,5 @@ requirements: []
106
126
  rubygems_version: 3.2.22
107
127
  signing_key:
108
128
  specification_version: 4
109
- summary: Attempts to scrape the inputs required to complate a 1 page online form.
129
+ summary: Attempts to scrape the inputs required to complete a 1 page online form.
110
130
  test_files: []
metadata.gz.sig CHANGED
Binary file