formscraper_helper 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +36 -40
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb75f7060b87ea5800e4aa4ddc1b8b9cc54e5f77b7b4e7037031f3f4a714ba36
|
4
|
+
data.tar.gz: 39a21217d1c7f39f3e6e29cac3671481117a187aa49d1ea76c2802a608667608
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 489e4acefdc092393f0530656e894b2c5e896e530e9ed76a73ff751be1591777c23212b28bc0d01b429e2d79333d0a24373729b1d998c4754d959b79d47ed990
|
7
|
+
data.tar.gz: 3dbdb3bb6abf2579d58621babf7dbd8b1f0320542503bbf3f858122f5d7e3bd8d17f0876ea1e9fbdc32efc3d72f11d35ee429e33ac1489dc2d57fb5c9f879a4a
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/formscraper_helper.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# file: formscraper_helper.rb
|
4
4
|
|
5
|
-
require 'ferrum'
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'clipboard'
|
8
8
|
|
@@ -13,14 +13,42 @@ class FormScraperHelper
|
|
13
13
|
|
14
14
|
# note: fd corresponds to FakeDataGenerator22 which is optional
|
15
15
|
#
|
16
|
-
def initialize(url, headless: false, clipb: true,
|
16
|
+
def initialize(url=nil, browser: nil, headless: false, clipb: true,
|
17
|
+
fd: nil, debug: false)
|
17
18
|
|
18
19
|
@url, @clipb, @fd, @debug = url, clipb, fd, debug
|
19
|
-
@browser = Ferrum::Browser.new headless: headless
|
20
|
-
@browser.goto(url)
|
21
20
|
|
22
|
-
|
23
|
-
|
21
|
+
@browser = browser ? browser : FerrumWizard.new(url, headless: headless)
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
def scrape(body=@browser.body)
|
26
|
+
puts 'body: ' + body.inspect if @debug
|
27
|
+
doc = Nokorexi.new(body).to_doc
|
28
|
+
|
29
|
+
#a = doc.root.xpath('//input|//select')
|
30
|
+
a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
|
31
|
+
a.reject! do |x|
|
32
|
+
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
33
|
+
end
|
34
|
+
|
35
|
+
@h = a.map do |x|
|
36
|
+
|
37
|
+
key = x.attributes[:name]
|
38
|
+
type = x.name
|
39
|
+
|
40
|
+
h = {}
|
41
|
+
h[:type] = x.attributes[:type] || type
|
42
|
+
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
43
|
+
h[:title] = x.attributes[:title]
|
44
|
+
|
45
|
+
if type == 'select' then
|
46
|
+
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
47
|
+
end
|
48
|
+
|
49
|
+
[key, h]
|
50
|
+
|
51
|
+
end.to_h
|
24
52
|
|
25
53
|
end
|
26
54
|
|
@@ -113,7 +141,7 @@ EOF
|
|
113
141
|
|
114
142
|
found = @fd.lookup var1
|
115
143
|
val = found.is_a?(String) ? found : 'xxx'
|
116
|
-
s += var1 + ": #{val}\n"
|
144
|
+
s += var1 + ": '#{val}'\n"
|
117
145
|
else
|
118
146
|
s += var1 + ": xxx\n"
|
119
147
|
end
|
@@ -125,7 +153,7 @@ EOF
|
|
125
153
|
s += s2
|
126
154
|
s += "# options: #{h[:options].join(', ')}\n"
|
127
155
|
val = h[:options][1..-1].sample
|
128
|
-
s += "#{var1}: #{val}\n"
|
156
|
+
s += "#{var1}: '#{val}'\n"
|
129
157
|
|
130
158
|
elsif h[:type] == 'checkbox'
|
131
159
|
|
@@ -143,7 +171,6 @@ EOF
|
|
143
171
|
private
|
144
172
|
|
145
173
|
# returns var1 using arguments rawtitle or key
|
146
|
-
# note: argument s is passed by reference
|
147
174
|
#
|
148
175
|
def format_var1(rawtitle, key)
|
149
176
|
|
@@ -170,36 +197,5 @@ EOF
|
|
170
197
|
|
171
198
|
end
|
172
199
|
|
173
|
-
def scrape()
|
174
|
-
|
175
|
-
doc = Nokorexi.new(@browser.body).to_doc
|
176
|
-
|
177
|
-
#a = doc.root.xpath('//input|//select')
|
178
|
-
a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
|
179
|
-
a.reject! do |x|
|
180
|
-
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
181
|
-
end
|
182
|
-
|
183
|
-
@h = a.map do |x|
|
184
|
-
|
185
|
-
key = x.attributes[:name]
|
186
|
-
type = x.name
|
187
|
-
|
188
|
-
h = {}
|
189
|
-
h[:type] = x.attributes[:type] || type
|
190
|
-
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
191
|
-
h[:title] = x.attributes[:title]
|
192
|
-
|
193
|
-
if type == 'select' then
|
194
|
-
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
195
|
-
end
|
196
|
-
|
197
|
-
[key, h]
|
198
|
-
|
199
|
-
end.to_h
|
200
|
-
|
201
|
-
end
|
202
|
-
|
203
|
-
|
204
200
|
end
|
205
201
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: formscraper_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -38,25 +38,25 @@ cert_chain:
|
|
38
38
|
date: 2022-05-28 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name: ferrum
|
41
|
+
name: ferrumwizard
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 0.3.2
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
59
|
+
version: 0.3.2
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: nokorexi
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
metadata.gz.sig
CHANGED
Binary file
|