formscraper_helper 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/formscraper_helper.rb +36 -40
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb75f7060b87ea5800e4aa4ddc1b8b9cc54e5f77b7b4e7037031f3f4a714ba36
|
4
|
+
data.tar.gz: 39a21217d1c7f39f3e6e29cac3671481117a187aa49d1ea76c2802a608667608
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 489e4acefdc092393f0530656e894b2c5e896e530e9ed76a73ff751be1591777c23212b28bc0d01b429e2d79333d0a24373729b1d998c4754d959b79d47ed990
|
7
|
+
data.tar.gz: 3dbdb3bb6abf2579d58621babf7dbd8b1f0320542503bbf3f858122f5d7e3bd8d17f0876ea1e9fbdc32efc3d72f11d35ee429e33ac1489dc2d57fb5c9f879a4a
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/formscraper_helper.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# file: formscraper_helper.rb
|
4
4
|
|
5
|
-
require 'ferrum'
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'clipboard'
|
8
8
|
|
@@ -13,14 +13,42 @@ class FormScraperHelper
|
|
13
13
|
|
14
14
|
# note: fd corresponds to FakeDataGenerator22 which is optional
|
15
15
|
#
|
16
|
-
def initialize(url, headless: false, clipb: true, fd: nil, debug: false)
|
16
|
+
def initialize(url=nil, browser: nil, headless: false, clipb: true,
|
17
|
+
fd: nil, debug: false)
|
17
18
|
|
18
19
|
@url, @clipb, @fd, @debug = url, clipb, fd, debug
|
19
|
-
@browser = Ferrum::Browser.new headless: headless
|
20
|
-
@browser.goto(url)
|
21
20
|
|
22
|
-
|
23
|
-
|
21
|
+
@browser = browser ? browser : FerrumWizard.new(url, headless: headless)
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
def scrape(body=@browser.body)
|
26
|
+
puts 'body: ' + body.inspect if @debug
|
27
|
+
doc = Nokorexi.new(body).to_doc
|
28
|
+
|
29
|
+
#a = doc.root.xpath('//input|//select')
|
30
|
+
a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
|
31
|
+
a.reject! do |x|
|
32
|
+
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
33
|
+
end
|
34
|
+
|
35
|
+
@h = a.map do |x|
|
36
|
+
|
37
|
+
key = x.attributes[:name]
|
38
|
+
type = x.name
|
39
|
+
|
40
|
+
h = {}
|
41
|
+
h[:type] = x.attributes[:type] || type
|
42
|
+
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
43
|
+
h[:title] = x.attributes[:title]
|
44
|
+
|
45
|
+
if type == 'select' then
|
46
|
+
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
47
|
+
end
|
48
|
+
|
49
|
+
[key, h]
|
50
|
+
|
51
|
+
end.to_h
|
24
52
|
|
25
53
|
end
|
26
54
|
|
@@ -113,7 +141,7 @@ EOF
|
|
113
141
|
|
114
142
|
found = @fd.lookup var1
|
115
143
|
val = found.is_a?(String) ? found : 'xxx'
|
116
|
-
s += var1 + ": #{val}\n"
|
144
|
+
s += var1 + ": '#{val}'\n"
|
117
145
|
else
|
118
146
|
s += var1 + ": xxx\n"
|
119
147
|
end
|
@@ -125,7 +153,7 @@ EOF
|
|
125
153
|
s += s2
|
126
154
|
s += "# options: #{h[:options].join(', ')}\n"
|
127
155
|
val = h[:options][1..-1].sample
|
128
|
-
s += "#{var1}: #{val}\n"
|
156
|
+
s += "#{var1}: '#{val}'\n"
|
129
157
|
|
130
158
|
elsif h[:type] == 'checkbox'
|
131
159
|
|
@@ -143,7 +171,6 @@ EOF
|
|
143
171
|
private
|
144
172
|
|
145
173
|
# returns var1 using arguments rawtitle or key
|
146
|
-
# note: argument s is passed by reference
|
147
174
|
#
|
148
175
|
def format_var1(rawtitle, key)
|
149
176
|
|
@@ -170,36 +197,5 @@ EOF
|
|
170
197
|
|
171
198
|
end
|
172
199
|
|
173
|
-
def scrape()
|
174
|
-
|
175
|
-
doc = Nokorexi.new(@browser.body).to_doc
|
176
|
-
|
177
|
-
#a = doc.root.xpath('//input|//select')
|
178
|
-
a = doc.root.xpath('//*').select {|x| x.name == 'input' or x.name == 'select'}
|
179
|
-
a.reject! do |x|
|
180
|
-
x.attributes[:type] == 'hidden' or x.attributes[:style] =~ /display:none/
|
181
|
-
end
|
182
|
-
|
183
|
-
@h = a.map do |x|
|
184
|
-
|
185
|
-
key = x.attributes[:name]
|
186
|
-
type = x.name
|
187
|
-
|
188
|
-
h = {}
|
189
|
-
h[:type] = x.attributes[:type] || type
|
190
|
-
h[:xpath] = "//%s[@name=\"%s\"]" % [type, key]
|
191
|
-
h[:title] = x.attributes[:title]
|
192
|
-
|
193
|
-
if type == 'select' then
|
194
|
-
h[:options] = x.xpath('option').map {|x| x.text.to_s}
|
195
|
-
end
|
196
|
-
|
197
|
-
[key, h]
|
198
|
-
|
199
|
-
end.to_h
|
200
|
-
|
201
|
-
end
|
202
|
-
|
203
|
-
|
204
200
|
end
|
205
201
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: formscraper_helper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -38,25 +38,25 @@ cert_chain:
|
|
38
38
|
date: 2022-05-28 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
|
-
name: ferrum
|
41
|
+
name: ferrumwizard
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 0.3.2
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
59
|
+
version: 0.3.2
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: nokorexi
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
metadata.gz.sig
CHANGED
Binary file
|