scrap_kit 0.1.7 → 0.1.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Gemfile.lock +2 -2
- data/README.md +60 -0
- data/lib/scrap_kit/recipe.rb +150 -32
- data/lib/scrap_kit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8aaad7e12415fe1104dd1810e6800a880e5d407e1025e1aebd029620d3e94152
|
4
|
+
data.tar.gz: e0af7af93c10d6a45575eaed22da2f0d9c7472249480d400b4e7dfa557d72a21
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 97cb189bfc69cfaa8649431d34cbd303b2b6d81323437a30eb3f23ac98feba0baa0ed69f20c2e358193d60ccbb9d52110ff489f258af9fc8fe3b086735373697
|
7
|
+
data.tar.gz: 57b1d6a4d860b12c3374cdf3a2366c00571a91eb431bc02b2e7d88fbbbb03a85d9ba78414a03bfbb562bcd62157744226940c43b25a3c7c2ed7189c586441b34
|
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,39 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [0.1.12] 2020-09-03
|
4
|
+
|
5
|
+
### Changed/Added
|
6
|
+
- Return nil if `extract_attribute` fails
|
7
|
+
|
8
|
+
## [0.1.11] 2020-09-03
|
9
|
+
|
10
|
+
### Changed/Added
|
11
|
+
- Add `user_agent` accessor for browser
|
12
|
+
|
13
|
+
## [0.1.10] 2020-09-03
|
14
|
+
|
15
|
+
### Changed/Added
|
16
|
+
- Map attributes to JavaScript calls
|
17
|
+
|
18
|
+
## [0.1.9] 2020-08-31
|
19
|
+
|
20
|
+
### Changed/Added
|
21
|
+
- Set arguments for Chrome driver
|
22
|
+
|
23
|
+
## [0.1.8] 2020-08-29
|
24
|
+
|
25
|
+
### Changed/Added
|
26
|
+
- Add new ways to declare selectors
|
27
|
+
- Add steps
|
28
|
+
|
3
29
|
## [0.1.7] 2020-08-28
|
30
|
+
|
31
|
+
### Changed/Added
|
4
32
|
- Match selector condition by regexp or exact value
|
5
33
|
|
6
34
|
## [0.1.6] 2020-08-28
|
35
|
+
|
36
|
+
### Changed/Added
|
7
37
|
- Update activesupport requirement from = 6.0.2.1 to ~> 6.0
|
8
38
|
- Fix bug when matching selector condition
|
9
39
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
scrap_kit (0.1.
|
4
|
+
scrap_kit (0.1.12)
|
5
5
|
activesupport (~> 6.0)
|
6
6
|
watir (~> 6.16.5)
|
7
7
|
webdrivers (~> 4.0)
|
@@ -21,7 +21,7 @@ GEM
|
|
21
21
|
i18n (1.8.5)
|
22
22
|
concurrent-ruby (~> 1.0)
|
23
23
|
mini_portile2 (2.4.0)
|
24
|
-
minitest (5.14.
|
24
|
+
minitest (5.14.2)
|
25
25
|
nokogiri (1.10.10)
|
26
26
|
mini_portile2 (~> 2.4.0)
|
27
27
|
rake (13.0.1)
|
data/README.md
CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
|
|
81
81
|
#=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
|
82
82
|
```
|
83
83
|
|
84
|
+
### Working with selectors
|
85
|
+
|
86
|
+
Each attribute can be mapped to a selector, which can be any of the following types:
|
87
|
+
|
88
|
+
* A string, which represents a CSS selector.
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
|
92
|
+
```
|
93
|
+
|
94
|
+
* A hash, which can have any of the following options:
|
95
|
+
* `xpath: [String]`
|
96
|
+
* `css: [String]`
|
97
|
+
* `index: [Integer]`
|
98
|
+
* `tag_name: [String]`
|
99
|
+
* `text: [String]`
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
{ text: "View Archive" }
|
103
|
+
```
|
104
|
+
|
105
|
+
* An array, which represents a path of selectors, whose last item must be a hash that maps a selector to the value its element is expected to match.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
[".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
|
109
|
+
```
|
110
|
+
|
111
|
+
Use any of them as it suits you best.
|
112
|
+
|
113
|
+
### Writing steps
|
114
|
+
|
115
|
+
Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
|
116
|
+
|
117
|
+
* **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
{
|
121
|
+
goto: { text: "View Archive" }
|
122
|
+
}
|
123
|
+
```
|
124
|
+
|
125
|
+
* **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
{
|
129
|
+
click: { css: "[type=submit]" }
|
130
|
+
}
|
131
|
+
```
|
132
|
+
|
133
|
+
* **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
{
|
137
|
+
fill_form: {
|
138
|
+
gem_name: "ScrapKit",
|
139
|
+
author: "hpneo",
|
140
|
+
}
|
141
|
+
}
|
142
|
+
```
|
143
|
+
|
84
144
|
## Development
|
85
145
|
|
86
146
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/scrap_kit/recipe.rb
CHANGED
@@ -4,6 +4,22 @@ require "watir"
|
|
4
4
|
|
5
5
|
module ScrapKit
|
6
6
|
class Recipe
|
7
|
+
attr_accessor :user_agent
|
8
|
+
|
9
|
+
class << self
|
10
|
+
def load(source)
|
11
|
+
input = if source.is_a?(Hash)
|
12
|
+
source
|
13
|
+
elsif source.is_a?(IO)
|
14
|
+
JSON.parse(source.read)
|
15
|
+
else
|
16
|
+
JSON.parse(File.read(source))
|
17
|
+
end
|
18
|
+
|
19
|
+
new(input.deep_symbolize_keys)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
7
23
|
def initialize(url: nil, steps: [], attributes: {})
|
8
24
|
@url = url
|
9
25
|
@steps = steps
|
@@ -13,41 +29,61 @@ module ScrapKit
|
|
13
29
|
def run
|
14
30
|
output = {}
|
15
31
|
|
16
|
-
browser =
|
17
|
-
browser.goto @url
|
32
|
+
@browser = create_browser
|
33
|
+
@browser.goto @url
|
18
34
|
|
19
35
|
@steps.each do |step|
|
20
|
-
run_step(
|
36
|
+
run_step(step)
|
21
37
|
end
|
22
38
|
|
23
39
|
@attributes.each do |attribute_name, selector|
|
24
|
-
output[attribute_name] = extract_attribute(browser, selector)
|
40
|
+
output[attribute_name] = extract_attribute(@browser, selector)
|
25
41
|
end
|
26
42
|
|
27
|
-
browser.close
|
28
|
-
browser = nil
|
43
|
+
@browser.close
|
44
|
+
@browser = nil
|
29
45
|
|
30
46
|
output
|
31
47
|
end
|
32
48
|
|
33
|
-
def run_step(
|
49
|
+
def run_step(step)
|
50
|
+
return goto(step[:goto]) if step[:goto]
|
51
|
+
return click(step[:click]) if step[:click]
|
52
|
+
return fill_form(step[:fill_form]) if step[:fill_form]
|
53
|
+
|
54
|
+
nil
|
55
|
+
end
|
56
|
+
|
57
|
+
def find_element_by_name_or_selector(browser_or_element, name_or_selector)
|
58
|
+
element = browser_or_element.element(name: name_or_selector.to_s)
|
59
|
+
return element if element.exists?
|
60
|
+
|
61
|
+
element = browser_or_element.element(css: name_or_selector.to_s)
|
62
|
+
return element if element.exists?
|
63
|
+
|
64
|
+
nil
|
34
65
|
end
|
35
66
|
|
36
67
|
def elements_from_selector(browser_or_element, selector)
|
37
68
|
if selector.is_a?(String)
|
38
69
|
browser_or_element.elements(css: selector)
|
70
|
+
elsif selector.is_a?(Hash)
|
71
|
+
browser_or_element.elements(selector)
|
39
72
|
elsif selector.is_a?(Array)
|
40
73
|
*remainder, condition = selector
|
74
|
+
condition_key, condition_value = condition.first
|
41
75
|
elements = browser_or_element
|
42
76
|
|
43
|
-
remainder.
|
44
|
-
elements = elements.elements(css:
|
77
|
+
if remainder.empty?
|
78
|
+
elements = elements.elements(css: condition_key.to_s)
|
79
|
+
else
|
80
|
+
remainder.each do |item|
|
81
|
+
elements = elements.elements(css: item)
|
82
|
+
end
|
45
83
|
end
|
46
84
|
|
47
85
|
elements.filter do |element|
|
48
|
-
|
49
|
-
condition_value = condition.values.first
|
50
|
-
found_element = element.element(css: condition_key)
|
86
|
+
found_element = element.element(css: condition_key.to_s)
|
51
87
|
extracted_value = extract_value_from_element(found_element)
|
52
88
|
extracted_value.match(condition_value) || extracted_value == condition_value
|
53
89
|
end
|
@@ -64,37 +100,119 @@ module ScrapKit
|
|
64
100
|
element&.text_content
|
65
101
|
end
|
66
102
|
|
67
|
-
def extract_attribute(browser_or_element,
|
68
|
-
if
|
69
|
-
extract_value_from_element(browser_or_element.element(css:
|
70
|
-
elsif
|
71
|
-
|
72
|
-
selector_for_children_attributes = selector_or_hash[:children_attributes]
|
103
|
+
def extract_attribute(browser_or_element, selector_or_object)
|
104
|
+
if selector_or_object.is_a?(String)
|
105
|
+
extract_value_from_element(browser_or_element.element(css: selector_or_object))
|
106
|
+
elsif selector_or_object.is_a?(Array)
|
107
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
73
108
|
|
74
|
-
|
75
|
-
|
109
|
+
if found_elements.size === 1
|
110
|
+
extract_value_from_element(found_elements.first)
|
111
|
+
else
|
112
|
+
found_elements.map do |element|
|
113
|
+
extract_value_from_element(element)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
elsif selector_or_object.is_a?(Hash)
|
117
|
+
if selector_or_object[:selector] && selector_or_object[:children_attributes]
|
118
|
+
selector = selector_or_object[:selector]
|
119
|
+
selector_for_children_attributes = selector_or_object[:children_attributes]
|
120
|
+
|
121
|
+
elements_from_selector(browser_or_element, selector).map do |element|
|
122
|
+
output = {}
|
123
|
+
|
124
|
+
selector_for_children_attributes.each do |child_attribute_name, child_selector|
|
125
|
+
output[child_attribute_name] = extract_attribute(element, child_selector)
|
126
|
+
end
|
76
127
|
|
77
|
-
|
78
|
-
output[child_attribute_name] = extract_attribute(element, child_selector)
|
128
|
+
output
|
79
129
|
end
|
130
|
+
elsif selector_or_object[:javascript]
|
131
|
+
@browser.execute_script(selector_or_object[:javascript])
|
132
|
+
else
|
133
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
134
|
+
|
135
|
+
if found_elements.size === 1
|
136
|
+
extract_value_from_element(found_elements.first)
|
137
|
+
else
|
138
|
+
found_elements.map do |element|
|
139
|
+
extract_value_from_element(element)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
rescue
|
145
|
+
nil
|
146
|
+
end
|
80
147
|
|
81
|
-
|
148
|
+
private
|
149
|
+
|
150
|
+
def goto(link_or_selector)
|
151
|
+
if link_or_selector.is_a?(String)
|
152
|
+
@browser.goto(link_or_selector)
|
153
|
+
elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
|
154
|
+
if found_element = elements_from_selector(@browser, link_or_selector).first
|
155
|
+
found_element.click
|
82
156
|
end
|
83
157
|
end
|
158
|
+
|
159
|
+
sleep 0.5
|
160
|
+
@browser.wait_until do
|
161
|
+
@browser.ready_state == "complete"
|
162
|
+
end
|
163
|
+
rescue
|
164
|
+
nil
|
84
165
|
end
|
85
166
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
elsif source.is_a?(IO)
|
91
|
-
JSON.parse(source.read)
|
92
|
-
else
|
93
|
-
JSON.parse(File.read(source))
|
167
|
+
def click(selector)
|
168
|
+
if selector.is_a?(Array) || selector.is_a?(Hash)
|
169
|
+
if found_element = elements_from_selector(@browser, selector).first
|
170
|
+
found_element.click
|
94
171
|
end
|
172
|
+
end
|
95
173
|
|
96
|
-
|
174
|
+
sleep 1
|
175
|
+
@browser.wait_until do
|
176
|
+
@browser.ready_state == "complete"
|
97
177
|
end
|
178
|
+
|
179
|
+
rescue
|
180
|
+
nil
|
181
|
+
end
|
182
|
+
|
183
|
+
def fill_form(form_data)
|
184
|
+
form_data.each do |name, value|
|
185
|
+
if element = find_element_by_name_or_selector(@browser.body, name.to_s)
|
186
|
+
element = element.to_subtype
|
187
|
+
|
188
|
+
if element.respond_to?(:set)
|
189
|
+
element.set(value)
|
190
|
+
elsif element.respond_to?(:select)
|
191
|
+
element.select(value)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
196
|
+
sleep 0.25
|
197
|
+
@browser.wait_until do
|
198
|
+
@browser.ready_state == "complete"
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def create_browser
|
203
|
+
options = Selenium::WebDriver::Chrome::Options.new
|
204
|
+
|
205
|
+
options.add_argument "--headless"
|
206
|
+
options.add_argument "--window-size=1080x720"
|
207
|
+
options.add_argument "--hide-scrollbars"
|
208
|
+
options.add_argument "--user-agent=#{@user_agent}" if @user_agent
|
209
|
+
|
210
|
+
if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
|
211
|
+
options.add_argument "--no-sandbox"
|
212
|
+
options.binary = chrome_bin
|
213
|
+
end
|
214
|
+
|
215
|
+
Watir::Browser.new(:chrome, options: options)
|
98
216
|
end
|
99
217
|
end
|
100
218
|
end
|
data/lib/scrap_kit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrap_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gustavo Leon
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|