scrap_kit 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Gemfile.lock +1 -1
- data/README.md +60 -0
- data/lib/scrap_kit/recipe.rb +128 -32
- data/lib/scrap_kit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
|
4
|
+
data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
|
7
|
+
data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
|
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,19 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [0.1.8] 2020-08-29
|
4
|
+
|
5
|
+
### Changed/Added
|
6
|
+
- Add new ways to declare selectors
|
7
|
+
- Add steps
|
8
|
+
|
3
9
|
## [0.1.7] 2020-08-28
|
10
|
+
|
11
|
+
### Changed/Added
|
4
12
|
- Match selector condition by regexp or exact value
|
5
13
|
|
6
14
|
## [0.1.6] 2020-08-28
|
15
|
+
|
16
|
+
### Changed/Added
|
7
17
|
- Update activesupport requirement from = 6.0.2.1 to ~> 6.0
|
8
18
|
- Fix bug when matching selector condition
|
9
19
|
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
|
|
81
81
|
#=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
|
82
82
|
```
|
83
83
|
|
84
|
+
### Working with selectors
|
85
|
+
|
86
|
+
Each attribute can be mapped to a selector, which can be any of the following types:
|
87
|
+
|
88
|
+
* A string, which represents a CSS selector.
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
|
92
|
+
```
|
93
|
+
|
94
|
+
* A hash, which can have any of the following options:
|
95
|
+
* `xpath: [String]`
|
96
|
+
* `css: [String]`
|
97
|
+
* `index: [Integer]`
|
98
|
+
* `tag_name: [String]`
|
99
|
+
* `text: [String]`
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
{ text: "View Archive" }
|
103
|
+
```
|
104
|
+
|
105
|
+
* An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
[".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
|
109
|
+
```
|
110
|
+
|
111
|
+
Use any of them as it suits you best.
|
112
|
+
|
113
|
+
### Writing steps
|
114
|
+
|
115
|
+
Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
|
116
|
+
|
117
|
+
* **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
{
|
121
|
+
goto: { text: "View Archive" }
|
122
|
+
}
|
123
|
+
```
|
124
|
+
|
125
|
+
* **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
{
|
129
|
+
click: { css: "[type=submit]" }
|
130
|
+
}
|
131
|
+
```
|
132
|
+
|
133
|
+
* **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
{
|
137
|
+
fill_form: {
|
138
|
+
gem_name: "ScrapKit",
|
139
|
+
author: "hpneo",
|
140
|
+
}
|
141
|
+
}
|
142
|
+
```
|
143
|
+
|
84
144
|
## Development
|
85
145
|
|
86
146
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/scrap_kit/recipe.rb
CHANGED
@@ -4,6 +4,20 @@ require "watir"
|
|
4
4
|
|
5
5
|
module ScrapKit
|
6
6
|
class Recipe
|
7
|
+
class << self
|
8
|
+
def load(source)
|
9
|
+
input = if source.is_a?(Hash)
|
10
|
+
source
|
11
|
+
elsif source.is_a?(IO)
|
12
|
+
JSON.parse(source.read)
|
13
|
+
else
|
14
|
+
JSON.parse(File.read(source))
|
15
|
+
end
|
16
|
+
|
17
|
+
new(input.deep_symbolize_keys)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
7
21
|
def initialize(url: nil, steps: [], attributes: {})
|
8
22
|
@url = url
|
9
23
|
@steps = steps
|
@@ -13,41 +27,61 @@ module ScrapKit
|
|
13
27
|
def run
|
14
28
|
output = {}
|
15
29
|
|
16
|
-
browser = Watir::Browser.new(:chrome, headless: true)
|
17
|
-
browser.goto @url
|
30
|
+
@browser = Watir::Browser.new(:chrome, headless: true)
|
31
|
+
@browser.goto @url
|
18
32
|
|
19
33
|
@steps.each do |step|
|
20
|
-
run_step(
|
34
|
+
run_step(step)
|
21
35
|
end
|
22
36
|
|
23
37
|
@attributes.each do |attribute_name, selector|
|
24
|
-
output[attribute_name] = extract_attribute(browser, selector)
|
38
|
+
output[attribute_name] = extract_attribute(@browser, selector)
|
25
39
|
end
|
26
40
|
|
27
|
-
browser.close
|
28
|
-
browser = nil
|
41
|
+
@browser.close
|
42
|
+
@browser = nil
|
29
43
|
|
30
44
|
output
|
31
45
|
end
|
32
46
|
|
33
|
-
def run_step(
|
47
|
+
def run_step(step)
|
48
|
+
return goto(step[:goto]) if step[:goto]
|
49
|
+
return click(step[:click]) if step[:click]
|
50
|
+
return fill_form(step[:fill_form]) if step[:fill_form]
|
51
|
+
|
52
|
+
nil
|
53
|
+
end
|
54
|
+
|
55
|
+
def find_element_by_name_or_selector(browser_or_element, name_or_selector)
|
56
|
+
element = browser_or_element.element(name: name_or_selector.to_s)
|
57
|
+
return element if element.exists?
|
58
|
+
|
59
|
+
element = browser_or_element.element(css: name_or_selector.to_s)
|
60
|
+
return element if element.exists?
|
61
|
+
|
62
|
+
nil
|
34
63
|
end
|
35
64
|
|
36
65
|
def elements_from_selector(browser_or_element, selector)
|
37
66
|
if selector.is_a?(String)
|
38
67
|
browser_or_element.elements(css: selector)
|
68
|
+
elsif selector.is_a?(Hash)
|
69
|
+
browser_or_element.elements(selector)
|
39
70
|
elsif selector.is_a?(Array)
|
40
71
|
*remainder, condition = selector
|
72
|
+
condition_key, condition_value = condition.first
|
41
73
|
elements = browser_or_element
|
42
74
|
|
43
|
-
remainder.
|
44
|
-
elements = elements.elements(css:
|
75
|
+
if remainder.empty?
|
76
|
+
elements = elements.elements(css: condition_key.to_s)
|
77
|
+
else
|
78
|
+
remainder.each do |item|
|
79
|
+
elements = elements.elements(css: item)
|
80
|
+
end
|
45
81
|
end
|
46
82
|
|
47
83
|
elements.filter do |element|
|
48
|
-
|
49
|
-
condition_value = condition.values.first
|
50
|
-
found_element = element.element(css: condition_key)
|
84
|
+
found_element = element.element(css: condition_key.to_s)
|
51
85
|
extracted_value = extract_value_from_element(found_element)
|
52
86
|
extracted_value.match(condition_value) || extracted_value == condition_value
|
53
87
|
end
|
@@ -64,36 +98,98 @@ module ScrapKit
|
|
64
98
|
element&.text_content
|
65
99
|
end
|
66
100
|
|
67
|
-
def extract_attribute(browser_or_element,
|
68
|
-
if
|
69
|
-
extract_value_from_element(browser_or_element.element(css:
|
70
|
-
elsif
|
71
|
-
|
72
|
-
selector_for_children_attributes = selector_or_hash[:children_attributes]
|
101
|
+
def extract_attribute(browser_or_element, selector_or_object)
|
102
|
+
if selector_or_object.is_a?(String)
|
103
|
+
extract_value_from_element(browser_or_element.element(css: selector_or_object))
|
104
|
+
elsif selector_or_object.is_a?(Array)
|
105
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
73
106
|
|
74
|
-
|
75
|
-
|
107
|
+
if found_elements.size === 1
|
108
|
+
extract_value_from_element(found_elements.first)
|
109
|
+
else
|
110
|
+
found_elements.map do |element|
|
111
|
+
extract_value_from_element(element)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
elsif selector_or_object.is_a?(Hash)
|
115
|
+
if selector_or_object[:selector] && selector_or_object[:children_attributes]
|
116
|
+
selector = selector_or_object[:selector]
|
117
|
+
selector_for_children_attributes = selector_or_object[:children_attributes]
|
76
118
|
|
77
|
-
|
78
|
-
output
|
119
|
+
elements_from_selector(browser_or_element, selector).map do |element|
|
120
|
+
output = {}
|
121
|
+
|
122
|
+
selector_for_children_attributes.each do |child_attribute_name, child_selector|
|
123
|
+
output[child_attribute_name] = extract_attribute(element, child_selector)
|
124
|
+
end
|
125
|
+
|
126
|
+
output
|
79
127
|
end
|
128
|
+
else
|
129
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
130
|
+
|
131
|
+
if found_elements.size === 1
|
132
|
+
extract_value_from_element(found_elements.first)
|
133
|
+
else
|
134
|
+
found_elements.map do |element|
|
135
|
+
extract_value_from_element(element)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
80
141
|
|
81
|
-
|
142
|
+
private
|
143
|
+
|
144
|
+
def goto(link_or_selector)
|
145
|
+
if link_or_selector.is_a?(String)
|
146
|
+
@browser.goto(link_or_selector)
|
147
|
+
elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
|
148
|
+
if found_element = elements_from_selector(@browser, link_or_selector).first
|
149
|
+
found_element.click
|
82
150
|
end
|
83
151
|
end
|
152
|
+
|
153
|
+
sleep 0.5
|
154
|
+
@browser.wait_until do
|
155
|
+
@browser.ready_state == "complete"
|
156
|
+
end
|
157
|
+
rescue
|
158
|
+
nil
|
84
159
|
end
|
85
160
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
elsif source.is_a?(IO)
|
91
|
-
JSON.parse(source.read)
|
92
|
-
else
|
93
|
-
JSON.parse(File.read(source))
|
161
|
+
def click(selector)
|
162
|
+
if selector.is_a?(Array) || selector.is_a?(Hash)
|
163
|
+
if found_element = elements_from_selector(@browser, selector).first
|
164
|
+
found_element.click
|
94
165
|
end
|
166
|
+
end
|
95
167
|
|
96
|
-
|
168
|
+
sleep 1
|
169
|
+
@browser.wait_until do
|
170
|
+
@browser.ready_state == "complete"
|
171
|
+
end
|
172
|
+
|
173
|
+
rescue
|
174
|
+
nil
|
175
|
+
end
|
176
|
+
|
177
|
+
def fill_form(form_data)
|
178
|
+
form_data.each do |name, value|
|
179
|
+
if element = find_element_by_name_or_selector(@browser.body, name.to_s)
|
180
|
+
element = element.to_subtype
|
181
|
+
|
182
|
+
if element.respond_to?(:set)
|
183
|
+
element.set(value)
|
184
|
+
elsif element.respond_to?(:select)
|
185
|
+
element.select(value)
|
186
|
+
end
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
sleep 0.25
|
191
|
+
@browser.wait_until do
|
192
|
+
@browser.ready_state == "complete"
|
97
193
|
end
|
98
194
|
end
|
99
195
|
end
|
data/lib/scrap_kit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrap_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gustavo Leon
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|