scrap_kit 0.1.3 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/run_tests.yml +20 -0
- data/CHANGELOG.md +33 -0
- data/Gemfile.lock +14 -6
- data/README.md +60 -0
- data/lib/scrap_kit/recipe.rb +141 -33
- data/lib/scrap_kit/version.rb +1 -1
- data/scrap_kit.gemspec +2 -1
- metadata +21 -7
- data/.github/workflows/publish_gem.yml +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
|
4
|
+
data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
|
7
|
+
data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
|
@@ -0,0 +1,20 @@
|
|
1
|
+
name: Run tests
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
|
10
|
+
steps:
|
11
|
+
- uses: actions/checkout@v1
|
12
|
+
- name: Setup Ruby
|
13
|
+
uses: actions/setup-ruby@v1
|
14
|
+
with:
|
15
|
+
ruby-version: '2.7.1'
|
16
|
+
- name: Build and run tests
|
17
|
+
run: |
|
18
|
+
gem install bundler
|
19
|
+
bundle update --conservative --jobs 4 --retry 3
|
20
|
+
bundle exec rake spec
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,37 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [0.1.8] 2020-08-29
|
4
|
+
|
5
|
+
### Changed/Added
|
6
|
+
- Add new ways to declare selectors
|
7
|
+
- Add steps
|
8
|
+
|
9
|
+
## [0.1.7] 2020-08-28
|
10
|
+
|
11
|
+
### Changed/Added
|
12
|
+
- Match selector condition by regexp or exact value
|
13
|
+
|
14
|
+
## [0.1.6] 2020-08-28
|
15
|
+
|
16
|
+
### Changed/Added
|
17
|
+
- Update activesupport requirement from = 6.0.2.1 to ~> 6.0
|
18
|
+
- Fix bug when matching selector condition
|
19
|
+
|
20
|
+
## [0.1.5] 2020-08-08
|
21
|
+
|
22
|
+
### Changed/Added
|
23
|
+
- Add `webdrivers` as dependency
|
24
|
+
|
25
|
+
## [0.1.4] 2020-07-12
|
26
|
+
|
27
|
+
### Changed/Added
|
28
|
+
- Add support for `<input />` elements
|
29
|
+
|
30
|
+
## [0.1.3] 2020-06-18
|
31
|
+
|
32
|
+
### Changed/Added
|
33
|
+
- Moved development dependencies as dependencies
|
34
|
+
|
3
35
|
## [0.1.2] 2020-06-18
|
4
36
|
|
5
37
|
### Changed/Added
|
@@ -9,6 +41,7 @@
|
|
9
41
|
|
10
42
|
### Changed/Added
|
11
43
|
- Update activesupport requirement from = 6.0.2.1 to = 6.0.3.1
|
44
|
+
- Update rake requirement from ~> 10.0 to ~> 13.0
|
12
45
|
|
13
46
|
## [0.1.0] 2020-02-09
|
14
47
|
|
data/Gemfile.lock
CHANGED
@@ -1,25 +1,29 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
scrap_kit (0.1.
|
5
|
-
activesupport (
|
4
|
+
scrap_kit (0.1.8)
|
5
|
+
activesupport (~> 6.0)
|
6
6
|
watir (~> 6.16.5)
|
7
|
+
webdrivers (~> 4.0)
|
7
8
|
|
8
9
|
GEM
|
9
10
|
remote: https://rubygems.org/
|
10
11
|
specs:
|
11
|
-
activesupport (6.0.3.
|
12
|
+
activesupport (6.0.3.2)
|
12
13
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
14
|
i18n (>= 0.7, < 2)
|
14
15
|
minitest (~> 5.1)
|
15
16
|
tzinfo (~> 1.1)
|
16
17
|
zeitwerk (~> 2.2, >= 2.2.2)
|
17
18
|
childprocess (3.0.0)
|
18
|
-
concurrent-ruby (1.1.
|
19
|
+
concurrent-ruby (1.1.7)
|
19
20
|
diff-lcs (1.3)
|
20
|
-
i18n (1.8.
|
21
|
+
i18n (1.8.5)
|
21
22
|
concurrent-ruby (~> 1.0)
|
23
|
+
mini_portile2 (2.4.0)
|
22
24
|
minitest (5.14.1)
|
25
|
+
nokogiri (1.10.10)
|
26
|
+
mini_portile2 (~> 2.4.0)
|
23
27
|
rake (13.0.1)
|
24
28
|
regexp_parser (1.7.1)
|
25
29
|
rspec (3.9.0)
|
@@ -45,7 +49,11 @@ GEM
|
|
45
49
|
watir (6.16.5)
|
46
50
|
regexp_parser (~> 1.2)
|
47
51
|
selenium-webdriver (~> 3.6)
|
48
|
-
|
52
|
+
webdrivers (4.4.1)
|
53
|
+
nokogiri (~> 1.6)
|
54
|
+
rubyzip (>= 1.3.0)
|
55
|
+
selenium-webdriver (>= 3.0, < 4.0)
|
56
|
+
zeitwerk (2.4.0)
|
49
57
|
|
50
58
|
PLATFORMS
|
51
59
|
ruby
|
data/README.md
CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
|
|
81
81
|
#=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
|
82
82
|
```
|
83
83
|
|
84
|
+
### Working with selectors
|
85
|
+
|
86
|
+
Each attribute can be mapped to a selector, which can be any of the following types:
|
87
|
+
|
88
|
+
* A string, which represents a CSS selector.
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
|
92
|
+
```
|
93
|
+
|
94
|
+
* A hash, which can have any of the following options:
|
95
|
+
* `xpath: [String]`
|
96
|
+
* `css: [String]`
|
97
|
+
* `index: [Integer]`
|
98
|
+
* `tag_name: [String]`
|
99
|
+
* `text: [String]`
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
{ text: "View Archive" }
|
103
|
+
```
|
104
|
+
|
105
|
+
* An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
[".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
|
109
|
+
```
|
110
|
+
|
111
|
+
Use any of them as it suits you best.
|
112
|
+
|
113
|
+
### Writing steps
|
114
|
+
|
115
|
+
Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
|
116
|
+
|
117
|
+
* **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
{
|
121
|
+
goto: { text: "View Archive" }
|
122
|
+
}
|
123
|
+
```
|
124
|
+
|
125
|
+
* **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
{
|
129
|
+
click: { css: "[type=submit]" }
|
130
|
+
}
|
131
|
+
```
|
132
|
+
|
133
|
+
* **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
{
|
137
|
+
fill_form: {
|
138
|
+
gem_name: "ScrapKit",
|
139
|
+
author: "hpneo",
|
140
|
+
}
|
141
|
+
}
|
142
|
+
```
|
143
|
+
|
84
144
|
## Development
|
85
145
|
|
86
146
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/scrap_kit/recipe.rb
CHANGED
@@ -1,8 +1,23 @@
|
|
1
1
|
require "active_support/core_ext/hash"
|
2
|
+
require "webdrivers/chromedriver"
|
2
3
|
require "watir"
|
3
4
|
|
4
5
|
module ScrapKit
|
5
6
|
class Recipe
|
7
|
+
class << self
|
8
|
+
def load(source)
|
9
|
+
input = if source.is_a?(Hash)
|
10
|
+
source
|
11
|
+
elsif source.is_a?(IO)
|
12
|
+
JSON.parse(source.read)
|
13
|
+
else
|
14
|
+
JSON.parse(File.read(source))
|
15
|
+
end
|
16
|
+
|
17
|
+
new(input.deep_symbolize_keys)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
6
21
|
def initialize(url: nil, steps: [], attributes: {})
|
7
22
|
@url = url
|
8
23
|
@steps = steps
|
@@ -12,76 +27,169 @@ module ScrapKit
|
|
12
27
|
def run
|
13
28
|
output = {}
|
14
29
|
|
15
|
-
browser = Watir::Browser.new(:chrome, headless: true)
|
16
|
-
browser.goto @url
|
30
|
+
@browser = Watir::Browser.new(:chrome, headless: true)
|
31
|
+
@browser.goto @url
|
17
32
|
|
18
33
|
@steps.each do |step|
|
19
|
-
run_step(
|
34
|
+
run_step(step)
|
20
35
|
end
|
21
36
|
|
22
37
|
@attributes.each do |attribute_name, selector|
|
23
|
-
output[attribute_name] = extract_attribute(browser, selector)
|
38
|
+
output[attribute_name] = extract_attribute(@browser, selector)
|
24
39
|
end
|
25
40
|
|
26
|
-
browser.close
|
27
|
-
browser = nil
|
41
|
+
@browser.close
|
42
|
+
@browser = nil
|
28
43
|
|
29
44
|
output
|
30
45
|
end
|
31
46
|
|
32
|
-
def run_step(
|
47
|
+
def run_step(step)
|
48
|
+
return goto(step[:goto]) if step[:goto]
|
49
|
+
return click(step[:click]) if step[:click]
|
50
|
+
return fill_form(step[:fill_form]) if step[:fill_form]
|
51
|
+
|
52
|
+
nil
|
53
|
+
end
|
54
|
+
|
55
|
+
def find_element_by_name_or_selector(browser_or_element, name_or_selector)
|
56
|
+
element = browser_or_element.element(name: name_or_selector.to_s)
|
57
|
+
return element if element.exists?
|
58
|
+
|
59
|
+
element = browser_or_element.element(css: name_or_selector.to_s)
|
60
|
+
return element if element.exists?
|
61
|
+
|
62
|
+
nil
|
33
63
|
end
|
34
64
|
|
35
65
|
def elements_from_selector(browser_or_element, selector)
|
36
66
|
if selector.is_a?(String)
|
37
67
|
browser_or_element.elements(css: selector)
|
68
|
+
elsif selector.is_a?(Hash)
|
69
|
+
browser_or_element.elements(selector)
|
38
70
|
elsif selector.is_a?(Array)
|
39
71
|
*remainder, condition = selector
|
72
|
+
condition_key, condition_value = condition.first
|
40
73
|
elements = browser_or_element
|
41
74
|
|
42
|
-
remainder.
|
43
|
-
elements = elements.elements(css:
|
75
|
+
if remainder.empty?
|
76
|
+
elements = elements.elements(css: condition_key.to_s)
|
77
|
+
else
|
78
|
+
remainder.each do |item|
|
79
|
+
elements = elements.elements(css: item)
|
80
|
+
end
|
44
81
|
end
|
45
82
|
|
46
83
|
elements.filter do |element|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
84
|
+
found_element = element.element(css: condition_key.to_s)
|
85
|
+
extracted_value = extract_value_from_element(found_element)
|
86
|
+
extracted_value.match(condition_value) || extracted_value == condition_value
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
def extract_value_from_element(element)
|
92
|
+
if element&.respond_to?(:tag_name)
|
93
|
+
if element.tag_name.downcase == "input"
|
94
|
+
return element.attribute_value(:value)
|
51
95
|
end
|
52
96
|
end
|
97
|
+
|
98
|
+
element&.text_content
|
53
99
|
end
|
54
100
|
|
55
|
-
def extract_attribute(browser_or_element,
|
56
|
-
if
|
57
|
-
browser_or_element.element(css:
|
58
|
-
elsif
|
59
|
-
|
60
|
-
|
101
|
+
def extract_attribute(browser_or_element, selector_or_object)
|
102
|
+
if selector_or_object.is_a?(String)
|
103
|
+
extract_value_from_element(browser_or_element.element(css: selector_or_object))
|
104
|
+
elsif selector_or_object.is_a?(Array)
|
105
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
106
|
+
|
107
|
+
if found_elements.size === 1
|
108
|
+
extract_value_from_element(found_elements.first)
|
109
|
+
else
|
110
|
+
found_elements.map do |element|
|
111
|
+
extract_value_from_element(element)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
elsif selector_or_object.is_a?(Hash)
|
115
|
+
if selector_or_object[:selector] && selector_or_object[:children_attributes]
|
116
|
+
selector = selector_or_object[:selector]
|
117
|
+
selector_for_children_attributes = selector_or_object[:children_attributes]
|
118
|
+
|
119
|
+
elements_from_selector(browser_or_element, selector).map do |element|
|
120
|
+
output = {}
|
61
121
|
|
62
|
-
|
63
|
-
|
122
|
+
selector_for_children_attributes.each do |child_attribute_name, child_selector|
|
123
|
+
output[child_attribute_name] = extract_attribute(element, child_selector)
|
124
|
+
end
|
64
125
|
|
65
|
-
|
66
|
-
|
126
|
+
output
|
127
|
+
end
|
128
|
+
else
|
129
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
130
|
+
|
131
|
+
if found_elements.size === 1
|
132
|
+
extract_value_from_element(found_elements.first)
|
133
|
+
else
|
134
|
+
found_elements.map do |element|
|
135
|
+
extract_value_from_element(element)
|
136
|
+
end
|
67
137
|
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
private
|
68
143
|
|
69
|
-
|
144
|
+
def goto(link_or_selector)
|
145
|
+
if link_or_selector.is_a?(String)
|
146
|
+
@browser.goto(link_or_selector)
|
147
|
+
elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
|
148
|
+
if found_element = elements_from_selector(@browser, link_or_selector).first
|
149
|
+
found_element.click
|
70
150
|
end
|
71
151
|
end
|
152
|
+
|
153
|
+
sleep 0.5
|
154
|
+
@browser.wait_until do
|
155
|
+
@browser.ready_state == "complete"
|
156
|
+
end
|
157
|
+
rescue
|
158
|
+
nil
|
72
159
|
end
|
73
160
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
elsif source.is_a?(IO)
|
79
|
-
JSON.parse(source.read)
|
80
|
-
else
|
81
|
-
JSON.parse(File.read(source))
|
161
|
+
def click(selector)
|
162
|
+
if selector.is_a?(Array) || selector.is_a?(Hash)
|
163
|
+
if found_element = elements_from_selector(@browser, selector).first
|
164
|
+
found_element.click
|
82
165
|
end
|
166
|
+
end
|
83
167
|
|
84
|
-
|
168
|
+
sleep 1
|
169
|
+
@browser.wait_until do
|
170
|
+
@browser.ready_state == "complete"
|
171
|
+
end
|
172
|
+
|
173
|
+
rescue
|
174
|
+
nil
|
175
|
+
end
|
176
|
+
|
177
|
+
def fill_form(form_data)
|
178
|
+
form_data.each do |name, value|
|
179
|
+
if element = find_element_by_name_or_selector(@browser.body, name.to_s)
|
180
|
+
element = element.to_subtype
|
181
|
+
|
182
|
+
if element.respond_to?(:set)
|
183
|
+
element.set(value)
|
184
|
+
elsif element.respond_to?(:select)
|
185
|
+
element.select(value)
|
186
|
+
end
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
sleep 0.25
|
191
|
+
@browser.wait_until do
|
192
|
+
@browser.ready_state == "complete"
|
85
193
|
end
|
86
194
|
end
|
87
195
|
end
|
data/lib/scrap_kit/version.rb
CHANGED
data/scrap_kit.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "rake", "~> 13.0"
|
33
33
|
spec.add_development_dependency "rspec", "~> 3.0"
|
34
34
|
spec.add_dependency "watir", "~> 6.16.5"
|
35
|
-
spec.add_dependency "
|
35
|
+
spec.add_dependency "webdrivers", "~> 4.0"
|
36
|
+
spec.add_dependency "activesupport", "~> 6.0"
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrap_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gustavo Leon
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,20 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 6.16.5
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: webdrivers
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '4.0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '4.0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: activesupport
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- -
|
87
|
+
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: 6.0
|
89
|
+
version: '6.0'
|
76
90
|
type: :runtime
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
|
-
- -
|
94
|
+
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: 6.0
|
96
|
+
version: '6.0'
|
83
97
|
description: Run JSON-based recipes to scrap web sites.
|
84
98
|
email:
|
85
99
|
- hpneo@hotmail.com
|
@@ -87,7 +101,7 @@ executables: []
|
|
87
101
|
extensions: []
|
88
102
|
extra_rdoc_files: []
|
89
103
|
files:
|
90
|
-
- ".github/workflows/
|
104
|
+
- ".github/workflows/run_tests.yml"
|
91
105
|
- ".gitignore"
|
92
106
|
- ".rspec"
|
93
107
|
- ".rubocop.yml"
|