scrap_kit 0.1.3 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/run_tests.yml +20 -0
- data/CHANGELOG.md +33 -0
- data/Gemfile.lock +14 -6
- data/README.md +60 -0
- data/lib/scrap_kit/recipe.rb +141 -33
- data/lib/scrap_kit/version.rb +1 -1
- data/scrap_kit.gemspec +2 -1
- metadata +21 -7
- data/.github/workflows/publish_gem.yml +0 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
|
4
|
+
data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
|
7
|
+
data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
|
@@ -0,0 +1,20 @@
|
|
1
|
+
name: Run tests
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
|
10
|
+
steps:
|
11
|
+
- uses: actions/checkout@v1
|
12
|
+
- name: Setup Ruby
|
13
|
+
uses: actions/setup-ruby@v1
|
14
|
+
with:
|
15
|
+
ruby-version: '2.7.1'
|
16
|
+
- name: Build and run tests
|
17
|
+
run: |
|
18
|
+
gem install bundler
|
19
|
+
bundle update --conservative --jobs 4 --retry 3
|
20
|
+
bundle exec rake spec
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,37 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [0.1.8] 2020-08-29
|
4
|
+
|
5
|
+
### Changed/Added
|
6
|
+
- Add new ways to declare selectors
|
7
|
+
- Add steps
|
8
|
+
|
9
|
+
## [0.1.7] 2020-08-28
|
10
|
+
|
11
|
+
### Changed/Added
|
12
|
+
- Match selector condition by regexp or exact value
|
13
|
+
|
14
|
+
## [0.1.6] 2020-08-28
|
15
|
+
|
16
|
+
### Changed/Added
|
17
|
+
- Update activesupport requirement from = 6.0.2.1 to ~> 6.0
|
18
|
+
- Fix bug when matching selector condition
|
19
|
+
|
20
|
+
## [0.1.5] 2020-08-08
|
21
|
+
|
22
|
+
### Changed/Added
|
23
|
+
- Add `webdrivers` as dependency
|
24
|
+
|
25
|
+
## [0.1.4] 2020-07-12
|
26
|
+
|
27
|
+
### Changed/Added
|
28
|
+
- Add support for `<input />` elements
|
29
|
+
|
30
|
+
## [0.1.3] 2020-06-18
|
31
|
+
|
32
|
+
### Changed/Added
|
33
|
+
- Moved development dependencies as dependencies
|
34
|
+
|
3
35
|
## [0.1.2] 2020-06-18
|
4
36
|
|
5
37
|
### Changed/Added
|
@@ -9,6 +41,7 @@
|
|
9
41
|
|
10
42
|
### Changed/Added
|
11
43
|
- Update activesupport requirement from = 6.0.2.1 to = 6.0.3.1
|
44
|
+
- Update rake requirement from ~> 10.0 to ~> 13.0
|
12
45
|
|
13
46
|
## [0.1.0] 2020-02-09
|
14
47
|
|
data/Gemfile.lock
CHANGED
@@ -1,25 +1,29 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
scrap_kit (0.1.
|
5
|
-
activesupport (
|
4
|
+
scrap_kit (0.1.8)
|
5
|
+
activesupport (~> 6.0)
|
6
6
|
watir (~> 6.16.5)
|
7
|
+
webdrivers (~> 4.0)
|
7
8
|
|
8
9
|
GEM
|
9
10
|
remote: https://rubygems.org/
|
10
11
|
specs:
|
11
|
-
activesupport (6.0.3.
|
12
|
+
activesupport (6.0.3.2)
|
12
13
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
14
|
i18n (>= 0.7, < 2)
|
14
15
|
minitest (~> 5.1)
|
15
16
|
tzinfo (~> 1.1)
|
16
17
|
zeitwerk (~> 2.2, >= 2.2.2)
|
17
18
|
childprocess (3.0.0)
|
18
|
-
concurrent-ruby (1.1.
|
19
|
+
concurrent-ruby (1.1.7)
|
19
20
|
diff-lcs (1.3)
|
20
|
-
i18n (1.8.
|
21
|
+
i18n (1.8.5)
|
21
22
|
concurrent-ruby (~> 1.0)
|
23
|
+
mini_portile2 (2.4.0)
|
22
24
|
minitest (5.14.1)
|
25
|
+
nokogiri (1.10.10)
|
26
|
+
mini_portile2 (~> 2.4.0)
|
23
27
|
rake (13.0.1)
|
24
28
|
regexp_parser (1.7.1)
|
25
29
|
rspec (3.9.0)
|
@@ -45,7 +49,11 @@ GEM
|
|
45
49
|
watir (6.16.5)
|
46
50
|
regexp_parser (~> 1.2)
|
47
51
|
selenium-webdriver (~> 3.6)
|
48
|
-
|
52
|
+
webdrivers (4.4.1)
|
53
|
+
nokogiri (~> 1.6)
|
54
|
+
rubyzip (>= 1.3.0)
|
55
|
+
selenium-webdriver (>= 3.0, < 4.0)
|
56
|
+
zeitwerk (2.4.0)
|
49
57
|
|
50
58
|
PLATFORMS
|
51
59
|
ruby
|
data/README.md
CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
|
|
81
81
|
#=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
|
82
82
|
```
|
83
83
|
|
84
|
+
### Working with selectors
|
85
|
+
|
86
|
+
Each attribute can be mapped to a selector, which can be any of the following types:
|
87
|
+
|
88
|
+
* A string, which represents a CSS selector.
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
|
92
|
+
```
|
93
|
+
|
94
|
+
* A hash, which can have any of the following options:
|
95
|
+
* `xpath: [String]`
|
96
|
+
* `css: [String]`
|
97
|
+
* `index: [Integer]`
|
98
|
+
* `tag_name: [String]`
|
99
|
+
* `text: [String]`
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
{ text: "View Archive" }
|
103
|
+
```
|
104
|
+
|
105
|
+
* An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
|
106
|
+
|
107
|
+
```ruby
|
108
|
+
[".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
|
109
|
+
```
|
110
|
+
|
111
|
+
Use any of them as it suits you best.
|
112
|
+
|
113
|
+
### Writing steps
|
114
|
+
|
115
|
+
Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
|
116
|
+
|
117
|
+
* **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
{
|
121
|
+
goto: { text: "View Archive" }
|
122
|
+
}
|
123
|
+
```
|
124
|
+
|
125
|
+
* **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
{
|
129
|
+
click: { css: "[type=submit]" }
|
130
|
+
}
|
131
|
+
```
|
132
|
+
|
133
|
+
* **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are what will be entered into those fields:
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
{
|
137
|
+
fill_form: {
|
138
|
+
gem_name: "ScrapKit",
|
139
|
+
author: "hpneo",
|
140
|
+
}
|
141
|
+
}
|
142
|
+
```
|
143
|
+
|
84
144
|
## Development
|
85
145
|
|
86
146
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/scrap_kit/recipe.rb
CHANGED
@@ -1,8 +1,23 @@
|
|
1
1
|
require "active_support/core_ext/hash"
|
2
|
+
require "webdrivers/chromedriver"
|
2
3
|
require "watir"
|
3
4
|
|
4
5
|
module ScrapKit
|
5
6
|
class Recipe
|
7
|
+
class << self
|
8
|
+
def load(source)
|
9
|
+
input = if source.is_a?(Hash)
|
10
|
+
source
|
11
|
+
elsif source.is_a?(IO)
|
12
|
+
JSON.parse(source.read)
|
13
|
+
else
|
14
|
+
JSON.parse(File.read(source))
|
15
|
+
end
|
16
|
+
|
17
|
+
new(input.deep_symbolize_keys)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
6
21
|
def initialize(url: nil, steps: [], attributes: {})
|
7
22
|
@url = url
|
8
23
|
@steps = steps
|
@@ -12,76 +27,169 @@ module ScrapKit
|
|
12
27
|
def run
|
13
28
|
output = {}
|
14
29
|
|
15
|
-
browser = Watir::Browser.new(:chrome, headless: true)
|
16
|
-
browser.goto @url
|
30
|
+
@browser = Watir::Browser.new(:chrome, headless: true)
|
31
|
+
@browser.goto @url
|
17
32
|
|
18
33
|
@steps.each do |step|
|
19
|
-
run_step(
|
34
|
+
run_step(step)
|
20
35
|
end
|
21
36
|
|
22
37
|
@attributes.each do |attribute_name, selector|
|
23
|
-
output[attribute_name] = extract_attribute(browser, selector)
|
38
|
+
output[attribute_name] = extract_attribute(@browser, selector)
|
24
39
|
end
|
25
40
|
|
26
|
-
browser.close
|
27
|
-
browser = nil
|
41
|
+
@browser.close
|
42
|
+
@browser = nil
|
28
43
|
|
29
44
|
output
|
30
45
|
end
|
31
46
|
|
32
|
-
def run_step(
|
47
|
+
def run_step(step)
|
48
|
+
return goto(step[:goto]) if step[:goto]
|
49
|
+
return click(step[:click]) if step[:click]
|
50
|
+
return fill_form(step[:fill_form]) if step[:fill_form]
|
51
|
+
|
52
|
+
nil
|
53
|
+
end
|
54
|
+
|
55
|
+
def find_element_by_name_or_selector(browser_or_element, name_or_selector)
|
56
|
+
element = browser_or_element.element(name: name_or_selector.to_s)
|
57
|
+
return element if element.exists?
|
58
|
+
|
59
|
+
element = browser_or_element.element(css: name_or_selector.to_s)
|
60
|
+
return element if element.exists?
|
61
|
+
|
62
|
+
nil
|
33
63
|
end
|
34
64
|
|
35
65
|
def elements_from_selector(browser_or_element, selector)
|
36
66
|
if selector.is_a?(String)
|
37
67
|
browser_or_element.elements(css: selector)
|
68
|
+
elsif selector.is_a?(Hash)
|
69
|
+
browser_or_element.elements(selector)
|
38
70
|
elsif selector.is_a?(Array)
|
39
71
|
*remainder, condition = selector
|
72
|
+
condition_key, condition_value = condition.first
|
40
73
|
elements = browser_or_element
|
41
74
|
|
42
|
-
remainder.
|
43
|
-
elements = elements.elements(css:
|
75
|
+
if remainder.empty?
|
76
|
+
elements = elements.elements(css: condition_key.to_s)
|
77
|
+
else
|
78
|
+
remainder.each do |item|
|
79
|
+
elements = elements.elements(css: item)
|
80
|
+
end
|
44
81
|
end
|
45
82
|
|
46
83
|
elements.filter do |element|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
84
|
+
found_element = element.element(css: condition_key.to_s)
|
85
|
+
extracted_value = extract_value_from_element(found_element)
|
86
|
+
extracted_value.match(condition_value) || extracted_value == condition_value
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
def extract_value_from_element(element)
|
92
|
+
if element&.respond_to?(:tag_name)
|
93
|
+
if element.tag_name.downcase == "input"
|
94
|
+
return element.attribute_value(:value)
|
51
95
|
end
|
52
96
|
end
|
97
|
+
|
98
|
+
element&.text_content
|
53
99
|
end
|
54
100
|
|
55
|
-
def extract_attribute(browser_or_element,
|
56
|
-
if
|
57
|
-
browser_or_element.element(css:
|
58
|
-
elsif
|
59
|
-
|
60
|
-
|
101
|
+
def extract_attribute(browser_or_element, selector_or_object)
|
102
|
+
if selector_or_object.is_a?(String)
|
103
|
+
extract_value_from_element(browser_or_element.element(css: selector_or_object))
|
104
|
+
elsif selector_or_object.is_a?(Array)
|
105
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
106
|
+
|
107
|
+
if found_elements.size === 1
|
108
|
+
extract_value_from_element(found_elements.first)
|
109
|
+
else
|
110
|
+
found_elements.map do |element|
|
111
|
+
extract_value_from_element(element)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
elsif selector_or_object.is_a?(Hash)
|
115
|
+
if selector_or_object[:selector] && selector_or_object[:children_attributes]
|
116
|
+
selector = selector_or_object[:selector]
|
117
|
+
selector_for_children_attributes = selector_or_object[:children_attributes]
|
118
|
+
|
119
|
+
elements_from_selector(browser_or_element, selector).map do |element|
|
120
|
+
output = {}
|
61
121
|
|
62
|
-
|
63
|
-
|
122
|
+
selector_for_children_attributes.each do |child_attribute_name, child_selector|
|
123
|
+
output[child_attribute_name] = extract_attribute(element, child_selector)
|
124
|
+
end
|
64
125
|
|
65
|
-
|
66
|
-
|
126
|
+
output
|
127
|
+
end
|
128
|
+
else
|
129
|
+
found_elements = elements_from_selector(browser_or_element, selector_or_object)
|
130
|
+
|
131
|
+
if found_elements.size === 1
|
132
|
+
extract_value_from_element(found_elements.first)
|
133
|
+
else
|
134
|
+
found_elements.map do |element|
|
135
|
+
extract_value_from_element(element)
|
136
|
+
end
|
67
137
|
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
private
|
68
143
|
|
69
|
-
|
144
|
+
def goto(link_or_selector)
|
145
|
+
if link_or_selector.is_a?(String)
|
146
|
+
@browser.goto(link_or_selector)
|
147
|
+
elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
|
148
|
+
if found_element = elements_from_selector(@browser, link_or_selector).first
|
149
|
+
found_element.click
|
70
150
|
end
|
71
151
|
end
|
152
|
+
|
153
|
+
sleep 0.5
|
154
|
+
@browser.wait_until do
|
155
|
+
@browser.ready_state == "complete"
|
156
|
+
end
|
157
|
+
rescue
|
158
|
+
nil
|
72
159
|
end
|
73
160
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
elsif source.is_a?(IO)
|
79
|
-
JSON.parse(source.read)
|
80
|
-
else
|
81
|
-
JSON.parse(File.read(source))
|
161
|
+
def click(selector)
|
162
|
+
if selector.is_a?(Array) || selector.is_a?(Hash)
|
163
|
+
if found_element = elements_from_selector(@browser, selector).first
|
164
|
+
found_element.click
|
82
165
|
end
|
166
|
+
end
|
83
167
|
|
84
|
-
|
168
|
+
sleep 1
|
169
|
+
@browser.wait_until do
|
170
|
+
@browser.ready_state == "complete"
|
171
|
+
end
|
172
|
+
|
173
|
+
rescue
|
174
|
+
nil
|
175
|
+
end
|
176
|
+
|
177
|
+
def fill_form(form_data)
|
178
|
+
form_data.each do |name, value|
|
179
|
+
if element = find_element_by_name_or_selector(@browser.body, name.to_s)
|
180
|
+
element = element.to_subtype
|
181
|
+
|
182
|
+
if element.respond_to?(:set)
|
183
|
+
element.set(value)
|
184
|
+
elsif element.respond_to?(:select)
|
185
|
+
element.select(value)
|
186
|
+
end
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
sleep 0.25
|
191
|
+
@browser.wait_until do
|
192
|
+
@browser.ready_state == "complete"
|
85
193
|
end
|
86
194
|
end
|
87
195
|
end
|
data/lib/scrap_kit/version.rb
CHANGED
data/scrap_kit.gemspec
CHANGED
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_development_dependency "rake", "~> 13.0"
|
33
33
|
spec.add_development_dependency "rspec", "~> 3.0"
|
34
34
|
spec.add_dependency "watir", "~> 6.16.5"
|
35
|
-
spec.add_dependency "
|
35
|
+
spec.add_dependency "webdrivers", "~> 4.0"
|
36
|
+
spec.add_dependency "activesupport", "~> 6.0"
|
36
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrap_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gustavo Leon
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,20 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 6.16.5
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: webdrivers
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '4.0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '4.0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: activesupport
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- -
|
87
|
+
- - "~>"
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: 6.0
|
89
|
+
version: '6.0'
|
76
90
|
type: :runtime
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
|
-
- -
|
94
|
+
- - "~>"
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: 6.0
|
96
|
+
version: '6.0'
|
83
97
|
description: Run JSON-based recipes to scrap web sites.
|
84
98
|
email:
|
85
99
|
- hpneo@hotmail.com
|
@@ -87,7 +101,7 @@ executables: []
|
|
87
101
|
extensions: []
|
88
102
|
extra_rdoc_files: []
|
89
103
|
files:
|
90
|
-
- ".github/workflows/
|
104
|
+
- ".github/workflows/run_tests.yml"
|
91
105
|
- ".gitignore"
|
92
106
|
- ".rspec"
|
93
107
|
- ".rubocop.yml"
|