scrap_kit 0.1.3 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cd2924b0ba55e1d73238b17b4bf15b6fb36a8df61910e8af0a5876ed44ada36a
4
- data.tar.gz: 897da113e002b93980a735beb87138f73b39cf2f3961d267c99bba291d6553b6
3
+ metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
4
+ data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
5
5
  SHA512:
6
- metadata.gz: fb70c2d388c38e8f1b7a42f31ab661d8e277fffbba6750e774d89c34d7ef6ac277eaefe6b17a83924cc8f41e9deb61c280839f673388a7b55310d024b554df84
7
- data.tar.gz: 9dfe5f761476abb11ac5e19e905f88694aade27d52e2cea7bbf92dd3cece6dda9cd62a41f0bdff25509dbad321435c907d005e276319b2e9dce05d3ab9b7daf1
6
+ metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
7
+ data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
@@ -0,0 +1,20 @@
1
+ name: Run tests
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v1
12
+ - name: Setup Ruby
13
+ uses: actions/setup-ruby@v1
14
+ with:
15
+ ruby-version: '2.7.1'
16
+ - name: Build and run tests
17
+ run: |
18
+ gem install bundler
19
+ bundle update --conservative --jobs 4 --retry 3
20
+ bundle exec rake spec
@@ -1,5 +1,37 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.8] 2020-08-29
4
+
5
+ ### Changed/Added
6
+ - Add new ways to declare selectors
7
+ - Add steps
8
+
9
+ ## [0.1.7] 2020-08-28
10
+
11
+ ### Changed/Added
12
+ - Match selector condition by regexp or exact value
13
+
14
+ ## [0.1.6] 2020-08-28
15
+
16
+ ### Changed/Added
17
+ - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
18
+ - Fix bug when matching selector condition
19
+
20
+ ## [0.1.5] 2020-08-08
21
+
22
+ ### Changed/Added
23
+ - Add `webdrivers` as dependency
24
+
25
+ ## [0.1.4] 2020-07-12
26
+
27
+ ### Changed/Added
28
+ - Add support for `<input />` elements
29
+
30
+ ## [0.1.3] 2020-06-18
31
+
32
+ ### Changed/Added
33
+ - Moved development dependencies as dependencies
34
+
3
35
  ## [0.1.2] 2020-06-18
4
36
 
5
37
  ### Changed/Added
@@ -9,6 +41,7 @@
9
41
 
10
42
  ### Changed/Added
11
43
  - Update activesupport requirement from = 6.0.2.1 to = 6.0.3.1
44
+ - Update rake requirement from ~> 10.0 to ~> 13.0
12
45
 
13
46
  ## [0.1.0] 2020-02-09
14
47
 
@@ -1,25 +1,29 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.3)
5
- activesupport (= 6.0.3.1)
4
+ scrap_kit (0.1.8)
5
+ activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
+ webdrivers (~> 4.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- activesupport (6.0.3.1)
12
+ activesupport (6.0.3.2)
12
13
  concurrent-ruby (~> 1.0, >= 1.0.2)
13
14
  i18n (>= 0.7, < 2)
14
15
  minitest (~> 5.1)
15
16
  tzinfo (~> 1.1)
16
17
  zeitwerk (~> 2.2, >= 2.2.2)
17
18
  childprocess (3.0.0)
18
- concurrent-ruby (1.1.6)
19
+ concurrent-ruby (1.1.7)
19
20
  diff-lcs (1.3)
20
- i18n (1.8.3)
21
+ i18n (1.8.5)
21
22
  concurrent-ruby (~> 1.0)
23
+ mini_portile2 (2.4.0)
22
24
  minitest (5.14.1)
25
+ nokogiri (1.10.10)
26
+ mini_portile2 (~> 2.4.0)
23
27
  rake (13.0.1)
24
28
  regexp_parser (1.7.1)
25
29
  rspec (3.9.0)
@@ -45,7 +49,11 @@ GEM
45
49
  watir (6.16.5)
46
50
  regexp_parser (~> 1.2)
47
51
  selenium-webdriver (~> 3.6)
48
- zeitwerk (2.3.0)
52
+ webdrivers (4.4.1)
53
+ nokogiri (~> 1.6)
54
+ rubyzip (>= 1.3.0)
55
+ selenium-webdriver (>= 3.0, < 4.0)
56
+ zeitwerk (2.4.0)
49
57
 
50
58
  PLATFORMS
51
59
  ruby
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -1,8 +1,23 @@
1
1
  require "active_support/core_ext/hash"
2
+ require "webdrivers/chromedriver"
2
3
  require "watir"
3
4
 
4
5
  module ScrapKit
5
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
6
21
  def initialize(url: nil, steps: [], attributes: {})
7
22
  @url = url
8
23
  @steps = steps
@@ -12,76 +27,169 @@ module ScrapKit
12
27
  def run
13
28
  output = {}
14
29
 
15
- browser = Watir::Browser.new(:chrome, headless: true)
16
- browser.goto @url
30
+ @browser = Watir::Browser.new(:chrome, headless: true)
31
+ @browser.goto @url
17
32
 
18
33
  @steps.each do |step|
19
- run_step(browser, step)
34
+ run_step(step)
20
35
  end
21
36
 
22
37
  @attributes.each do |attribute_name, selector|
23
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
24
39
  end
25
40
 
26
- browser.close
27
- browser = nil
41
+ @browser.close
42
+ @browser = nil
28
43
 
29
44
  output
30
45
  end
31
46
 
32
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
33
63
  end
34
64
 
35
65
  def elements_from_selector(browser_or_element, selector)
36
66
  if selector.is_a?(String)
37
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
38
70
  elsif selector.is_a?(Array)
39
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
40
73
  elements = browser_or_element
41
74
 
42
- remainder.each do |item|
43
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
44
81
  end
45
82
 
46
83
  elements.filter do |element|
47
- condition_key = condition.keys[0].to_s
48
- condition_value = condition.values[0]
49
- found_element = element.element(css: condition_key)
50
- found_element&.text_content&.match(condition_value)
84
+ found_element = element.element(css: condition_key.to_s)
85
+ extracted_value = extract_value_from_element(found_element)
86
+ extracted_value.match(condition_value) || extracted_value == condition_value
87
+ end
88
+ end
89
+ end
90
+
91
+ def extract_value_from_element(element)
92
+ if element&.respond_to?(:tag_name)
93
+ if element.tag_name.downcase == "input"
94
+ return element.attribute_value(:value)
51
95
  end
52
96
  end
97
+
98
+ element&.text_content
53
99
  end
54
100
 
55
- def extract_attribute(browser_or_element, selector_or_hash)
56
- if selector_or_hash.is_a?(String)
57
- browser_or_element.element(css: selector_or_hash)&.text_content
58
- elsif selector_or_hash.is_a?(Hash)
59
- selector = selector_or_hash[:selector]
60
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
106
+
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
118
+
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
61
121
 
62
- elements_from_selector(browser_or_element, selector).map do |element|
63
- output = {}
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
64
125
 
65
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
66
- output[child_attribute_name] = extract_attribute(element, child_selector)
126
+ output
127
+ end
128
+ else
129
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
130
+
131
+ if found_elements.size === 1
132
+ extract_value_from_element(found_elements.first)
133
+ else
134
+ found_elements.map do |element|
135
+ extract_value_from_element(element)
136
+ end
67
137
  end
138
+ end
139
+ end
140
+ end
141
+
142
+ private
68
143
 
69
- output
144
+ def goto(link_or_selector)
145
+ if link_or_selector.is_a?(String)
146
+ @browser.goto(link_or_selector)
147
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
148
+ if found_element = elements_from_selector(@browser, link_or_selector).first
149
+ found_element.click
70
150
  end
71
151
  end
152
+
153
+ sleep 0.5
154
+ @browser.wait_until do
155
+ @browser.ready_state == "complete"
156
+ end
157
+ rescue
158
+ nil
72
159
  end
73
160
 
74
- class << self
75
- def load(source)
76
- input = if source.is_a?(Hash)
77
- source
78
- elsif source.is_a?(IO)
79
- JSON.parse(source.read)
80
- else
81
- JSON.parse(File.read(source))
161
+ def click(selector)
162
+ if selector.is_a?(Array) || selector.is_a?(Hash)
163
+ if found_element = elements_from_selector(@browser, selector).first
164
+ found_element.click
82
165
  end
166
+ end
83
167
 
84
- new(input.deep_symbolize_keys)
168
+ sleep 1
169
+ @browser.wait_until do
170
+ @browser.ready_state == "complete"
171
+ end
172
+
173
+ rescue
174
+ nil
175
+ end
176
+
177
+ def fill_form(form_data)
178
+ form_data.each do |name, value|
179
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
180
+ element = element.to_subtype
181
+
182
+ if element.respond_to?(:set)
183
+ element.set(value)
184
+ elsif element.respond_to?(:select)
185
+ element.select(value)
186
+ end
187
+ end
188
+ end
189
+
190
+ sleep 0.25
191
+ @browser.wait_until do
192
+ @browser.ready_state == "complete"
85
193
  end
86
194
  end
87
195
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "rake", "~> 13.0"
33
33
  spec.add_development_dependency "rspec", "~> 3.0"
34
34
  spec.add_dependency "watir", "~> 6.16.5"
35
- spec.add_dependency "activesupport", "6.0.3.1"
35
+ spec.add_dependency "webdrivers", "~> 4.0"
36
+ spec.add_dependency "activesupport", "~> 6.0"
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-19 00:00:00.000000000 Z
11
+ date: 2020-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,34 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 6.16.5
69
+ - !ruby/object:Gem::Dependency
70
+ name: webdrivers
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: activesupport
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - '='
87
+ - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: 6.0.3.1
89
+ version: '6.0'
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - '='
94
+ - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: 6.0.3.1
96
+ version: '6.0'
83
97
  description: Run JSON-based recipes to scrap web sites.
84
98
  email:
85
99
  - hpneo@hotmail.com
@@ -87,7 +101,7 @@ executables: []
87
101
  extensions: []
88
102
  extra_rdoc_files: []
89
103
  files:
90
- - ".github/workflows/publish_gem.yml"
104
+ - ".github/workflows/run_tests.yml"
91
105
  - ".gitignore"
92
106
  - ".rspec"
93
107
  - ".rubocop.yml"
@@ -1,17 +0,0 @@
1
- name: Publish gem
2
-
3
- on:
4
- push:
5
- tags:
6
- - '*'
7
-
8
- jobs:
9
- build:
10
-
11
- runs-on: ubuntu-latest
12
-
13
- steps:
14
- - name: Publish gem
15
- uses: dawidd6/action-publish-gem@v1.0.0
16
- with:
17
- api_key: ${{secrets.RUBYGEMS_API_KEY}}