scrap_kit 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9b98409a1148b80bc3f7a908779d3881f36fe3f8a8de2e1726135ea6488e24b2
4
- data.tar.gz: a5b66ae5611527e93bee6167dfd4d365822f4122258054c781b09a290644ad80
3
+ metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
4
+ data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
5
5
  SHA512:
6
- metadata.gz: 863a0c56ada01470ff55e655c8a1dfa2159d05aa3da5cf3107c7a48509ee31e292bd383955d4e45c2b5f1962f7ea659b5d893c6cd5a8d63fd4d72000b975f137
7
- data.tar.gz: 9a1345ee740fbfcab5b984be2ecb9c6b57a23a715bd19e7cc77c002cc4e0a4115cb9eeed52c6cd1672f4ff7b6eb9c7082d4bc4173b5c5d3e37f2fa17db2bfce8
6
+ metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
7
+ data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
@@ -1,9 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.8] 2020-08-29
4
+
5
+ ### Changed/Added
6
+ - Add new ways to declare selectors
7
+ - Add steps
8
+
3
9
  ## [0.1.7] 2020-08-28
10
+
11
+ ### Changed/Added
4
12
  - Match selector condition by regexp or exact value
5
13
 
6
14
  ## [0.1.6] 2020-08-28
15
+
16
+ ### Changed/Added
7
17
  - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
8
18
  - Fix bug when matching selector condition
9
19
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.7)
4
+ scrap_kit (0.1.8)
5
5
  activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, whose last item must be a hash that maps a selector to its expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use whichever of them suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the preliminary actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,20 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
7
21
  def initialize(url: nil, steps: [], attributes: {})
8
22
  @url = url
9
23
  @steps = steps
@@ -13,41 +27,61 @@ module ScrapKit
13
27
  def run
14
28
  output = {}
15
29
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
30
+ @browser = Watir::Browser.new(:chrome, headless: true)
31
+ @browser.goto @url
18
32
 
19
33
  @steps.each do |step|
20
- run_step(browser, step)
34
+ run_step(step)
21
35
  end
22
36
 
23
37
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
25
39
  end
26
40
 
27
- browser.close
28
- browser = nil
41
+ @browser.close
42
+ @browser = nil
29
43
 
30
44
  output
31
45
  end
32
46
 
33
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
34
63
  end
35
64
 
36
65
  def elements_from_selector(browser_or_element, selector)
37
66
  if selector.is_a?(String)
38
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
39
70
  elsif selector.is_a?(Array)
40
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
41
73
  elements = browser_or_element
42
74
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
45
81
  end
46
82
 
47
83
  elements.filter do |element|
48
- condition_key = condition.keys.first.to_s
49
- condition_value = condition.values.first
50
- found_element = element.element(css: condition_key)
84
+ found_element = element.element(css: condition_key.to_s)
51
85
  extracted_value = extract_value_from_element(found_element)
52
86
  extracted_value.match(condition_value) || extracted_value == condition_value
53
87
  end
@@ -64,36 +98,98 @@ module ScrapKit
64
98
  element&.text_content
65
99
  end
66
100
 
67
- def extract_attribute(browser_or_element, selector_or_hash)
68
- if selector_or_hash.is_a?(String)
69
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
70
- elsif selector_or_hash.is_a?(Hash)
71
- selector = selector_or_hash[:selector]
72
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
73
106
 
74
- elements_from_selector(browser_or_element, selector).map do |element|
75
- output = {}
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
76
118
 
77
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
78
- output[child_attribute_name] = extract_attribute(element, child_selector)
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
121
+
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
125
+
126
+ output
79
127
  end
128
+ else
129
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
130
+
131
+ if found_elements.size === 1
132
+ extract_value_from_element(found_elements.first)
133
+ else
134
+ found_elements.map do |element|
135
+ extract_value_from_element(element)
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
80
141
 
81
- output
142
+ private
143
+
144
+ def goto(link_or_selector)
145
+ if link_or_selector.is_a?(String)
146
+ @browser.goto(link_or_selector)
147
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
148
+ if found_element = elements_from_selector(@browser, link_or_selector).first
149
+ found_element.click
82
150
  end
83
151
  end
152
+
153
+ sleep 0.5
154
+ @browser.wait_until do
155
+ @browser.ready_state == "complete"
156
+ end
157
+ rescue
158
+ nil
84
159
  end
85
160
 
86
- class << self
87
- def load(source)
88
- input = if source.is_a?(Hash)
89
- source
90
- elsif source.is_a?(IO)
91
- JSON.parse(source.read)
92
- else
93
- JSON.parse(File.read(source))
161
+ def click(selector)
162
+ if selector.is_a?(Array) || selector.is_a?(Hash)
163
+ if found_element = elements_from_selector(@browser, selector).first
164
+ found_element.click
94
165
  end
166
+ end
95
167
 
96
- new(input.deep_symbolize_keys)
168
+ sleep 1
169
+ @browser.wait_until do
170
+ @browser.ready_state == "complete"
171
+ end
172
+
173
+ rescue
174
+ nil
175
+ end
176
+
177
+ def fill_form(form_data)
178
+ form_data.each do |name, value|
179
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
180
+ element = element.to_subtype
181
+
182
+ if element.respond_to?(:set)
183
+ element.set(value)
184
+ elsif element.respond_to?(:select)
185
+ element.select(value)
186
+ end
187
+ end
188
+ end
189
+
190
+ sleep 0.25
191
+ @browser.wait_until do
192
+ @browser.ready_state == "complete"
97
193
  end
98
194
  end
99
195
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-28 00:00:00.000000000 Z
11
+ date: 2020-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler