scrap_kit 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9b98409a1148b80bc3f7a908779d3881f36fe3f8a8de2e1726135ea6488e24b2
4
- data.tar.gz: a5b66ae5611527e93bee6167dfd4d365822f4122258054c781b09a290644ad80
3
+ metadata.gz: 9125aec53d6a517aa7679bc67df8c6ffd366d77afea643fae757ab7862767c0d
4
+ data.tar.gz: bbfa2cae0560461e7fb1f5af1cb672a0b5c11686e75696b24578861e0d14aa1c
5
5
  SHA512:
6
- metadata.gz: 863a0c56ada01470ff55e655c8a1dfa2159d05aa3da5cf3107c7a48509ee31e292bd383955d4e45c2b5f1962f7ea659b5d893c6cd5a8d63fd4d72000b975f137
7
- data.tar.gz: 9a1345ee740fbfcab5b984be2ecb9c6b57a23a715bd19e7cc77c002cc4e0a4115cb9eeed52c6cd1672f4ff7b6eb9c7082d4bc4173b5c5d3e37f2fa17db2bfce8
6
+ metadata.gz: f2f8e6b7cb709ec2db86696adf27ec64e9c5faf2c339248ef34cb31764155d11de7904e1e881b7a21ba933469ebbccee998db35748030544fd699cb41a2cd7e1
7
+ data.tar.gz: 79fcdcf90aae48eefdf83ed6e285114221047f007a15e7e5d3f5b357091b37f7f51e0778c6ad03f03caa0fa291666831f02a1459d426e522aef1f6c02b4b509d
@@ -1,9 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.8] 2020-08-29
4
+
5
+ ### Changed/Added
6
+ - Add new ways to declare selectors
7
+ - Add steps
8
+
3
9
  ## [0.1.7] 2020-08-28
10
+
11
+ ### Changed/Added
4
12
  - Match selector condition by regexp or exact value
5
13
 
6
14
  ## [0.1.6] 2020-08-28
15
+
16
+ ### Changed/Added
7
17
  - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
8
18
  - Fix bug when matching selector condition
9
19
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.7)
4
+ scrap_kit (0.1.8)
5
5
  activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,20 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
7
21
  def initialize(url: nil, steps: [], attributes: {})
8
22
  @url = url
9
23
  @steps = steps
@@ -13,41 +27,61 @@ module ScrapKit
13
27
  def run
14
28
  output = {}
15
29
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
30
+ @browser = Watir::Browser.new(:chrome, headless: true)
31
+ @browser.goto @url
18
32
 
19
33
  @steps.each do |step|
20
- run_step(browser, step)
34
+ run_step(step)
21
35
  end
22
36
 
23
37
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
25
39
  end
26
40
 
27
- browser.close
28
- browser = nil
41
+ @browser.close
42
+ @browser = nil
29
43
 
30
44
  output
31
45
  end
32
46
 
33
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
34
63
  end
35
64
 
36
65
  def elements_from_selector(browser_or_element, selector)
37
66
  if selector.is_a?(String)
38
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
39
70
  elsif selector.is_a?(Array)
40
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
41
73
  elements = browser_or_element
42
74
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
45
81
  end
46
82
 
47
83
  elements.filter do |element|
48
- condition_key = condition.keys.first.to_s
49
- condition_value = condition.values.first
50
- found_element = element.element(css: condition_key)
84
+ found_element = element.element(css: condition_key.to_s)
51
85
  extracted_value = extract_value_from_element(found_element)
52
86
  extracted_value.match(condition_value) || extracted_value == condition_value
53
87
  end
@@ -64,36 +98,98 @@ module ScrapKit
64
98
  element&.text_content
65
99
  end
66
100
 
67
- def extract_attribute(browser_or_element, selector_or_hash)
68
- if selector_or_hash.is_a?(String)
69
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
70
- elsif selector_or_hash.is_a?(Hash)
71
- selector = selector_or_hash[:selector]
72
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
73
106
 
74
- elements_from_selector(browser_or_element, selector).map do |element|
75
- output = {}
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
76
118
 
77
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
78
- output[child_attribute_name] = extract_attribute(element, child_selector)
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
121
+
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
125
+
126
+ output
79
127
  end
128
+ else
129
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
130
+
131
+ if found_elements.size === 1
132
+ extract_value_from_element(found_elements.first)
133
+ else
134
+ found_elements.map do |element|
135
+ extract_value_from_element(element)
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
80
141
 
81
- output
142
+ private
143
+
144
+ def goto(link_or_selector)
145
+ if link_or_selector.is_a?(String)
146
+ @browser.goto(link_or_selector)
147
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
148
+ if found_element = elements_from_selector(@browser, link_or_selector).first
149
+ found_element.click
82
150
  end
83
151
  end
152
+
153
+ sleep 0.5
154
+ @browser.wait_until do
155
+ @browser.ready_state == "complete"
156
+ end
157
+ rescue
158
+ nil
84
159
  end
85
160
 
86
- class << self
87
- def load(source)
88
- input = if source.is_a?(Hash)
89
- source
90
- elsif source.is_a?(IO)
91
- JSON.parse(source.read)
92
- else
93
- JSON.parse(File.read(source))
161
+ def click(selector)
162
+ if selector.is_a?(Array) || selector.is_a?(Hash)
163
+ if found_element = elements_from_selector(@browser, selector).first
164
+ found_element.click
94
165
  end
166
+ end
95
167
 
96
- new(input.deep_symbolize_keys)
168
+ sleep 1
169
+ @browser.wait_until do
170
+ @browser.ready_state == "complete"
171
+ end
172
+
173
+ rescue
174
+ nil
175
+ end
176
+
177
+ def fill_form(form_data)
178
+ form_data.each do |name, value|
179
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
180
+ element = element.to_subtype
181
+
182
+ if element.respond_to?(:set)
183
+ element.set(value)
184
+ elsif element.respond_to?(:select)
185
+ element.select(value)
186
+ end
187
+ end
188
+ end
189
+
190
+ sleep 0.25
191
+ @browser.wait_until do
192
+ @browser.ready_state == "complete"
97
193
  end
98
194
  end
99
195
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-28 00:00:00.000000000 Z
11
+ date: 2020-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler