scrap_kit 0.1.7 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9b98409a1148b80bc3f7a908779d3881f36fe3f8a8de2e1726135ea6488e24b2
4
- data.tar.gz: a5b66ae5611527e93bee6167dfd4d365822f4122258054c781b09a290644ad80
3
+ metadata.gz: 8aaad7e12415fe1104dd1810e6800a880e5d407e1025e1aebd029620d3e94152
4
+ data.tar.gz: e0af7af93c10d6a45575eaed22da2f0d9c7472249480d400b4e7dfa557d72a21
5
5
  SHA512:
6
- metadata.gz: 863a0c56ada01470ff55e655c8a1dfa2159d05aa3da5cf3107c7a48509ee31e292bd383955d4e45c2b5f1962f7ea659b5d893c6cd5a8d63fd4d72000b975f137
7
- data.tar.gz: 9a1345ee740fbfcab5b984be2ecb9c6b57a23a715bd19e7cc77c002cc4e0a4115cb9eeed52c6cd1672f4ff7b6eb9c7082d4bc4173b5c5d3e37f2fa17db2bfce8
6
+ metadata.gz: 97cb189bfc69cfaa8649431d34cbd303b2b6d81323437a30eb3f23ac98feba0baa0ed69f20c2e358193d60ccbb9d52110ff489f258af9fc8fe3b086735373697
7
+ data.tar.gz: 57b1d6a4d860b12c3374cdf3a2366c00571a91eb431bc02b2e7d88fbbbb03a85d9ba78414a03bfbb562bcd62157744226940c43b25a3c7c2ed7189c586441b34
@@ -1,9 +1,39 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.12] 2020-09-03
4
+
5
+ ### Changed/Added
6
+ - Return nil if `extract_attribute` fails
7
+
8
+ ## [0.1.11] 2020-09-03
9
+
10
+ ### Changed/Added
11
+ - Add `user_agent` accessor for browser
12
+
13
+ ## [0.1.10] 2020-09-03
14
+
15
+ ### Changed/Added
16
+ - Map attributes to JavaScript calls
17
+
18
+ ## [0.1.9] 2020-08-31
19
+
20
+ ### Changed/Added
21
+ - Set arguments for Chrome driver
22
+
23
+ ## [0.1.8] 2020-08-29
24
+
25
+ ### Changed/Added
26
+ - Add new ways to declare selectors
27
+ - Add steps
28
+
3
29
  ## [0.1.7] 2020-08-28
30
+
31
+ ### Changed/Added
4
32
  - Match selector condition by regexp or exact value
5
33
 
6
34
  ## [0.1.6] 2020-08-28
35
+
36
+ ### Changed/Added
7
37
  - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
8
38
  - Fix bug when matching selector condition
9
39
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.7)
4
+ scrap_kit (0.1.12)
5
5
  activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
@@ -21,7 +21,7 @@ GEM
21
21
  i18n (1.8.5)
22
22
  concurrent-ruby (~> 1.0)
23
23
  mini_portile2 (2.4.0)
24
- minitest (5.14.1)
24
+ minitest (5.14.2)
25
25
  nokogiri (1.10.10)
26
26
  mini_portile2 (~> 2.4.0)
27
27
  rake (13.0.1)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,22 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ attr_accessor :user_agent
8
+
9
+ class << self
10
+ def load(source)
11
+ input = if source.is_a?(Hash)
12
+ source
13
+ elsif source.is_a?(IO)
14
+ JSON.parse(source.read)
15
+ else
16
+ JSON.parse(File.read(source))
17
+ end
18
+
19
+ new(input.deep_symbolize_keys)
20
+ end
21
+ end
22
+
7
23
  def initialize(url: nil, steps: [], attributes: {})
8
24
  @url = url
9
25
  @steps = steps
@@ -13,41 +29,61 @@ module ScrapKit
13
29
  def run
14
30
  output = {}
15
31
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
32
+ @browser = create_browser
33
+ @browser.goto @url
18
34
 
19
35
  @steps.each do |step|
20
- run_step(browser, step)
36
+ run_step(step)
21
37
  end
22
38
 
23
39
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
40
+ output[attribute_name] = extract_attribute(@browser, selector)
25
41
  end
26
42
 
27
- browser.close
28
- browser = nil
43
+ @browser.close
44
+ @browser = nil
29
45
 
30
46
  output
31
47
  end
32
48
 
33
- def run_step(browser, step)
49
+ def run_step(step)
50
+ return goto(step[:goto]) if step[:goto]
51
+ return click(step[:click]) if step[:click]
52
+ return fill_form(step[:fill_form]) if step[:fill_form]
53
+
54
+ nil
55
+ end
56
+
57
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
58
+ element = browser_or_element.element(name: name_or_selector.to_s)
59
+ return element if element.exists?
60
+
61
+ element = browser_or_element.element(css: name_or_selector.to_s)
62
+ return element if element.exists?
63
+
64
+ nil
34
65
  end
35
66
 
36
67
  def elements_from_selector(browser_or_element, selector)
37
68
  if selector.is_a?(String)
38
69
  browser_or_element.elements(css: selector)
70
+ elsif selector.is_a?(Hash)
71
+ browser_or_element.elements(selector)
39
72
  elsif selector.is_a?(Array)
40
73
  *remainder, condition = selector
74
+ condition_key, condition_value = condition.first
41
75
  elements = browser_or_element
42
76
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
77
+ if remainder.empty?
78
+ elements = elements.elements(css: condition_key.to_s)
79
+ else
80
+ remainder.each do |item|
81
+ elements = elements.elements(css: item)
82
+ end
45
83
  end
46
84
 
47
85
  elements.filter do |element|
48
- condition_key = condition.keys.first.to_s
49
- condition_value = condition.values.first
50
- found_element = element.element(css: condition_key)
86
+ found_element = element.element(css: condition_key.to_s)
51
87
  extracted_value = extract_value_from_element(found_element)
52
88
  extracted_value.match(condition_value) || extracted_value == condition_value
53
89
  end
@@ -64,37 +100,119 @@ module ScrapKit
64
100
  element&.text_content
65
101
  end
66
102
 
67
- def extract_attribute(browser_or_element, selector_or_hash)
68
- if selector_or_hash.is_a?(String)
69
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
70
- elsif selector_or_hash.is_a?(Hash)
71
- selector = selector_or_hash[:selector]
72
- selector_for_children_attributes = selector_or_hash[:children_attributes]
103
+ def extract_attribute(browser_or_element, selector_or_object)
104
+ if selector_or_object.is_a?(String)
105
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
106
+ elsif selector_or_object.is_a?(Array)
107
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
73
108
 
74
- elements_from_selector(browser_or_element, selector).map do |element|
75
- output = {}
109
+ if found_elements.size === 1
110
+ extract_value_from_element(found_elements.first)
111
+ else
112
+ found_elements.map do |element|
113
+ extract_value_from_element(element)
114
+ end
115
+ end
116
+ elsif selector_or_object.is_a?(Hash)
117
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
118
+ selector = selector_or_object[:selector]
119
+ selector_for_children_attributes = selector_or_object[:children_attributes]
120
+
121
+ elements_from_selector(browser_or_element, selector).map do |element|
122
+ output = {}
123
+
124
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
125
+ output[child_attribute_name] = extract_attribute(element, child_selector)
126
+ end
76
127
 
77
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
78
- output[child_attribute_name] = extract_attribute(element, child_selector)
128
+ output
79
129
  end
130
+ elsif selector_or_object[:javascript]
131
+ @browser.execute_script(selector_or_object[:javascript])
132
+ else
133
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
134
+
135
+ if found_elements.size === 1
136
+ extract_value_from_element(found_elements.first)
137
+ else
138
+ found_elements.map do |element|
139
+ extract_value_from_element(element)
140
+ end
141
+ end
142
+ end
143
+ end
144
+ rescue
145
+ nil
146
+ end
80
147
 
81
- output
148
+ private
149
+
150
+ def goto(link_or_selector)
151
+ if link_or_selector.is_a?(String)
152
+ @browser.goto(link_or_selector)
153
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
154
+ if found_element = elements_from_selector(@browser, link_or_selector).first
155
+ found_element.click
82
156
  end
83
157
  end
158
+
159
+ sleep 0.5
160
+ @browser.wait_until do
161
+ @browser.ready_state == "complete"
162
+ end
163
+ rescue
164
+ nil
84
165
  end
85
166
 
86
- class << self
87
- def load(source)
88
- input = if source.is_a?(Hash)
89
- source
90
- elsif source.is_a?(IO)
91
- JSON.parse(source.read)
92
- else
93
- JSON.parse(File.read(source))
167
+ def click(selector)
168
+ if selector.is_a?(Array) || selector.is_a?(Hash)
169
+ if found_element = elements_from_selector(@browser, selector).first
170
+ found_element.click
94
171
  end
172
+ end
95
173
 
96
- new(input.deep_symbolize_keys)
174
+ sleep 1
175
+ @browser.wait_until do
176
+ @browser.ready_state == "complete"
97
177
  end
178
+
179
+ rescue
180
+ nil
181
+ end
182
+
183
+ def fill_form(form_data)
184
+ form_data.each do |name, value|
185
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
186
+ element = element.to_subtype
187
+
188
+ if element.respond_to?(:set)
189
+ element.set(value)
190
+ elsif element.respond_to?(:select)
191
+ element.select(value)
192
+ end
193
+ end
194
+ end
195
+
196
+ sleep 0.25
197
+ @browser.wait_until do
198
+ @browser.ready_state == "complete"
199
+ end
200
+ end
201
+
202
+ def create_browser
203
+ options = Selenium::WebDriver::Chrome::Options.new
204
+
205
+ options.add_argument "--headless"
206
+ options.add_argument "--window-size=1080x720"
207
+ options.add_argument "--hide-scrollbars"
208
+ options.add_argument "--user-agent=#{@user_agent}" if @user_agent
209
+
210
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
211
+ options.add_argument "--no-sandbox"
212
+ options.binary = chrome_bin
213
+ end
214
+
215
+ Watir::Browser.new(:chrome, options: options)
98
216
  end
99
217
  end
100
218
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-28 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler