scrap_kit 0.1.6 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffb35087e7e374b2e5fa15d2f9c2d41374d0242e4f2b5a986db195929a530dc6
4
- data.tar.gz: 904246fc2061a3ec1686a70d6755fd81f13b215b3dce7816177f0774af666b89
3
+ metadata.gz: 36f370f43360cdc2725cbb83591738dc46c5949cb995039f45cf1ddaccd091a4
4
+ data.tar.gz: b904bef830b3e5cd88892677eed3f0cf97635e2d46277cc69280ca5ee7b6a0a3
5
5
  SHA512:
6
- metadata.gz: c55754d5f772f7bfd9c9bd4fdda802f77a898f6c86f0956d9f94d9c81e7151fdaed07ec45f56d0b7010eb56dc7a1a94e58079bcb1da4753586766eefd4bd4954
7
- data.tar.gz: b20fc3e3b8852361e707d4597ec510d44cb3d76a9260f6d719e2547adf5a68abafeb7423f02c47ba853a4f92ee43df3ccfc88bb60dd21913249ef8b62613a53c
6
+ metadata.gz: e0c1321ef88bf5be53603e41e0826676d9bc50bb741b4d45d8cc79ff42bc6e0360d989c3deb097d8c784b9c795495898a39cf7ff4a41f36c62f57d0f030a8915
7
+ data.tar.gz: b668a193cf260b94239d1d1e016e9eb4b2d8e3de43aaf604d6d9fb255b5147295e2079e5da7b3b3dbb6177ea45ac413ce73467f11ba73d0f2495fa91b3196af4
@@ -1,6 +1,34 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.11] 2020-09-03
4
+
5
+ ### Changed/Added
6
+ - Add `user_agent` accessor for browser
7
+
8
+ ## [0.1.10] 2020-09-03
9
+
10
+ ### Changed/Added
11
+ - Map attributes to JavaScript calls
12
+
13
+ ## [0.1.9] 2020-08-31
14
+
15
+ ### Changed/Added
16
+ - Set arguments for Chrome driver
17
+
18
+ ## [0.1.8] 2020-08-29
19
+
20
+ ### Changed/Added
21
+ - Add new ways to declare selectors
22
+ - Add steps
23
+
24
+ ## [0.1.7] 2020-08-28
25
+
26
+ ### Changed/Added
27
+ - Match selector condition by regexp or exact value
28
+
3
29
  ## [0.1.6] 2020-08-28
30
+
31
+ ### Changed/Added
4
32
  - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
5
33
  - Fix bug when matching selector condition
6
34
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.6)
4
+ scrap_kit (0.1.11)
5
5
  activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
@@ -21,7 +21,7 @@ GEM
21
21
  i18n (1.8.5)
22
22
  concurrent-ruby (~> 1.0)
23
23
  mini_portile2 (2.4.0)
24
- minitest (5.14.1)
24
+ minitest (5.14.2)
25
25
  nokogiri (1.10.10)
26
26
  mini_portile2 (~> 2.4.0)
27
27
  rake (13.0.1)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors. Its last item must be a hash that maps a selector to the value that selector is expected to have.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,22 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ attr_accessor :user_agent
8
+
9
+ class << self
10
+ def load(source)
11
+ input = if source.is_a?(Hash)
12
+ source
13
+ elsif source.is_a?(IO)
14
+ JSON.parse(source.read)
15
+ else
16
+ JSON.parse(File.read(source))
17
+ end
18
+
19
+ new(input.deep_symbolize_keys)
20
+ end
21
+ end
22
+
7
23
  def initialize(url: nil, steps: [], attributes: {})
8
24
  @url = url
9
25
  @steps = steps
@@ -13,42 +29,63 @@ module ScrapKit
13
29
  def run
14
30
  output = {}
15
31
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
32
+ @browser = create_browser
33
+ @browser.goto @url
18
34
 
19
35
  @steps.each do |step|
20
- run_step(browser, step)
36
+ run_step(step)
21
37
  end
22
38
 
23
39
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
40
+ output[attribute_name] = extract_attribute(@browser, selector)
25
41
  end
26
42
 
27
- browser.close
28
- browser = nil
43
+ @browser.close
44
+ @browser = nil
29
45
 
30
46
  output
31
47
  end
32
48
 
33
- def run_step(browser, step)
49
+ def run_step(step)
50
+ return goto(step[:goto]) if step[:goto]
51
+ return click(step[:click]) if step[:click]
52
+ return fill_form(step[:fill_form]) if step[:fill_form]
53
+
54
+ nil
55
+ end
56
+
57
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
58
+ element = browser_or_element.element(name: name_or_selector.to_s)
59
+ return element if element.exists?
60
+
61
+ element = browser_or_element.element(css: name_or_selector.to_s)
62
+ return element if element.exists?
63
+
64
+ nil
34
65
  end
35
66
 
36
67
  def elements_from_selector(browser_or_element, selector)
37
68
  if selector.is_a?(String)
38
69
  browser_or_element.elements(css: selector)
70
+ elsif selector.is_a?(Hash)
71
+ browser_or_element.elements(selector)
39
72
  elsif selector.is_a?(Array)
40
73
  *remainder, condition = selector
74
+ condition_key, condition_value = condition.first
41
75
  elements = browser_or_element
42
76
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
77
+ if remainder.empty?
78
+ elements = elements.elements(css: condition_key.to_s)
79
+ else
80
+ remainder.each do |item|
81
+ elements = elements.elements(css: item)
82
+ end
45
83
  end
46
84
 
47
85
  elements.filter do |element|
48
- condition_key = condition.keys.first.to_s
49
- condition_value = condition.values.first
50
- found_element = element.element(css: condition_key)
51
- extract_value_from_element(found_element) == condition_value
86
+ found_element = element.element(css: condition_key.to_s)
87
+ extracted_value = extract_value_from_element(found_element)
88
+ extracted_value.match(condition_value) || extracted_value == condition_value
52
89
  end
53
90
  end
54
91
  end
@@ -63,37 +100,117 @@ module ScrapKit
63
100
  element&.text_content
64
101
  end
65
102
 
66
- def extract_attribute(browser_or_element, selector_or_hash)
67
- if selector_or_hash.is_a?(String)
68
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
69
- elsif selector_or_hash.is_a?(Hash)
70
- selector = selector_or_hash[:selector]
71
- selector_for_children_attributes = selector_or_hash[:children_attributes]
103
+ def extract_attribute(browser_or_element, selector_or_object)
104
+ if selector_or_object.is_a?(String)
105
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
106
+ elsif selector_or_object.is_a?(Array)
107
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
72
108
 
73
- elements_from_selector(browser_or_element, selector).map do |element|
74
- output = {}
109
+ if found_elements.size === 1
110
+ extract_value_from_element(found_elements.first)
111
+ else
112
+ found_elements.map do |element|
113
+ extract_value_from_element(element)
114
+ end
115
+ end
116
+ elsif selector_or_object.is_a?(Hash)
117
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
118
+ selector = selector_or_object[:selector]
119
+ selector_for_children_attributes = selector_or_object[:children_attributes]
120
+
121
+ elements_from_selector(browser_or_element, selector).map do |element|
122
+ output = {}
123
+
124
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
125
+ output[child_attribute_name] = extract_attribute(element, child_selector)
126
+ end
75
127
 
76
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
77
- output[child_attribute_name] = extract_attribute(element, child_selector)
128
+ output
78
129
  end
130
+ elsif selector_or_object[:javascript]
131
+ @browser.execute_script(selector_or_object[:javascript])
132
+ else
133
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
134
+
135
+ if found_elements.size === 1
136
+ extract_value_from_element(found_elements.first)
137
+ else
138
+ found_elements.map do |element|
139
+ extract_value_from_element(element)
140
+ end
141
+ end
142
+ end
143
+ end
144
+ end
79
145
 
80
- output
146
+ private
147
+
148
+ def goto(link_or_selector)
149
+ if link_or_selector.is_a?(String)
150
+ @browser.goto(link_or_selector)
151
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
152
+ if found_element = elements_from_selector(@browser, link_or_selector).first
153
+ found_element.click
81
154
  end
82
155
  end
156
+
157
+ sleep 0.5
158
+ @browser.wait_until do
159
+ @browser.ready_state == "complete"
160
+ end
161
+ rescue
162
+ nil
83
163
  end
84
164
 
85
- class << self
86
- def load(source)
87
- input = if source.is_a?(Hash)
88
- source
89
- elsif source.is_a?(IO)
90
- JSON.parse(source.read)
91
- else
92
- JSON.parse(File.read(source))
165
+ def click(selector)
166
+ if selector.is_a?(Array) || selector.is_a?(Hash)
167
+ if found_element = elements_from_selector(@browser, selector).first
168
+ found_element.click
93
169
  end
170
+ end
94
171
 
95
- new(input.deep_symbolize_keys)
172
+ sleep 1
173
+ @browser.wait_until do
174
+ @browser.ready_state == "complete"
96
175
  end
176
+
177
+ rescue
178
+ nil
179
+ end
180
+
181
+ def fill_form(form_data)
182
+ form_data.each do |name, value|
183
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
184
+ element = element.to_subtype
185
+
186
+ if element.respond_to?(:set)
187
+ element.set(value)
188
+ elsif element.respond_to?(:select)
189
+ element.select(value)
190
+ end
191
+ end
192
+ end
193
+
194
+ sleep 0.25
195
+ @browser.wait_until do
196
+ @browser.ready_state == "complete"
197
+ end
198
+ end
199
+
200
+ def create_browser
201
+ options = Selenium::WebDriver::Chrome::Options.new
202
+
203
+ options.add_argument "--headless"
204
+ options.add_argument "--window-size=1080x720"
205
+ options.add_argument "--hide-scrollbars"
206
+ options.add_argument "--user-agent=#{@user_agent}" if @user_agent
207
+
208
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
209
+ options.add_argument "--no-sandbox"
210
+ options.binary = chrome_bin
211
+ end
212
+
213
+ Watir::Browser.new(:chrome, options: options)
97
214
  end
98
215
  end
99
216
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.11"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-28 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler