scrap_kit 0.1.6 → 0.1.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffb35087e7e374b2e5fa15d2f9c2d41374d0242e4f2b5a986db195929a530dc6
4
- data.tar.gz: 904246fc2061a3ec1686a70d6755fd81f13b215b3dce7816177f0774af666b89
3
+ metadata.gz: 36f370f43360cdc2725cbb83591738dc46c5949cb995039f45cf1ddaccd091a4
4
+ data.tar.gz: b904bef830b3e5cd88892677eed3f0cf97635e2d46277cc69280ca5ee7b6a0a3
5
5
  SHA512:
6
- metadata.gz: c55754d5f772f7bfd9c9bd4fdda802f77a898f6c86f0956d9f94d9c81e7151fdaed07ec45f56d0b7010eb56dc7a1a94e58079bcb1da4753586766eefd4bd4954
7
- data.tar.gz: b20fc3e3b8852361e707d4597ec510d44cb3d76a9260f6d719e2547adf5a68abafeb7423f02c47ba853a4f92ee43df3ccfc88bb60dd21913249ef8b62613a53c
6
+ metadata.gz: e0c1321ef88bf5be53603e41e0826676d9bc50bb741b4d45d8cc79ff42bc6e0360d989c3deb097d8c784b9c795495898a39cf7ff4a41f36c62f57d0f030a8915
7
+ data.tar.gz: b668a193cf260b94239d1d1e016e9eb4b2d8e3de43aaf604d6d9fb255b5147295e2079e5da7b3b3dbb6177ea45ac413ce73467f11ba73d0f2495fa91b3196af4
@@ -1,6 +1,34 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.11] 2020-09-03
4
+
5
+ ### Changed/Added
6
+ - Add `user_agent` accessor for browser
7
+
8
+ ## [0.1.10] 2020-09-03
9
+
10
+ ### Changed/Added
11
+ - Map attributes to JavaScript calls
12
+
13
+ ## [0.1.9] 2020-08-31
14
+
15
+ ### Changed/Added
16
+ - Set arguments for Chrome driver
17
+
18
+ ## [0.1.8] 2020-08-29
19
+
20
+ ### Changed/Added
21
+ - Add new ways to declare selectors
22
+ - Add steps
23
+
24
+ ## [0.1.7] 2020-08-28
25
+
26
+ ### Changed/Added
27
+ - Match selector condition by regexp or exact value
28
+
3
29
  ## [0.1.6] 2020-08-28
30
+
31
+ ### Changed/Added
4
32
  - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
5
33
  - Fix bug when matching selector condition
6
34
 
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.6)
4
+ scrap_kit (0.1.11)
5
5
  activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
@@ -21,7 +21,7 @@ GEM
21
21
  i18n (1.8.5)
22
22
  concurrent-ruby (~> 1.0)
23
23
  mini_portile2 (2.4.0)
24
- minitest (5.14.1)
24
+ minitest (5.14.2)
25
25
  nokogiri (1.10.10)
26
26
  mini_portile2 (~> 2.4.0)
27
27
  rake (13.0.1)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use whichever of them suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,22 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ attr_accessor :user_agent
8
+
9
+ class << self
10
+ def load(source)
11
+ input = if source.is_a?(Hash)
12
+ source
13
+ elsif source.is_a?(IO)
14
+ JSON.parse(source.read)
15
+ else
16
+ JSON.parse(File.read(source))
17
+ end
18
+
19
+ new(input.deep_symbolize_keys)
20
+ end
21
+ end
22
+
7
23
  def initialize(url: nil, steps: [], attributes: {})
8
24
  @url = url
9
25
  @steps = steps
@@ -13,42 +29,63 @@ module ScrapKit
13
29
  def run
14
30
  output = {}
15
31
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
32
+ @browser = create_browser
33
+ @browser.goto @url
18
34
 
19
35
  @steps.each do |step|
20
- run_step(browser, step)
36
+ run_step(step)
21
37
  end
22
38
 
23
39
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
40
+ output[attribute_name] = extract_attribute(@browser, selector)
25
41
  end
26
42
 
27
- browser.close
28
- browser = nil
43
+ @browser.close
44
+ @browser = nil
29
45
 
30
46
  output
31
47
  end
32
48
 
33
- def run_step(browser, step)
49
+ def run_step(step)
50
+ return goto(step[:goto]) if step[:goto]
51
+ return click(step[:click]) if step[:click]
52
+ return fill_form(step[:fill_form]) if step[:fill_form]
53
+
54
+ nil
55
+ end
56
+
57
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
58
+ element = browser_or_element.element(name: name_or_selector.to_s)
59
+ return element if element.exists?
60
+
61
+ element = browser_or_element.element(css: name_or_selector.to_s)
62
+ return element if element.exists?
63
+
64
+ nil
34
65
  end
35
66
 
36
67
  def elements_from_selector(browser_or_element, selector)
37
68
  if selector.is_a?(String)
38
69
  browser_or_element.elements(css: selector)
70
+ elsif selector.is_a?(Hash)
71
+ browser_or_element.elements(selector)
39
72
  elsif selector.is_a?(Array)
40
73
  *remainder, condition = selector
74
+ condition_key, condition_value = condition.first
41
75
  elements = browser_or_element
42
76
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
77
+ if remainder.empty?
78
+ elements = elements.elements(css: condition_key.to_s)
79
+ else
80
+ remainder.each do |item|
81
+ elements = elements.elements(css: item)
82
+ end
45
83
  end
46
84
 
47
85
  elements.filter do |element|
48
- condition_key = condition.keys.first.to_s
49
- condition_value = condition.values.first
50
- found_element = element.element(css: condition_key)
51
- extract_value_from_element(found_element) == condition_value
86
+ found_element = element.element(css: condition_key.to_s)
87
+ extracted_value = extract_value_from_element(found_element)
88
+ extracted_value.match(condition_value) || extracted_value == condition_value
52
89
  end
53
90
  end
54
91
  end
@@ -63,37 +100,117 @@ module ScrapKit
63
100
  element&.text_content
64
101
  end
65
102
 
66
- def extract_attribute(browser_or_element, selector_or_hash)
67
- if selector_or_hash.is_a?(String)
68
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
69
- elsif selector_or_hash.is_a?(Hash)
70
- selector = selector_or_hash[:selector]
71
- selector_for_children_attributes = selector_or_hash[:children_attributes]
103
+ def extract_attribute(browser_or_element, selector_or_object)
104
+ if selector_or_object.is_a?(String)
105
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
106
+ elsif selector_or_object.is_a?(Array)
107
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
72
108
 
73
- elements_from_selector(browser_or_element, selector).map do |element|
74
- output = {}
109
+ if found_elements.size === 1
110
+ extract_value_from_element(found_elements.first)
111
+ else
112
+ found_elements.map do |element|
113
+ extract_value_from_element(element)
114
+ end
115
+ end
116
+ elsif selector_or_object.is_a?(Hash)
117
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
118
+ selector = selector_or_object[:selector]
119
+ selector_for_children_attributes = selector_or_object[:children_attributes]
120
+
121
+ elements_from_selector(browser_or_element, selector).map do |element|
122
+ output = {}
123
+
124
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
125
+ output[child_attribute_name] = extract_attribute(element, child_selector)
126
+ end
75
127
 
76
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
77
- output[child_attribute_name] = extract_attribute(element, child_selector)
128
+ output
78
129
  end
130
+ elsif selector_or_object[:javascript]
131
+ @browser.execute_script(selector_or_object[:javascript])
132
+ else
133
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
134
+
135
+ if found_elements.size === 1
136
+ extract_value_from_element(found_elements.first)
137
+ else
138
+ found_elements.map do |element|
139
+ extract_value_from_element(element)
140
+ end
141
+ end
142
+ end
143
+ end
144
+ end
79
145
 
80
- output
146
+ private
147
+
148
+ def goto(link_or_selector)
149
+ if link_or_selector.is_a?(String)
150
+ @browser.goto(link_or_selector)
151
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
152
+ if found_element = elements_from_selector(@browser, link_or_selector).first
153
+ found_element.click
81
154
  end
82
155
  end
156
+
157
+ sleep 0.5
158
+ @browser.wait_until do
159
+ @browser.ready_state == "complete"
160
+ end
161
+ rescue
162
+ nil
83
163
  end
84
164
 
85
- class << self
86
- def load(source)
87
- input = if source.is_a?(Hash)
88
- source
89
- elsif source.is_a?(IO)
90
- JSON.parse(source.read)
91
- else
92
- JSON.parse(File.read(source))
165
+ def click(selector)
166
+ if selector.is_a?(Array) || selector.is_a?(Hash)
167
+ if found_element = elements_from_selector(@browser, selector).first
168
+ found_element.click
93
169
  end
170
+ end
94
171
 
95
- new(input.deep_symbolize_keys)
172
+ sleep 1
173
+ @browser.wait_until do
174
+ @browser.ready_state == "complete"
96
175
  end
176
+
177
+ rescue
178
+ nil
179
+ end
180
+
181
+ def fill_form(form_data)
182
+ form_data.each do |name, value|
183
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
184
+ element = element.to_subtype
185
+
186
+ if element.respond_to?(:set)
187
+ element.set(value)
188
+ elsif element.respond_to?(:select)
189
+ element.select(value)
190
+ end
191
+ end
192
+ end
193
+
194
+ sleep 0.25
195
+ @browser.wait_until do
196
+ @browser.ready_state == "complete"
197
+ end
198
+ end
199
+
200
+ def create_browser
201
+ options = Selenium::WebDriver::Chrome::Options.new
202
+
203
+ options.add_argument "--headless"
204
+ options.add_argument "--window-size=1080x720"
205
+ options.add_argument "--hide-scrollbars"
206
+ options.add_argument "--user-agent=#{@user_agent}" if @user_agent
207
+
208
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
209
+ options.add_argument "--no-sandbox"
210
+ options.binary = chrome_bin
211
+ end
212
+
213
+ Watir::Browser.new(:chrome, options: options)
97
214
  end
98
215
  end
99
216
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.11"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-28 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler