scrap_kit 0.1.5 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0800dc31098f3961b39783d64ee7ddfbecc67a3eba2efaa1fdda7c384bf0b88b'
4
- data.tar.gz: bd6e32addeef4d04545d092aeef0b45e9e60dfedf1dc9643ae70e8e50512a589
3
+ metadata.gz: 28049015b73f5b5508d952f54fd9d5a326e3377043be78776ad925e963161ffa
4
+ data.tar.gz: ebf4112f8a71bc4fe6ba98d6bc02a417f3cd1269a493f59657164945747363d2
5
5
  SHA512:
6
- metadata.gz: c8bee9dc8ed755c2edd4b5c6968ce0e9aae4f4a662963838e9493e9ed125b59ca45c95e18af9e9724730d37ada2c39b8b9ebeef95dc1bb69cb3161813770343a
7
- data.tar.gz: a6a86b4ea05fe89541db1dcd82dd768e5b31c7185a7659901324dff13c517e82d4faf3c414165b84b58d19f0e3316ca87493dfe66a6325cc8e43aabaa16ce38e
6
+ metadata.gz: 50bbf6756482a1d3a94ea5994efa9921049405bfd3d5486ab854d0478adaa0d4c7e891e645ee9c9d774398cacacb26a6814b34efdf858be6ab14a7765da2c9b2
7
+ data.tar.gz: 8457382de09844d4fc469c4045e1e916335a1b3152bfd3b5bd0b0c464b590bc306d1fa7f8d66e0c3c40abe490d0b35aed2659223a6fcb0d81ec9c2efa94498d9
@@ -0,0 +1,20 @@
1
+ name: Run tests
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v1
12
+ - name: Setup Ruby
13
+ uses: actions/setup-ruby@v1
14
+ with:
15
+ ruby-version: '2.7.1'
16
+ - name: Build and run tests
17
+ run: |
18
+ gem install bundler
19
+ bundle update --conservative --jobs 4 --retry 3
20
+ bundle exec rake spec
@@ -1,5 +1,32 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.10] 2020-09-03
4
+
5
+ ### Changed/Added
6
+ - Map attributes to JavaScript calls
7
+
8
+ ## [0.1.9] 2020-08-31
9
+
10
+ ### Changed/Added
11
+ - Set arguments for Chrome driver
12
+
13
+ ## [0.1.8] 2020-08-29
14
+
15
+ ### Changed/Added
16
+ - Add new ways to declare selectors
17
+ - Add steps
18
+
19
+ ## [0.1.7] 2020-08-28
20
+
21
+ ### Changed/Added
22
+ - Match selector condition by regexp or exact value
23
+
24
+ ## [0.1.6] 2020-08-28
25
+
26
+ ### Changed/Added
27
+ - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
28
+ - Fix bug when matching selector condition
29
+
3
30
  ## [0.1.5] 2020-08-08
4
31
 
5
32
  ### Changed/Added
@@ -1,27 +1,27 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.5)
5
- activesupport (= 6.0.3.1)
4
+ scrap_kit (0.1.10)
5
+ activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- activesupport (6.0.3.1)
12
+ activesupport (6.0.3.2)
13
13
  concurrent-ruby (~> 1.0, >= 1.0.2)
14
14
  i18n (>= 0.7, < 2)
15
15
  minitest (~> 5.1)
16
16
  tzinfo (~> 1.1)
17
17
  zeitwerk (~> 2.2, >= 2.2.2)
18
18
  childprocess (3.0.0)
19
- concurrent-ruby (1.1.6)
19
+ concurrent-ruby (1.1.7)
20
20
  diff-lcs (1.3)
21
21
  i18n (1.8.5)
22
22
  concurrent-ruby (~> 1.0)
23
23
  mini_portile2 (2.4.0)
24
- minitest (5.14.1)
24
+ minitest (5.14.2)
25
25
  nokogiri (1.10.10)
26
26
  mini_portile2 (~> 2.4.0)
27
27
  rake (13.0.1)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use whichever of them suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the preliminary actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,20 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
7
21
  def initialize(url: nil, steps: [], attributes: {})
8
22
  @url = url
9
23
  @steps = steps
@@ -13,42 +27,63 @@ module ScrapKit
13
27
  def run
14
28
  output = {}
15
29
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
30
+ @browser = create_browser
31
+ @browser.goto @url
18
32
 
19
33
  @steps.each do |step|
20
- run_step(browser, step)
34
+ run_step(step)
21
35
  end
22
36
 
23
37
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
25
39
  end
26
40
 
27
- browser.close
28
- browser = nil
41
+ @browser.close
42
+ @browser = nil
29
43
 
30
44
  output
31
45
  end
32
46
 
33
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
34
63
  end
35
64
 
36
65
  def elements_from_selector(browser_or_element, selector)
37
66
  if selector.is_a?(String)
38
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
39
70
  elsif selector.is_a?(Array)
40
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
41
73
  elements = browser_or_element
42
74
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
45
81
  end
46
82
 
47
83
  elements.filter do |element|
48
- condition_key = condition.keys[0].to_s
49
- condition_value = condition.values[0]
50
- found_element = element.element(css: condition_key)
51
- extract_value_from_element(found_element)&.match(condition_value)
84
+ found_element = element.element(css: condition_key.to_s)
85
+ extracted_value = extract_value_from_element(found_element)
86
+ extracted_value.match(condition_value) || extracted_value == condition_value
52
87
  end
53
88
  end
54
89
  end
@@ -63,37 +98,116 @@ module ScrapKit
63
98
  element&.text_content
64
99
  end
65
100
 
66
- def extract_attribute(browser_or_element, selector_or_hash)
67
- if selector_or_hash.is_a?(String)
68
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
69
- elsif selector_or_hash.is_a?(Hash)
70
- selector = selector_or_hash[:selector]
71
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
72
106
 
73
- elements_from_selector(browser_or_element, selector).map do |element|
74
- output = {}
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
118
+
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
75
121
 
76
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
77
- output[child_attribute_name] = extract_attribute(element, child_selector)
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
125
+
126
+ output
78
127
  end
128
+ elsif selector_or_object[:javascript]
129
+ @browser.execute_script(selector_or_object[:javascript])
130
+ else
131
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
132
+
133
+ if found_elements.size === 1
134
+ extract_value_from_element(found_elements.first)
135
+ else
136
+ found_elements.map do |element|
137
+ extract_value_from_element(element)
138
+ end
139
+ end
140
+ end
141
+ end
142
+ end
143
+
144
+ private
79
145
 
80
- output
146
+ def goto(link_or_selector)
147
+ if link_or_selector.is_a?(String)
148
+ @browser.goto(link_or_selector)
149
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
150
+ if found_element = elements_from_selector(@browser, link_or_selector).first
151
+ found_element.click
81
152
  end
82
153
  end
154
+
155
+ sleep 0.5
156
+ @browser.wait_until do
157
+ @browser.ready_state == "complete"
158
+ end
159
+ rescue
160
+ nil
83
161
  end
84
162
 
85
- class << self
86
- def load(source)
87
- input = if source.is_a?(Hash)
88
- source
89
- elsif source.is_a?(IO)
90
- JSON.parse(source.read)
91
- else
92
- JSON.parse(File.read(source))
163
+ def click(selector)
164
+ if selector.is_a?(Array) || selector.is_a?(Hash)
165
+ if found_element = elements_from_selector(@browser, selector).first
166
+ found_element.click
93
167
  end
168
+ end
94
169
 
95
- new(input.deep_symbolize_keys)
170
+ sleep 1
171
+ @browser.wait_until do
172
+ @browser.ready_state == "complete"
96
173
  end
174
+
175
+ rescue
176
+ nil
177
+ end
178
+
179
+ def fill_form(form_data)
180
+ form_data.each do |name, value|
181
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
182
+ element = element.to_subtype
183
+
184
+ if element.respond_to?(:set)
185
+ element.set(value)
186
+ elsif element.respond_to?(:select)
187
+ element.select(value)
188
+ end
189
+ end
190
+ end
191
+
192
+ sleep 0.25
193
+ @browser.wait_until do
194
+ @browser.ready_state == "complete"
195
+ end
196
+ end
197
+
198
+ def create_browser
199
+ options = Selenium::WebDriver::Chrome::Options.new
200
+
201
+ options.add_argument "--headless"
202
+ options.add_argument "--window-size=1080x720"
203
+ options.add_argument "--hide-scrollbars"
204
+
205
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
206
+ options.add_argument "--no-sandbox"
207
+ options.binary = chrome_bin
208
+ end
209
+
210
+ Watir::Browser.new(:chrome, options: options)
97
211
  end
98
212
  end
99
213
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.10"
3
3
  end
@@ -33,5 +33,5 @@ Gem::Specification.new do |spec|
33
33
  spec.add_development_dependency "rspec", "~> 3.0"
34
34
  spec.add_dependency "watir", "~> 6.16.5"
35
35
  spec.add_dependency "webdrivers", "~> 4.0"
36
- spec.add_dependency "activesupport", "6.0.3.1"
36
+ spec.add_dependency "activesupport", "~> 6.0"
37
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-08 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -84,16 +84,16 @@ dependencies:
84
84
  name: activesupport
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '='
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 6.0.3.1
89
+ version: '6.0'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '='
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: 6.0.3.1
96
+ version: '6.0'
97
97
  description: Run JSON-based recipes to scrap web sites.
98
98
  email:
99
99
  - hpneo@hotmail.com
@@ -101,7 +101,7 @@ executables: []
101
101
  extensions: []
102
102
  extra_rdoc_files: []
103
103
  files:
104
- - ".github/workflows/publish_gem.yml"
104
+ - ".github/workflows/run_tests.yml"
105
105
  - ".gitignore"
106
106
  - ".rspec"
107
107
  - ".rubocop.yml"
@@ -1,17 +0,0 @@
1
- name: Publish gem
2
-
3
- on:
4
- push:
5
- tags:
6
- - '*'
7
-
8
- jobs:
9
- build:
10
-
11
- runs-on: ubuntu-latest
12
-
13
- steps:
14
- - name: Publish gem
15
- uses: dawidd6/action-publish-gem@v1.0.0
16
- with:
17
- api_key: ${{secrets.RUBYGEMS_API_KEY}}