scrap_kit 0.1.5 → 0.1.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0800dc31098f3961b39783d64ee7ddfbecc67a3eba2efaa1fdda7c384bf0b88b'
4
- data.tar.gz: bd6e32addeef4d04545d092aeef0b45e9e60dfedf1dc9643ae70e8e50512a589
3
+ metadata.gz: 28049015b73f5b5508d952f54fd9d5a326e3377043be78776ad925e963161ffa
4
+ data.tar.gz: ebf4112f8a71bc4fe6ba98d6bc02a417f3cd1269a493f59657164945747363d2
5
5
  SHA512:
6
- metadata.gz: c8bee9dc8ed755c2edd4b5c6968ce0e9aae4f4a662963838e9493e9ed125b59ca45c95e18af9e9724730d37ada2c39b8b9ebeef95dc1bb69cb3161813770343a
7
- data.tar.gz: a6a86b4ea05fe89541db1dcd82dd768e5b31c7185a7659901324dff13c517e82d4faf3c414165b84b58d19f0e3316ca87493dfe66a6325cc8e43aabaa16ce38e
6
+ metadata.gz: 50bbf6756482a1d3a94ea5994efa9921049405bfd3d5486ab854d0478adaa0d4c7e891e645ee9c9d774398cacacb26a6814b34efdf858be6ab14a7765da2c9b2
7
+ data.tar.gz: 8457382de09844d4fc469c4045e1e916335a1b3152bfd3b5bd0b0c464b590bc306d1fa7f8d66e0c3c40abe490d0b35aed2659223a6fcb0d81ec9c2efa94498d9
@@ -0,0 +1,20 @@
1
+ name: Run tests
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v1
12
+ - name: Setup Ruby
13
+ uses: actions/setup-ruby@v1
14
+ with:
15
+ ruby-version: '2.7.1'
16
+ - name: Build and run tests
17
+ run: |
18
+ gem install bundler
19
+ bundle update --conservative --jobs 4 --retry 3
20
+ bundle exec rake spec
@@ -1,5 +1,32 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.10] 2020-09-03
4
+
5
+ ### Changed/Added
6
+ - Map attributes to JavaScript calls
7
+
8
+ ## [0.1.9] 2020-08-31
9
+
10
+ ### Changed/Added
11
+ - Set arguments for Chrome driver
12
+
13
+ ## [0.1.8] 2020-08-29
14
+
15
+ ### Changed/Added
16
+ - Add new ways to declare selectors
17
+ - Add steps
18
+
19
+ ## [0.1.7] 2020-08-28
20
+
21
+ ### Changed/Added
22
+ - Match selector condition by regexp or exact value
23
+
24
+ ## [0.1.6] 2020-08-28
25
+
26
+ ### Changed/Added
27
+ - Update activesupport requirement from = 6.0.2.1 to ~> 6.0
28
+ - Fix bug when matching selector condition
29
+
3
30
  ## [0.1.5] 2020-08-08
4
31
 
5
32
  ### Changed/Added
@@ -1,27 +1,27 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.5)
5
- activesupport (= 6.0.3.1)
4
+ scrap_kit (0.1.10)
5
+ activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
7
  webdrivers (~> 4.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- activesupport (6.0.3.1)
12
+ activesupport (6.0.3.2)
13
13
  concurrent-ruby (~> 1.0, >= 1.0.2)
14
14
  i18n (>= 0.7, < 2)
15
15
  minitest (~> 5.1)
16
16
  tzinfo (~> 1.1)
17
17
  zeitwerk (~> 2.2, >= 2.2.2)
18
18
  childprocess (3.0.0)
19
- concurrent-ruby (1.1.6)
19
+ concurrent-ruby (1.1.7)
20
20
  diff-lcs (1.3)
21
21
  i18n (1.8.5)
22
22
  concurrent-ruby (~> 1.0)
23
23
  mini_portile2 (2.4.0)
24
- minitest (5.14.1)
24
+ minitest (5.14.2)
25
25
  nokogiri (1.10.10)
26
26
  mini_portile2 (~> 2.4.0)
27
27
  rake (13.0.1)
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -4,6 +4,20 @@ require "watir"
4
4
 
5
5
  module ScrapKit
6
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
7
21
  def initialize(url: nil, steps: [], attributes: {})
8
22
  @url = url
9
23
  @steps = steps
@@ -13,42 +27,63 @@ module ScrapKit
13
27
  def run
14
28
  output = {}
15
29
 
16
- browser = Watir::Browser.new(:chrome, headless: true)
17
- browser.goto @url
30
+ @browser = create_browser
31
+ @browser.goto @url
18
32
 
19
33
  @steps.each do |step|
20
- run_step(browser, step)
34
+ run_step(step)
21
35
  end
22
36
 
23
37
  @attributes.each do |attribute_name, selector|
24
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
25
39
  end
26
40
 
27
- browser.close
28
- browser = nil
41
+ @browser.close
42
+ @browser = nil
29
43
 
30
44
  output
31
45
  end
32
46
 
33
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
34
63
  end
35
64
 
36
65
  def elements_from_selector(browser_or_element, selector)
37
66
  if selector.is_a?(String)
38
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
39
70
  elsif selector.is_a?(Array)
40
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
41
73
  elements = browser_or_element
42
74
 
43
- remainder.each do |item|
44
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
45
81
  end
46
82
 
47
83
  elements.filter do |element|
48
- condition_key = condition.keys[0].to_s
49
- condition_value = condition.values[0]
50
- found_element = element.element(css: condition_key)
51
- extract_value_from_element(found_element)&.match(condition_value)
84
+ found_element = element.element(css: condition_key.to_s)
85
+ extracted_value = extract_value_from_element(found_element)
86
+ extracted_value.match(condition_value) || extracted_value == condition_value
52
87
  end
53
88
  end
54
89
  end
@@ -63,37 +98,116 @@ module ScrapKit
63
98
  element&.text_content
64
99
  end
65
100
 
66
- def extract_attribute(browser_or_element, selector_or_hash)
67
- if selector_or_hash.is_a?(String)
68
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
69
- elsif selector_or_hash.is_a?(Hash)
70
- selector = selector_or_hash[:selector]
71
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
72
106
 
73
- elements_from_selector(browser_or_element, selector).map do |element|
74
- output = {}
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
118
+
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
75
121
 
76
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
77
- output[child_attribute_name] = extract_attribute(element, child_selector)
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
125
+
126
+ output
78
127
  end
128
+ elsif selector_or_object[:javascript]
129
+ @browser.execute_script(selector_or_object[:javascript])
130
+ else
131
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
132
+
133
+ if found_elements.size === 1
134
+ extract_value_from_element(found_elements.first)
135
+ else
136
+ found_elements.map do |element|
137
+ extract_value_from_element(element)
138
+ end
139
+ end
140
+ end
141
+ end
142
+ end
143
+
144
+ private
79
145
 
80
- output
146
+ def goto(link_or_selector)
147
+ if link_or_selector.is_a?(String)
148
+ @browser.goto(link_or_selector)
149
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
150
+ if found_element = elements_from_selector(@browser, link_or_selector).first
151
+ found_element.click
81
152
  end
82
153
  end
154
+
155
+ sleep 0.5
156
+ @browser.wait_until do
157
+ @browser.ready_state == "complete"
158
+ end
159
+ rescue
160
+ nil
83
161
  end
84
162
 
85
- class << self
86
- def load(source)
87
- input = if source.is_a?(Hash)
88
- source
89
- elsif source.is_a?(IO)
90
- JSON.parse(source.read)
91
- else
92
- JSON.parse(File.read(source))
163
+ def click(selector)
164
+ if selector.is_a?(Array) || selector.is_a?(Hash)
165
+ if found_element = elements_from_selector(@browser, selector).first
166
+ found_element.click
93
167
  end
168
+ end
94
169
 
95
- new(input.deep_symbolize_keys)
170
+ sleep 1
171
+ @browser.wait_until do
172
+ @browser.ready_state == "complete"
96
173
  end
174
+
175
+ rescue
176
+ nil
177
+ end
178
+
179
+ def fill_form(form_data)
180
+ form_data.each do |name, value|
181
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
182
+ element = element.to_subtype
183
+
184
+ if element.respond_to?(:set)
185
+ element.set(value)
186
+ elsif element.respond_to?(:select)
187
+ element.select(value)
188
+ end
189
+ end
190
+ end
191
+
192
+ sleep 0.25
193
+ @browser.wait_until do
194
+ @browser.ready_state == "complete"
195
+ end
196
+ end
197
+
198
+ def create_browser
199
+ options = Selenium::WebDriver::Chrome::Options.new
200
+
201
+ options.add_argument "--headless"
202
+ options.add_argument "--window-size=1080x720"
203
+ options.add_argument "--hide-scrollbars"
204
+
205
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
206
+ options.add_argument "--no-sandbox"
207
+ options.binary = chrome_bin
208
+ end
209
+
210
+ Watir::Browser.new(:chrome, options: options)
97
211
  end
98
212
  end
99
213
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.10"
3
3
  end
@@ -33,5 +33,5 @@ Gem::Specification.new do |spec|
33
33
  spec.add_development_dependency "rspec", "~> 3.0"
34
34
  spec.add_dependency "watir", "~> 6.16.5"
35
35
  spec.add_dependency "webdrivers", "~> 4.0"
36
- spec.add_dependency "activesupport", "6.0.3.1"
36
+ spec.add_dependency "activesupport", "~> 6.0"
37
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-08 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -84,16 +84,16 @@ dependencies:
84
84
  name: activesupport
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - '='
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 6.0.3.1
89
+ version: '6.0'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - '='
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: 6.0.3.1
96
+ version: '6.0'
97
97
  description: Run JSON-based recipes to scrap web sites.
98
98
  email:
99
99
  - hpneo@hotmail.com
@@ -101,7 +101,7 @@ executables: []
101
101
  extensions: []
102
102
  extra_rdoc_files: []
103
103
  files:
104
- - ".github/workflows/publish_gem.yml"
104
+ - ".github/workflows/run_tests.yml"
105
105
  - ".gitignore"
106
106
  - ".rspec"
107
107
  - ".rubocop.yml"
@@ -1,17 +0,0 @@
1
- name: Publish gem
2
-
3
- on:
4
- push:
5
- tags:
6
- - '*'
7
-
8
- jobs:
9
- build:
10
-
11
- runs-on: ubuntu-latest
12
-
13
- steps:
14
- - name: Publish gem
15
- uses: dawidd6/action-publish-gem@v1.0.0
16
- with:
17
- api_key: ${{secrets.RUBYGEMS_API_KEY}}