scrap_kit 0.1.4 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 06df78782e4cc3aa25ed9f90c1536d935c3fcd08bae82b33e4d79e68e805a09d
4
- data.tar.gz: ddee1afb1a30778818f1b688d8880b5c383f804e8fc8660be89461c98730f869
3
+ metadata.gz: ab218fee7bbf85145afd031bbe59a0820b93eb1e57ee27d50163aa25dbcb8e46
4
+ data.tar.gz: 4d304049b137f2750b3843c80ff925d872576459627bebac443e395d5e897749
5
5
  SHA512:
6
- metadata.gz: 90097dfa2f36f4379117d8487046acdeef98d257f89ca6041064e5d0a3305b52e034a6cacb4d29f1cf1d909afb302dfdd2cd2c6ab7b01755786585dbb217a554
7
- data.tar.gz: f0b8e7a588ac8d195d078af941a335913d547aac7232a3fa473f5b43b55f898e4ed12d28da3ea3488f3d390e43ac561500c585fc030d31ecf1779b9006869227
6
+ metadata.gz: 5e2dd47d6b77ac3983efeb99e351a5e26b2737c1aa0c77972213a744ce44c6033dc2ae3498aca40f68471010b33b6fc3aeb1bf03f922ffa6e95aa0d715493722
7
+ data.tar.gz: bc92e505d8ffca95d621ad8756350ae174bc316efbd015724d87478364c9d500682a4da765595c9158bc56ed3fd855e9827dc4f357fbea64901fdb53665c32b1
@@ -0,0 +1,20 @@
1
+ name: Run tests
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v1
12
+ - name: Setup Ruby
13
+ uses: actions/setup-ruby@v1
14
+ with:
15
+ ruby-version: '2.7.1'
16
+ - name: Build and run tests
17
+ run: |
18
+ gem install bundler
19
+ bundle update --conservative --jobs 4 --retry 3
20
+ bundle exec rake spec
@@ -1,5 +1,32 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.1.9] 2020-08-31
4
+
5
+ ### Changed/Added
6
+ - Set arguments for Chrome driver
7
+
8
+ ## [0.1.8] 2020-08-29
9
+
10
+ ### Changed/Added
11
+ - Add new ways to declare selectors
12
+ - Add steps
13
+
14
+ ## [0.1.7] 2020-08-28
15
+
16
+ ### Changed/Added
17
+ - Match selector condition by regexp or exact value
18
+
19
+ ## [0.1.6] 2020-08-28
20
+
21
+ ### Changed/Added
22
+ - Update activesupport requirement from = 6.0.3.1 to ~> 6.0
23
+ - Fix bug when matching selector condition
24
+
25
+ ## [0.1.5] 2020-08-08
26
+
27
+ ### Changed/Added
28
+ - Add `webdrivers` as dependency
29
+
3
30
  ## [0.1.4] 2020-07-12
4
31
 
5
32
  ### Changed/Added
@@ -1,25 +1,29 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scrap_kit (0.1.4)
5
- activesupport (= 6.0.3.1)
4
+ scrap_kit (0.1.9)
5
+ activesupport (~> 6.0)
6
6
  watir (~> 6.16.5)
7
+ webdrivers (~> 4.0)
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- activesupport (6.0.3.1)
12
+ activesupport (6.0.3.2)
12
13
  concurrent-ruby (~> 1.0, >= 1.0.2)
13
14
  i18n (>= 0.7, < 2)
14
15
  minitest (~> 5.1)
15
16
  tzinfo (~> 1.1)
16
17
  zeitwerk (~> 2.2, >= 2.2.2)
17
18
  childprocess (3.0.0)
18
- concurrent-ruby (1.1.6)
19
+ concurrent-ruby (1.1.7)
19
20
  diff-lcs (1.3)
20
- i18n (1.8.3)
21
+ i18n (1.8.5)
21
22
  concurrent-ruby (~> 1.0)
23
+ mini_portile2 (2.4.0)
22
24
  minitest (5.14.1)
25
+ nokogiri (1.10.10)
26
+ mini_portile2 (~> 2.4.0)
23
27
  rake (13.0.1)
24
28
  regexp_parser (1.7.1)
25
29
  rspec (3.9.0)
@@ -45,7 +49,11 @@ GEM
45
49
  watir (6.16.5)
46
50
  regexp_parser (~> 1.2)
47
51
  selenium-webdriver (~> 3.6)
48
- zeitwerk (2.3.1)
52
+ webdrivers (4.4.1)
53
+ nokogiri (~> 1.6)
54
+ rubyzip (>= 1.3.0)
55
+ selenium-webdriver (>= 3.0, < 4.0)
56
+ zeitwerk (2.4.0)
49
57
 
50
58
  PLATFORMS
51
59
  ruby
data/README.md CHANGED
@@ -81,6 +81,66 @@ output = recipe.run
81
81
  #=> {:posts=>[{:title=>"APIs de Internacionalización en JavaScript"}, {:title=>"Ejecutando comandos desde Ruby"}, {:title=>"Usando Higher-Order Components"}]}
82
82
  ```
83
83
 
84
+ ### Working with selectors
85
+
86
+ Each attribute can be mapped to a selector, which can be any of the following types:
87
+
88
+ * A string, which represents a CSS selector.
89
+
90
+ ```ruby
91
+ ".subnav__inner .ember-view:nth-child(1) > .status-summary__description"
92
+ ```
93
+
94
+ * A hash, which can have any of the following options:
95
+ * `xpath: [String]`
96
+ * `css: [String]`
97
+ * `index: [Integer]`
98
+ * `tag_name: [String]`
99
+ * `text: [String]`
100
+
101
+ ```ruby
102
+ { text: "View Archive" }
103
+ ```
104
+
105
+ * An array, which represents a path of selectors, where its last item must be a hash that matches a selector with an expected value.
106
+
107
+ ```ruby
108
+ [".up-time-chart", { ".region-header .u-margin-Tm": "REGION" }]
109
+ ```
110
+
111
+ Use any of them as it suits you best.
112
+
113
+ ### Writing steps
114
+
115
+ Recipes can have a `steps` entry. This entry defines the preliminary actions the scraper has to perform before extracting the attributes. The following steps are supported:
116
+
117
+ * **`goto`**: It instructs the scraper to go to a link inside the current page. Its value can be a hash or array selector, or a URL:
118
+
119
+ ```ruby
120
+ {
121
+ goto: { text: "View Archive" }
122
+ }
123
+ ```
124
+
125
+ * **`click`**: It instructs the scraper to click on an element inside the current page. Its value can be a hash or array selector:
126
+
127
+ ```ruby
128
+ {
129
+ click: { css: "[type=submit]" }
130
+ }
131
+ ```
132
+
133
+ * **`fill_form`**: It instructs the scraper to fill a form or any form field inside the current page. Its value is a hash where the keys are either an input's name or a CSS selector, and the values are the values to be entered into those fields:
134
+
135
+ ```ruby
136
+ {
137
+ fill_form: {
138
+ gem_name: "ScrapKit",
139
+ author: "hpneo",
140
+ }
141
+ }
142
+ ```
143
+
84
144
  ## Development
85
145
 
86
146
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -1,8 +1,23 @@
1
1
  require "active_support/core_ext/hash"
2
+ require "webdrivers/chromedriver"
2
3
  require "watir"
3
4
 
4
5
  module ScrapKit
5
6
  class Recipe
7
+ class << self
8
+ def load(source)
9
+ input = if source.is_a?(Hash)
10
+ source
11
+ elsif source.is_a?(IO)
12
+ JSON.parse(source.read)
13
+ else
14
+ JSON.parse(File.read(source))
15
+ end
16
+
17
+ new(input.deep_symbolize_keys)
18
+ end
19
+ end
20
+
6
21
  def initialize(url: nil, steps: [], attributes: {})
7
22
  @url = url
8
23
  @steps = steps
@@ -12,42 +27,63 @@ module ScrapKit
12
27
  def run
13
28
  output = {}
14
29
 
15
- browser = Watir::Browser.new(:chrome, headless: true)
16
- browser.goto @url
30
+ @browser = create_browser
31
+ @browser.goto @url
17
32
 
18
33
  @steps.each do |step|
19
- run_step(browser, step)
34
+ run_step(step)
20
35
  end
21
36
 
22
37
  @attributes.each do |attribute_name, selector|
23
- output[attribute_name] = extract_attribute(browser, selector)
38
+ output[attribute_name] = extract_attribute(@browser, selector)
24
39
  end
25
40
 
26
- browser.close
27
- browser = nil
41
+ @browser.close
42
+ @browser = nil
28
43
 
29
44
  output
30
45
  end
31
46
 
32
- def run_step(browser, step)
47
+ def run_step(step)
48
+ return goto(step[:goto]) if step[:goto]
49
+ return click(step[:click]) if step[:click]
50
+ return fill_form(step[:fill_form]) if step[:fill_form]
51
+
52
+ nil
53
+ end
54
+
55
+ def find_element_by_name_or_selector(browser_or_element, name_or_selector)
56
+ element = browser_or_element.element(name: name_or_selector.to_s)
57
+ return element if element.exists?
58
+
59
+ element = browser_or_element.element(css: name_or_selector.to_s)
60
+ return element if element.exists?
61
+
62
+ nil
33
63
  end
34
64
 
35
65
  def elements_from_selector(browser_or_element, selector)
36
66
  if selector.is_a?(String)
37
67
  browser_or_element.elements(css: selector)
68
+ elsif selector.is_a?(Hash)
69
+ browser_or_element.elements(selector)
38
70
  elsif selector.is_a?(Array)
39
71
  *remainder, condition = selector
72
+ condition_key, condition_value = condition.first
40
73
  elements = browser_or_element
41
74
 
42
- remainder.each do |item|
43
- elements = elements.elements(css: item)
75
+ if remainder.empty?
76
+ elements = elements.elements(css: condition_key.to_s)
77
+ else
78
+ remainder.each do |item|
79
+ elements = elements.elements(css: item)
80
+ end
44
81
  end
45
82
 
46
83
  elements.filter do |element|
47
- condition_key = condition.keys[0].to_s
48
- condition_value = condition.values[0]
49
- found_element = element.element(css: condition_key)
50
- extract_value_from_element(found_element)&.match(condition_value)
84
+ found_element = element.element(css: condition_key.to_s)
85
+ extracted_value = extract_value_from_element(found_element)
86
+ extracted_value.match(condition_value) || extracted_value == condition_value
51
87
  end
52
88
  end
53
89
  end
@@ -62,37 +98,114 @@ module ScrapKit
62
98
  element&.text_content
63
99
  end
64
100
 
65
- def extract_attribute(browser_or_element, selector_or_hash)
66
- if selector_or_hash.is_a?(String)
67
- extract_value_from_element(browser_or_element.element(css: selector_or_hash))
68
- elsif selector_or_hash.is_a?(Hash)
69
- selector = selector_or_hash[:selector]
70
- selector_for_children_attributes = selector_or_hash[:children_attributes]
101
+ def extract_attribute(browser_or_element, selector_or_object)
102
+ if selector_or_object.is_a?(String)
103
+ extract_value_from_element(browser_or_element.element(css: selector_or_object))
104
+ elsif selector_or_object.is_a?(Array)
105
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
71
106
 
72
- elements_from_selector(browser_or_element, selector).map do |element|
73
- output = {}
107
+ if found_elements.size === 1
108
+ extract_value_from_element(found_elements.first)
109
+ else
110
+ found_elements.map do |element|
111
+ extract_value_from_element(element)
112
+ end
113
+ end
114
+ elsif selector_or_object.is_a?(Hash)
115
+ if selector_or_object[:selector] && selector_or_object[:children_attributes]
116
+ selector = selector_or_object[:selector]
117
+ selector_for_children_attributes = selector_or_object[:children_attributes]
118
+
119
+ elements_from_selector(browser_or_element, selector).map do |element|
120
+ output = {}
74
121
 
75
- selector_for_children_attributes.each do |child_attribute_name, child_selector|
76
- output[child_attribute_name] = extract_attribute(element, child_selector)
122
+ selector_for_children_attributes.each do |child_attribute_name, child_selector|
123
+ output[child_attribute_name] = extract_attribute(element, child_selector)
124
+ end
125
+
126
+ output
77
127
  end
128
+ else
129
+ found_elements = elements_from_selector(browser_or_element, selector_or_object)
130
+
131
+ if found_elements.size === 1
132
+ extract_value_from_element(found_elements.first)
133
+ else
134
+ found_elements.map do |element|
135
+ extract_value_from_element(element)
136
+ end
137
+ end
138
+ end
139
+ end
140
+ end
141
+
142
+ private
78
143
 
79
- output
144
+ def goto(link_or_selector)
145
+ if link_or_selector.is_a?(String)
146
+ @browser.goto(link_or_selector)
147
+ elsif link_or_selector.is_a?(Array) || link_or_selector.is_a?(Hash)
148
+ if found_element = elements_from_selector(@browser, link_or_selector).first
149
+ found_element.click
80
150
  end
81
151
  end
152
+
153
+ sleep 0.5
154
+ @browser.wait_until do
155
+ @browser.ready_state == "complete"
156
+ end
157
+ rescue
158
+ nil
82
159
  end
83
160
 
84
- class << self
85
- def load(source)
86
- input = if source.is_a?(Hash)
87
- source
88
- elsif source.is_a?(IO)
89
- JSON.parse(source.read)
90
- else
91
- JSON.parse(File.read(source))
161
+ def click(selector)
162
+ if selector.is_a?(Array) || selector.is_a?(Hash)
163
+ if found_element = elements_from_selector(@browser, selector).first
164
+ found_element.click
92
165
  end
166
+ end
93
167
 
94
- new(input.deep_symbolize_keys)
168
+ sleep 1
169
+ @browser.wait_until do
170
+ @browser.ready_state == "complete"
95
171
  end
172
+
173
+ rescue
174
+ nil
175
+ end
176
+
177
+ def fill_form(form_data)
178
+ form_data.each do |name, value|
179
+ if element = find_element_by_name_or_selector(@browser.body, name.to_s)
180
+ element = element.to_subtype
181
+
182
+ if element.respond_to?(:set)
183
+ element.set(value)
184
+ elsif element.respond_to?(:select)
185
+ element.select(value)
186
+ end
187
+ end
188
+ end
189
+
190
+ sleep 0.25
191
+ @browser.wait_until do
192
+ @browser.ready_state == "complete"
193
+ end
194
+ end
195
+
196
+ def create_browser
197
+ options = Selenium::WebDriver::Chrome::Options.new
198
+
199
+ options.add_argument "--headless"
200
+ options.add_argument "--window-size=1080x720"
201
+ options.add_argument "--hide-scrollbars"
202
+
203
+ if chrome_bin = ENV["GOOGLE_CHROME_SHIM"]
204
+ options.add_argument "--no-sandbox"
205
+ options.binary = chrome_bin
206
+ end
207
+
208
+ Watir::Browser.new(:chrome, options: options)
96
209
  end
97
210
  end
98
211
  end
@@ -1,3 +1,3 @@
1
1
  module ScrapKit
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -32,5 +32,6 @@ Gem::Specification.new do |spec|
32
32
  spec.add_development_dependency "rake", "~> 13.0"
33
33
  spec.add_development_dependency "rspec", "~> 3.0"
34
34
  spec.add_dependency "watir", "~> 6.16.5"
35
- spec.add_dependency "activesupport", "6.0.3.1"
35
+ spec.add_dependency "webdrivers", "~> 4.0"
36
+ spec.add_dependency "activesupport", "~> 6.0"
36
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrap_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gustavo Leon
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-12 00:00:00.000000000 Z
11
+ date: 2020-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,34 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 6.16.5
69
+ - !ruby/object:Gem::Dependency
70
+ name: webdrivers
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: activesupport
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - '='
87
+ - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: 6.0.3.1
89
+ version: '6.0'
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - '='
94
+ - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: 6.0.3.1
96
+ version: '6.0'
83
97
  description: Run JSON-based recipes to scrap web sites.
84
98
  email:
85
99
  - hpneo@hotmail.com
@@ -87,7 +101,7 @@ executables: []
87
101
  extensions: []
88
102
  extra_rdoc_files: []
89
103
  files:
90
- - ".github/workflows/publish_gem.yml"
104
+ - ".github/workflows/run_tests.yml"
91
105
  - ".gitignore"
92
106
  - ".rspec"
93
107
  - ".rubocop.yml"
@@ -1,17 +0,0 @@
1
- name: Publish gem
2
-
3
- on:
4
- push:
5
- tags:
6
- - '*'
7
-
8
- jobs:
9
- build:
10
-
11
- runs-on: ubuntu-latest
12
-
13
- steps:
14
- - name: Publish gem
15
- uses: dawidd6/action-publish-gem@v1.0.0
16
- with:
17
- api_key: ${{secrets.RUBYGEMS_API_KEY}}