makuri 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
4
- data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
3
+ metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
4
+ data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
5
5
  SHA512:
6
- metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
7
- data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
6
+ metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
7
+ data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.4.0 (2024-06-13)
2
+ - Add Ferrum support
3
+ - Remove Capybara/chrome support
4
+ - Bug fixes
5
+
1
6
  ## 0.3.0 (2021-02-07)
2
7
  - Allow scrape from multiple start_urls
3
8
  - Add :engine support to Spider
@@ -24,23 +24,26 @@ module Makuri
24
24
  @url = url
25
25
  validate_url
26
26
 
27
- return follow if engine_chrome?
27
+ return follow if ['Ferrum', 'Chrome'].include? browser_engine
28
28
 
29
29
  @request_method = options.fetch(:method, request_method)
30
30
  @request_body = options.fetch(:body, request_body)
31
31
  browser.visit(@url)
32
32
  end
33
33
 
34
+ def quit
35
+ browser.respond_to?(:quit) ? browser.quit : nil
36
+ end
37
+
38
+ private
39
+
34
40
  # Only allow for capybara get request
35
41
  def follow
36
42
  browser.visit(@url)
37
43
  browser
38
44
  end
39
45
 
40
- private
41
-
42
46
  def create_browser
43
- browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
44
47
  builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
45
48
  builder.new(browser_params).build
46
49
  end
@@ -59,8 +62,13 @@ module Makuri
59
62
  raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
60
63
  end
61
64
 
62
- def engine_chrome?
63
- engine == :chrome
65
+ def browser_engine
66
+ case engine
67
+ when :ferrum
68
+ 'Ferrum'
69
+ else
70
+ 'NetHttp'
71
+ end
64
72
  end
65
73
  end
66
74
  end
@@ -0,0 +1,44 @@
1
+ require 'ferrum'
2
+
3
+ module Makuri::BrowserBuilder
4
+ class Ferrum < Base
5
+ attr_accessor :headless, :html
6
+
7
+ def initialize(options = {})
8
+ super
9
+ @headless = options.fetch(:headless, true)
10
+ end
11
+
12
+ def build
13
+ headers = { 'User-Agent': user_agent }
14
+ @ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
15
+ self
16
+ end
17
+
18
+ def visit(url)
19
+ @ferrum.go_to url
20
+ @html = @ferrum.body
21
+ self
22
+ end
23
+
24
+ def quit
25
+ @ferrum.quit
26
+ end
27
+
28
+ private
29
+
30
+ # TODO:
31
+ # def browser_options
32
+ # args = %w[
33
+ # --disable-gpu
34
+ # --no-sandbox
35
+ # --disable-translate
36
+ # --ignore-certificate-errors
37
+ # ]
38
+ # args << '--headless' if headless
39
+ # args << "--user-agent=#{user_agent}"
40
+ # args << "--blink-settings=imagesEnabled=#{enable_images}"
41
+ # Selenium::WebDriver::Chrome::Options.new(args: args)
42
+ # end
43
+ end
44
+ end
data/lib/makuri/spider.rb CHANGED
@@ -20,28 +20,37 @@ module Makuri
20
20
 
21
21
  def spider_options(**options)
22
22
  @engine = options.fetch(:engine, :net_http)
23
+ @headless = options.fetch(:headless, true)
23
24
  end
24
25
 
25
26
  def run
26
27
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
27
28
 
28
29
  @engine ||= :net_http
30
+ @headless = defined?(@headless) ? @headless : true
29
31
 
30
- @start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
32
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
31
33
  end
32
34
  end
33
35
 
34
- attr_accessor :engine, :response
36
+ attr_accessor :engine, :headless, :response
35
37
 
36
38
  def initialize(**config)
37
39
  @start_url = config.fetch(:start_url, nil)
38
40
  @engine = config.fetch(:engine, :net_http)
41
+ @headless = config.fetch(:headless, true)
39
42
 
40
43
  update_response(@start_url)
44
+ browser_quit_wrapper { parse }
41
45
  end
42
46
 
43
47
  def browser
44
- @browser ||= Makuri::Browser.new(engine: engine)
48
+ @browser ||= Makuri::Browser.new(engine: engine, headless: headless)
49
+ end
50
+
51
+ def browser_quit_wrapper
52
+ yield
53
+ browser.quit
45
54
  end
46
55
 
47
56
  def parse
@@ -77,7 +86,6 @@ module Makuri
77
86
 
78
87
  def update_response(url)
79
88
  res = browser.request(absolute_url(url)).html
80
- # res = res.html if @engine == :chrome
81
89
  @response = Nokogiri::HTML(res)
82
90
  end
83
91
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.3.0'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
data/lib/makuri.rb CHANGED
@@ -5,7 +5,7 @@ require 'makuri/spider'
5
5
  require 'makuri/version'
6
6
 
7
7
  require 'makuri/browser_builder/base'
8
- require 'makuri/browser_builder/chrome'
8
+ require 'makuri/browser_builder/ferrum'
9
9
  require 'makuri/browser_builder/net_http'
10
10
 
11
11
  module Makuri
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-07 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,98 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: '2.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
27
- - !ruby/object:Gem::Dependency
28
- name: capybara
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.34'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.34'
26
+ version: '2.8'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: nokogiri
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: 1.10.10
33
+ version: 1.15.5
48
34
  type: :runtime
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: 1.10.10
40
+ version: 1.15.5
55
41
  - !ruby/object:Gem::Dependency
56
- name: selenium-webdriver
42
+ name: ferrum
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '3.5'
47
+ version: '0.15'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '3.5'
54
+ version: '0.15'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: bundler
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '2.0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '2.0'
83
- - !ruby/object:Gem::Dependency
84
- name: webdrivers
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '4.0'
61
+ version: 2.5.0
90
62
  type: :development
91
63
  prerelease: false
92
64
  version_requirements: !ruby/object:Gem::Requirement
93
65
  requirements:
94
66
  - - "~>"
95
67
  - !ruby/object:Gem::Version
96
- version: '4.0'
68
+ version: 2.5.0
97
69
  - !ruby/object:Gem::Dependency
98
70
  name: webmock
99
71
  requirement: !ruby/object:Gem::Requirement
100
72
  requirements:
101
73
  - - "~>"
102
74
  - !ruby/object:Gem::Version
103
- version: '3.11'
75
+ version: '3.20'
104
76
  type: :development
105
77
  prerelease: false
106
78
  version_requirements: !ruby/object:Gem::Requirement
107
79
  requirements:
108
80
  - - "~>"
109
81
  - !ruby/object:Gem::Version
110
- version: '3.11'
82
+ version: '3.20'
111
83
  description:
112
84
  email: lalusaud@gmail.com
113
85
  executables: []
@@ -120,7 +92,7 @@ files:
120
92
  - lib/makuri.rb
121
93
  - lib/makuri/browser.rb
122
94
  - lib/makuri/browser_builder/base.rb
123
- - lib/makuri/browser_builder/chrome.rb
95
+ - lib/makuri/browser_builder/ferrum.rb
124
96
  - lib/makuri/browser_builder/net_http.rb
125
97
  - lib/makuri/spider.rb
126
98
  - lib/makuri/version.rb
@@ -136,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
136
108
  requirements:
137
109
  - - ">="
138
110
  - !ruby/object:Gem::Version
139
- version: '2.5'
111
+ version: '2.7'
140
112
  required_rubygems_version: !ruby/object:Gem::Requirement
141
113
  requirements:
142
114
  - - ">="
143
115
  - !ruby/object:Gem::Version
144
116
  version: '0'
145
117
  requirements: []
146
- rubygems_version: 3.1.4
118
+ rubygems_version: 3.5.11
147
119
  signing_key:
148
120
  specification_version: 4
149
121
  summary: Web-crawling framework for Ruby
@@ -1,37 +0,0 @@
1
- require 'capybara'
2
- require 'selenium-webdriver'
3
-
4
- module Makuri::BrowserBuilder
5
- class Chrome < Base
6
- attr_accessor :headless, :enable_images
7
-
8
- def initialize(options = {})
9
- super
10
- @headless = options.fetch(:headless, true)
11
- @enable_images = options.fetch(:enable_images, true)
12
- end
13
-
14
- def build
15
- Capybara.register_driver :selenium_chrome do |app|
16
- Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
17
- end
18
- Capybara.threadsafe = true
19
- Capybara::Session.new :selenium_chrome
20
- end
21
-
22
- private
23
-
24
- def browser_options
25
- args = %w[
26
- --disable-gpu
27
- --no-sandbox
28
- --disable-translate
29
- --ignore-certificate-errors
30
- ]
31
- args << '--headless' if headless
32
- args << "--user-agent=#{user_agent}"
33
- args << "--blink-settings=imagesEnabled=#{enable_images}"
34
- Selenium::WebDriver::Chrome::Options.new(args: args)
35
- end
36
- end
37
- end