makuri 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
4
- data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
3
+ metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
4
+ data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
5
5
  SHA512:
6
- metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
7
- data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
6
+ metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
7
+ data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.4.0 (2024-06-13)
2
+ - Add Ferrum support
3
+ - Remove Capybara/Chrome support
4
+ - Bug fixes
5
+
1
6
  ## 0.3.0 (2021-02-07)
2
7
  - Allow scrape from multiple start_urls
3
8
  - Add :engine support to Spider
data/lib/makuri/browser.rb CHANGED
@@ -24,23 +24,26 @@ module Makuri
24
24
  @url = url
25
25
  validate_url
26
26
 
27
- return follow if engine_chrome?
27
+ return follow if ['Ferrum', 'Chrome'].include? browser_engine
28
28
 
29
29
  @request_method = options.fetch(:method, request_method)
30
30
  @request_body = options.fetch(:body, request_body)
31
31
  browser.visit(@url)
32
32
  end
33
33
 
34
+ def quit
35
+ browser.respond_to?(:quit) ? browser.quit : nil
36
+ end
37
+
38
+ private
39
+
34
40
  # Browser-driven engines only issue GET requests; the :method and :body options are not applied
35
41
  def follow
36
42
  browser.visit(@url)
37
43
  browser
38
44
  end
39
45
 
40
- private
41
-
42
46
  def create_browser
43
- browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
44
47
  builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
45
48
  builder.new(browser_params).build
46
49
  end
@@ -59,8 +62,13 @@ module Makuri
59
62
  raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
60
63
  end
61
64
 
62
- def engine_chrome?
63
- engine == :chrome
65
+ def browser_engine
66
+ case engine
67
+ when :ferrum
68
+ 'Ferrum'
69
+ else
70
+ 'NetHttp'
71
+ end
64
72
  end
65
73
  end
66
74
  end
@@ -0,0 +1,44 @@
1
+ require 'ferrum'
2
+
3
+ module Makuri::BrowserBuilder
4
+ class Ferrum < Base
5
+ attr_accessor :headless, :html
6
+
7
+ def initialize(options = {})
8
+ super
9
+ @headless = options.fetch(:headless, true)
10
+ end
11
+
12
+ def build
13
+ headers = { 'User-Agent': user_agent }
14
+ @ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
15
+ self
16
+ end
17
+
18
+ def visit(url)
19
+ @ferrum.go_to url
20
+ @html = @ferrum.body
21
+ self
22
+ end
23
+
24
+ def quit
25
+ @ferrum.quit
26
+ end
27
+
28
+ private
29
+
30
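+ # TODO: port the launch flags below (carried over from the removed Selenium Chrome builder) to Ferrum's browser options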
31
+ # def browser_options
32
+ # args = %w[
33
+ # --disable-gpu
34
+ # --no-sandbox
35
+ # --disable-translate
36
+ # --ignore-certificate-errors
37
+ # ]
38
+ # args << '--headless' if headless
39
+ # args << "--user-agent=#{user_agent}"
40
+ # args << "--blink-settings=imagesEnabled=#{enable_images}"
41
+ # Selenium::WebDriver::Chrome::Options.new(args: args)
42
+ # end
43
+ end
44
+ end
data/lib/makuri/spider.rb CHANGED
@@ -20,28 +20,37 @@ module Makuri
20
20
 
21
21
  def spider_options(**options)
22
22
  @engine = options.fetch(:engine, :net_http)
23
+ @headless = options.fetch(:headless, true)
23
24
  end
24
25
 
25
26
  def run
26
27
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
27
28
 
28
29
  @engine ||= :net_http
30
+ @headless = defined?(@headless) ? @headless : true
29
31
 
30
- @start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
32
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
31
33
  end
32
34
  end
33
35
 
34
- attr_accessor :engine, :response
36
+ attr_accessor :engine, :headless, :response
35
37
 
36
38
  def initialize(**config)
37
39
  @start_url = config.fetch(:start_url, nil)
38
40
  @engine = config.fetch(:engine, :net_http)
41
+ @headless = config.fetch(:headless, true)
39
42
 
40
43
  update_response(@start_url)
44
+ browser_quit_wrapper { parse }
41
45
  end
42
46
 
43
47
  def browser
44
- @browser ||= Makuri::Browser.new(engine: engine)
48
+ @browser ||= Makuri::Browser.new(engine: engine, headless: headless)
49
+ end
50
+
51
+ def browser_quit_wrapper
52
+ yield
53
+ browser.quit
45
54
  end
46
55
 
47
56
  def parse
@@ -77,7 +86,6 @@ module Makuri
77
86
 
78
87
  def update_response(url)
79
88
  res = browser.request(absolute_url(url)).html
80
- # res = res.html if @engine == :chrome
81
89
  @response = Nokogiri::HTML(res)
82
90
  end
83
91
  end
data/lib/makuri/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.3.0'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
data/lib/makuri.rb CHANGED
@@ -5,7 +5,7 @@ require 'makuri/spider'
5
5
  require 'makuri/version'
6
6
 
7
7
  require 'makuri/browser_builder/base'
8
- require 'makuri/browser_builder/chrome'
8
+ require 'makuri/browser_builder/ferrum'
9
9
  require 'makuri/browser_builder/net_http'
10
10
 
11
11
  module Makuri
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-07 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,98 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: '2.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
27
- - !ruby/object:Gem::Dependency
28
- name: capybara
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.34'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.34'
26
+ version: '2.8'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: nokogiri
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: 1.10.10
33
+ version: 1.15.5
48
34
  type: :runtime
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: 1.10.10
40
+ version: 1.15.5
55
41
  - !ruby/object:Gem::Dependency
56
- name: selenium-webdriver
42
+ name: ferrum
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '3.5'
47
+ version: '0.15'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '3.5'
54
+ version: '0.15'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: bundler
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '2.0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: '2.0'
83
- - !ruby/object:Gem::Dependency
84
- name: webdrivers
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - "~>"
88
- - !ruby/object:Gem::Version
89
- version: '4.0'
61
+ version: 2.5.0
90
62
  type: :development
91
63
  prerelease: false
92
64
  version_requirements: !ruby/object:Gem::Requirement
93
65
  requirements:
94
66
  - - "~>"
95
67
  - !ruby/object:Gem::Version
96
- version: '4.0'
68
+ version: 2.5.0
97
69
  - !ruby/object:Gem::Dependency
98
70
  name: webmock
99
71
  requirement: !ruby/object:Gem::Requirement
100
72
  requirements:
101
73
  - - "~>"
102
74
  - !ruby/object:Gem::Version
103
- version: '3.11'
75
+ version: '3.20'
104
76
  type: :development
105
77
  prerelease: false
106
78
  version_requirements: !ruby/object:Gem::Requirement
107
79
  requirements:
108
80
  - - "~>"
109
81
  - !ruby/object:Gem::Version
110
- version: '3.11'
82
+ version: '3.20'
111
83
  description:
112
84
  email: lalusaud@gmail.com
113
85
  executables: []
@@ -120,7 +92,7 @@ files:
120
92
  - lib/makuri.rb
121
93
  - lib/makuri/browser.rb
122
94
  - lib/makuri/browser_builder/base.rb
123
- - lib/makuri/browser_builder/chrome.rb
95
+ - lib/makuri/browser_builder/ferrum.rb
124
96
  - lib/makuri/browser_builder/net_http.rb
125
97
  - lib/makuri/spider.rb
126
98
  - lib/makuri/version.rb
@@ -136,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
136
108
  requirements:
137
109
  - - ">="
138
110
  - !ruby/object:Gem::Version
139
- version: '2.5'
111
+ version: '2.7'
140
112
  required_rubygems_version: !ruby/object:Gem::Requirement
141
113
  requirements:
142
114
  - - ">="
143
115
  - !ruby/object:Gem::Version
144
116
  version: '0'
145
117
  requirements: []
146
- rubygems_version: 3.1.4
118
+ rubygems_version: 3.5.11
147
119
  signing_key:
148
120
  specification_version: 4
149
121
  summary: Web-crawling framework for Ruby
@@ -1,37 +0,0 @@
1
- require 'capybara'
2
- require 'selenium-webdriver'
3
-
4
- module Makuri::BrowserBuilder
5
- class Chrome < Base
6
- attr_accessor :headless, :enable_images
7
-
8
- def initialize(options = {})
9
- super
10
- @headless = options.fetch(:headless, true)
11
- @enable_images = options.fetch(:enable_images, true)
12
- end
13
-
14
- def build
15
- Capybara.register_driver :selenium_chrome do |app|
16
- Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
17
- end
18
- Capybara.threadsafe = true
19
- Capybara::Session.new :selenium_chrome
20
- end
21
-
22
- private
23
-
24
- def browser_options
25
- args = %w[
26
- --disable-gpu
27
- --no-sandbox
28
- --disable-translate
29
- --ignore-certificate-errors
30
- ]
31
- args << '--headless' if headless
32
- args << "--user-agent=#{user_agent}"
33
- args << "--blink-settings=imagesEnabled=#{enable_images}"
34
- Selenium::WebDriver::Chrome::Options.new(args: args)
35
- end
36
- end
37
- end