makuri 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
- data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
3
+ metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
4
+ data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
5
5
  SHA512:
6
- metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
- data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
6
+ metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
7
+ data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
data/CHANGELOG.md CHANGED
@@ -1,10 +1,20 @@
1
+ ## 0.4.0 (2024-06-13)
2
+ - Add Ferrum support
3
+ - Remove Capybara/chrome support
4
+ - Bug fixes
5
+
6
+ ## 0.3.0 (2021-02-07)
7
+ - Allow scraping from multiple start_urls
8
+ - Add :engine support to Spider
9
+ - Add :enable_images support to :chrome engine
10
+ - Bug fixes and restructure #request_method & #request_body
11
+
1
12
  ## 0.2.0 (2021-01-25)
2
13
  - Add Capybara Support
3
14
  - Add :headless option to Browser object
4
15
  - Bug fixes & restructure Browser #follow, #request methods
5
16
 
6
17
  ## 0.1.0 (2021-01-05)
7
-
8
18
  - First Release
9
19
  - Add Basic Browser Builders for crawling web pages
10
20
  - Add Spider Framework to support scraping
@@ -1,50 +1,50 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @url = options.fetch(:url, '')
7
- @request_method = options.fetch(:request_method, :get)
8
- @request_body = options.fetch(:request_body, {})
9
-
10
- @js = options.fetch(:js, false)
6
+ @engine = options.fetch(:engine, :net_http)
11
7
  @headless = options.fetch(:headless, true)
12
8
  @user_agent = options.fetch(
13
9
  :user_agent,
14
10
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
15
11
  )
12
+
13
+ # Defaults
14
+ @request_method = :get
15
+ @request_body = {}
16
16
  end
17
17
 
18
18
  def browser
19
19
  @browser ||= create_browser
20
20
  end
21
21
 
22
- # Only allow for capybara get request
23
- def follow(current_url = '')
24
- @url = current_url unless current_url.empty?
25
- raise 'Invalid URL supplied' if url.empty?
22
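+ # Request the given URL: the Ferrum engine delegates to #follow; otherwise the :method and :body options are applied before visiting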
23
+ def request(url, options = {})
24
+ @url = url
25
+ validate_url
26
26
 
27
- raise invalid_request_without_get if request_method != :get
27
+ return follow if ['Ferrum', 'Chrome'].include? browser_engine
28
28
 
29
- browser.visit(url)
30
- browser
29
+ @request_method = options.fetch(:method, request_method)
30
+ @request_body = options.fetch(:body, request_body)
31
+ browser.visit(@url)
31
32
  end
32
33
 
33
- # Allow for NetHttp get request
34
- def request(current_url = '')
35
- @url = current_url unless current_url.empty?
36
- raise 'Invalid URL supplied' if url.empty?
37
-
38
- raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
-
40
- browser.visit(url).body
34
+ def quit
35
+ browser.respond_to?(:quit) ? browser.quit : nil
41
36
  end
42
37
 
43
38
  private
44
39
 
40
+ # Visit @url with the browser engine (Ferrum) and return the browser
41
+ def follow
42
+ browser.visit(@url)
43
+ browser
44
+ end
45
+
45
46
  def create_browser
46
- engine = js ? 'Chrome' : 'NetHttp'
47
- builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
47
+ builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
48
48
  builder.new(browser_params).build
49
49
  end
50
50
 
@@ -57,8 +57,18 @@ module Makuri
57
57
  }
58
58
  end
59
59
 
60
- def invalid_request_without_get
61
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
60
+ def validate_url
61
+ uri = URI.parse(@url)
62
+ raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
63
+ end
64
+
65
+ def browser_engine
66
+ case engine
67
+ when :ferrum
68
+ 'Ferrum'
69
+ else
70
+ 'NetHttp'
71
+ end
62
72
  end
63
73
  end
64
74
  end
@@ -0,0 +1,44 @@
1
+ require 'ferrum'
2
+
3
+ module Makuri::BrowserBuilder
4
+ class Ferrum < Base
5
+ attr_accessor :headless, :html
6
+
7
+ def initialize(options = {})
8
+ super
9
+ @headless = options.fetch(:headless, true)
10
+ end
11
+
12
+ def build
13
+ headers = { 'User-Agent': user_agent }
14
+ @ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
15
+ self
16
+ end
17
+
18
+ def visit(url)
19
+ @ferrum.go_to url
20
+ @html = @ferrum.body
21
+ self
22
+ end
23
+
24
+ def quit
25
+ @ferrum.quit
26
+ end
27
+
28
+ private
29
+
30
+ # TODO:
31
+ # def browser_options
32
+ # args = %w[
33
+ # --disable-gpu
34
+ # --no-sandbox
35
+ # --disable-translate
36
+ # --ignore-certificate-errors
37
+ # ]
38
+ # args << '--headless' if headless
39
+ # args << "--user-agent=#{user_agent}"
40
+ # args << "--blink-settings=imagesEnabled=#{enable_images}"
41
+ # Selenium::WebDriver::Chrome::Options.new(args: args)
42
+ # end
43
+ end
44
+ end
@@ -2,6 +2,8 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ attr_accessor :response
6
+
5
7
  def build
6
8
  self
7
9
  end
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
12
14
 
13
15
  request = send("#{request_method}_request", uri.request_uri, headers)
14
16
 
15
- Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
17
+ @response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
16
18
  http.request(request)
17
19
  end
20
+ self
21
+ end
22
+
23
+ def html
24
+ @response.body
18
25
  end
19
26
 
20
27
  private
data/lib/makuri/spider.rb CHANGED
@@ -18,23 +18,39 @@ module Makuri
18
18
  @start_urls = urls
19
19
  end
20
20
 
21
+ def spider_options(**options)
22
+ @engine = options.fetch(:engine, :net_http)
23
+ @headless = options.fetch(:headless, true)
24
+ end
25
+
21
26
  def run
22
27
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
23
28
 
24
- new(@start_urls).parse
29
+ @engine ||= :net_http
30
+ @headless = defined?(@headless) ? @headless : true
31
+
32
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
25
33
  end
26
34
  end
27
35
 
28
- attr_accessor :start_urls, :response
36
+ attr_accessor :engine, :headless, :response
29
37
 
30
- def initialize(start_urls)
31
- @start_urls = start_urls
38
+ def initialize(**config)
39
+ @start_url = config.fetch(:start_url, nil)
40
+ @engine = config.fetch(:engine, :net_http)
41
+ @headless = config.fetch(:headless, true)
32
42
 
33
- update_response(@start_urls[0])
43
+ update_response(@start_url)
44
+ browser_quit_wrapper { parse }
34
45
  end
35
46
 
36
47
  def browser
37
- @browser ||= Makuri::Browser.new
48
+ @browser ||= Makuri::Browser.new(engine: engine, headless: headless)
49
+ end
50
+
51
+ def browser_quit_wrapper
52
+ yield
53
+ browser.quit
38
54
  end
39
55
 
40
56
  def parse
@@ -55,18 +71,22 @@ module Makuri
55
71
  end
56
72
 
57
73
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.url, relative_url).to_s
74
+ Addressable::URI.join(base_url, relative_url).to_s
59
75
  end
60
76
 
61
77
  private
62
78
 
79
+ def base_url
80
+ @start_url || browser.url.to_s
81
+ end
82
+
63
83
  def valid_url?(url)
64
84
  defined?(url) && !url.to_s.empty?
65
85
  end
66
86
 
67
87
  def update_response(url)
68
- html = browser.request absolute_url(url)
69
- @response = Nokogiri::HTML(html)
88
+ res = browser.request(absolute_url(url)).html
89
+ @response = Nokogiri::HTML(res)
70
90
  end
71
91
  end
72
92
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.2.0'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
data/lib/makuri.rb CHANGED
@@ -5,7 +5,7 @@ require 'makuri/spider'
5
5
  require 'makuri/version'
6
6
 
7
7
  require 'makuri/browser_builder/base'
8
- require 'makuri/browser_builder/chrome'
8
+ require 'makuri/browser_builder/ferrum'
9
9
  require 'makuri/browser_builder/net_http'
10
10
 
11
11
  module Makuri
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-25 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,84 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: '2.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
27
- - !ruby/object:Gem::Dependency
28
- name: capybara
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.34'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.34'
26
+ version: '2.8'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: nokogiri
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: 1.10.10
33
+ version: 1.15.5
48
34
  type: :runtime
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: 1.10.10
40
+ version: 1.15.5
55
41
  - !ruby/object:Gem::Dependency
56
- name: selenium-webdriver
42
+ name: ferrum
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '3.5'
47
+ version: '0.15'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '3.5'
54
+ version: '0.15'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: bundler
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '2.0'
61
+ version: 2.5.0
76
62
  type: :development
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '2.0'
68
+ version: 2.5.0
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: webmock
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
73
  - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: '3.11'
75
+ version: '3.20'
90
76
  type: :development
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
80
  - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: '3.11'
82
+ version: '3.20'
97
83
  description:
98
84
  email: lalusaud@gmail.com
99
85
  executables: []
@@ -106,7 +92,7 @@ files:
106
92
  - lib/makuri.rb
107
93
  - lib/makuri/browser.rb
108
94
  - lib/makuri/browser_builder/base.rb
109
- - lib/makuri/browser_builder/chrome.rb
95
+ - lib/makuri/browser_builder/ferrum.rb
110
96
  - lib/makuri/browser_builder/net_http.rb
111
97
  - lib/makuri/spider.rb
112
98
  - lib/makuri/version.rb
@@ -122,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- version: '2.5'
111
+ version: '2.7'
126
112
  required_rubygems_version: !ruby/object:Gem::Requirement
127
113
  requirements:
128
114
  - - ">="
129
115
  - !ruby/object:Gem::Version
130
116
  version: '0'
131
117
  requirements: []
132
- rubygems_version: 3.1.4
118
+ rubygems_version: 3.5.11
133
119
  signing_key:
134
120
  specification_version: 4
135
121
  summary: Web-crawling framework for Ruby
@@ -1,34 +0,0 @@
1
- require 'capybara'
2
- require 'selenium-webdriver'
3
-
4
- module Makuri::BrowserBuilder
5
- class Chrome < Base
6
- def initialize(options = {})
7
- super
8
- @headless = options.fetch(:headless, true)
9
- end
10
-
11
- def build
12
- Capybara.register_driver :selenium_chrome do |app|
13
- Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
14
- end
15
- Capybara.threadsafe = true
16
- Capybara::Session.new :selenium_chrome
17
- end
18
-
19
- private
20
-
21
- def browser_options
22
- args = %w[
23
- --disable-gpu
24
- --no-sandbox
25
- --disable-translate
26
- --blink-settings=imagesEnabled=false
27
- --ignore-certificate-errors
28
- ]
29
- args << '--headless' if @headless
30
- args << "--user-agent=#{@user_agent}"
31
- Selenium::WebDriver::Chrome::Options.new(args: args)
32
- end
33
- end
34
- end