makuri 0.2.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
- data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
3
+ metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
4
+ data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
5
5
  SHA512:
6
- metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
- data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
6
+ metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
7
+ data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
data/CHANGELOG.md CHANGED
@@ -1,10 +1,20 @@
1
+ ## 0.4.0 (2024-06-13)
2
+ - Add Ferrum support
3
+ - Remove Capybara/chrome support
4
+ - Bug fixes
5
+
6
+ ## 0.3.0 (2021-02-07)
7
+ - Allow scraping from multiple start_urls
8
+ - Add :engine support to Spider
9
+ - Add :enable_images support to :chrome engine
10
+ - Bug fixes and restructure #request_method & #request_body
11
+
1
12
  ## 0.2.0 (2021-01-25)
2
13
  - Add Capybara Support
3
14
  - Add :headless option to Browser object
4
15
  - Bug fixes & restructure Browser #follow, #request methods
5
16
 
6
17
  ## 0.1.0 (2021-01-05)
7
-
8
18
  - First Release
9
19
  - Add Basic Browser Builders for crawling web pages
10
20
  - Add Spider Framework to support scraping
@@ -1,50 +1,50 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @url = options.fetch(:url, '')
7
- @request_method = options.fetch(:request_method, :get)
8
- @request_body = options.fetch(:request_body, {})
9
-
10
- @js = options.fetch(:js, false)
6
+ @engine = options.fetch(:engine, :net_http)
11
7
  @headless = options.fetch(:headless, true)
12
8
  @user_agent = options.fetch(
13
9
  :user_agent,
14
10
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
15
11
  )
12
+
13
+ # Defaults
14
+ @request_method = :get
15
+ @request_body = {}
16
16
  end
17
17
 
18
18
  def browser
19
19
  @browser ||= create_browser
20
20
  end
21
21
 
22
- # Only allow for capybara get request
23
- def follow(current_url = '')
24
- @url = current_url unless current_url.empty?
25
- raise 'Invalid URL supplied' if url.empty?
22
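+ # Request the given url: Ferrum/Chrome engines are handed to #follow; otherwise the :method and :body options are applied before visiting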
23
+ def request(url, options = {})
24
+ @url = url
25
+ validate_url
26
26
 
27
- raise invalid_request_without_get if request_method != :get
27
+ return follow if ['Ferrum', 'Chrome'].include? browser_engine
28
28
 
29
- browser.visit(url)
30
- browser
29
+ @request_method = options.fetch(:method, request_method)
30
+ @request_body = options.fetch(:body, request_body)
31
+ browser.visit(@url)
31
32
  end
32
33
 
33
- # Allow for NetHttp get request
34
- def request(current_url = '')
35
- @url = current_url unless current_url.empty?
36
- raise 'Invalid URL supplied' if url.empty?
37
-
38
- raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
-
40
- browser.visit(url).body
34
+ def quit
35
+ browser.respond_to?(:quit) ? browser.quit : nil
41
36
  end
42
37
 
43
38
  private
44
39
 
40
+ # Visit @url and return the browser object itself (used for the Ferrum/Chrome engine names)
41
+ def follow
42
+ browser.visit(@url)
43
+ browser
44
+ end
45
+
45
46
  def create_browser
46
- engine = js ? 'Chrome' : 'NetHttp'
47
- builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
47
+ builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
48
48
  builder.new(browser_params).build
49
49
  end
50
50
 
@@ -57,8 +57,18 @@ module Makuri
57
57
  }
58
58
  end
59
59
 
60
- def invalid_request_without_get
61
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
60
+ def validate_url
61
+ uri = URI.parse(@url)
62
+ raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
63
+ end
64
+
65
+ def browser_engine
66
+ case engine
67
+ when :ferrum
68
+ 'Ferrum'
69
+ else
70
+ 'NetHttp'
71
+ end
62
72
  end
63
73
  end
64
74
  end
@@ -0,0 +1,44 @@
1
+ require 'ferrum'
2
+
3
+ module Makuri::BrowserBuilder
4
+ class Ferrum < Base
5
+ attr_accessor :headless, :html
6
+
7
+ def initialize(options = {})
8
+ super
9
+ @headless = options.fetch(:headless, true)
10
+ end
11
+
12
+ def build
13
+ headers = { 'User-Agent': user_agent }
14
+ @ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
15
+ self
16
+ end
17
+
18
+ def visit(url)
19
+ @ferrum.go_to url
20
+ @html = @ferrum.body
21
+ self
22
+ end
23
+
24
+ def quit
25
+ @ferrum.quit
26
+ end
27
+
28
+ private
29
+
30
+ # TODO:
31
+ # def browser_options
32
+ # args = %w[
33
+ # --disable-gpu
34
+ # --no-sandbox
35
+ # --disable-translate
36
+ # --ignore-certificate-errors
37
+ # ]
38
+ # args << '--headless' if headless
39
+ # args << "--user-agent=#{user_agent}"
40
+ # args << "--blink-settings=imagesEnabled=#{enable_images}"
41
+ # Selenium::WebDriver::Chrome::Options.new(args: args)
42
+ # end
43
+ end
44
+ end
@@ -2,6 +2,8 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ attr_accessor :response
6
+
5
7
  def build
6
8
  self
7
9
  end
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
12
14
 
13
15
  request = send("#{request_method}_request", uri.request_uri, headers)
14
16
 
15
- Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
17
+ @response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
16
18
  http.request(request)
17
19
  end
20
+ self
21
+ end
22
+
23
+ def html
24
+ @response.body
18
25
  end
19
26
 
20
27
  private
data/lib/makuri/spider.rb CHANGED
@@ -18,23 +18,39 @@ module Makuri
18
18
  @start_urls = urls
19
19
  end
20
20
 
21
+ def spider_options(**options)
22
+ @engine = options.fetch(:engine, :net_http)
23
+ @headless = options.fetch(:headless, true)
24
+ end
25
+
21
26
  def run
22
27
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
23
28
 
24
- new(@start_urls).parse
29
+ @engine ||= :net_http
30
+ @headless = defined?(@headless) ? @headless : true
31
+
32
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
25
33
  end
26
34
  end
27
35
 
28
- attr_accessor :start_urls, :response
36
+ attr_accessor :engine, :headless, :response
29
37
 
30
- def initialize(start_urls)
31
- @start_urls = start_urls
38
+ def initialize(**config)
39
+ @start_url = config.fetch(:start_url, nil)
40
+ @engine = config.fetch(:engine, :net_http)
41
+ @headless = config.fetch(:headless, true)
32
42
 
33
- update_response(@start_urls[0])
43
+ update_response(@start_url)
44
+ browser_quit_wrapper { parse }
34
45
  end
35
46
 
36
47
  def browser
37
- @browser ||= Makuri::Browser.new
48
+ @browser ||= Makuri::Browser.new(engine: engine, headless: headless)
49
+ end
50
+
51
+ def browser_quit_wrapper
52
+ yield
53
+ browser.quit
38
54
  end
39
55
 
40
56
  def parse
@@ -55,18 +71,22 @@ module Makuri
55
71
  end
56
72
 
57
73
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.url, relative_url).to_s
74
+ Addressable::URI.join(base_url, relative_url).to_s
59
75
  end
60
76
 
61
77
  private
62
78
 
79
+ def base_url
80
+ @start_url || browser.url.to_s
81
+ end
82
+
63
83
  def valid_url?(url)
64
84
  defined?(url) && !url.to_s.empty?
65
85
  end
66
86
 
67
87
  def update_response(url)
68
- html = browser.request absolute_url(url)
69
- @response = Nokogiri::HTML(html)
88
+ res = browser.request(absolute_url(url)).html
89
+ @response = Nokogiri::HTML(res)
70
90
  end
71
91
  end
72
92
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.2.0'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
data/lib/makuri.rb CHANGED
@@ -5,7 +5,7 @@ require 'makuri/spider'
5
5
  require 'makuri/version'
6
6
 
7
7
  require 'makuri/browser_builder/base'
8
- require 'makuri/browser_builder/chrome'
8
+ require 'makuri/browser_builder/ferrum'
9
9
  require 'makuri/browser_builder/net_http'
10
10
 
11
11
  module Makuri
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-25 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -16,84 +16,70 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.7'
19
+ version: '2.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.7'
27
- - !ruby/object:Gem::Dependency
28
- name: capybara
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.34'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.34'
26
+ version: '2.8'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: nokogiri
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: 1.10.10
33
+ version: 1.15.5
48
34
  type: :runtime
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: 1.10.10
40
+ version: 1.15.5
55
41
  - !ruby/object:Gem::Dependency
56
- name: selenium-webdriver
42
+ name: ferrum
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '3.5'
47
+ version: '0.15'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '3.5'
54
+ version: '0.15'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: bundler
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '2.0'
61
+ version: 2.5.0
76
62
  type: :development
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '2.0'
68
+ version: 2.5.0
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: webmock
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
73
  - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: '3.11'
75
+ version: '3.20'
90
76
  type: :development
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
80
  - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: '3.11'
82
+ version: '3.20'
97
83
  description:
98
84
  email: lalusaud@gmail.com
99
85
  executables: []
@@ -106,7 +92,7 @@ files:
106
92
  - lib/makuri.rb
107
93
  - lib/makuri/browser.rb
108
94
  - lib/makuri/browser_builder/base.rb
109
- - lib/makuri/browser_builder/chrome.rb
95
+ - lib/makuri/browser_builder/ferrum.rb
110
96
  - lib/makuri/browser_builder/net_http.rb
111
97
  - lib/makuri/spider.rb
112
98
  - lib/makuri/version.rb
@@ -122,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
122
108
  requirements:
123
109
  - - ">="
124
110
  - !ruby/object:Gem::Version
125
- version: '2.5'
111
+ version: '2.7'
126
112
  required_rubygems_version: !ruby/object:Gem::Requirement
127
113
  requirements:
128
114
  - - ">="
129
115
  - !ruby/object:Gem::Version
130
116
  version: '0'
131
117
  requirements: []
132
- rubygems_version: 3.1.4
118
+ rubygems_version: 3.5.11
133
119
  signing_key:
134
120
  specification_version: 4
135
121
  summary: Web-crawling framework for Ruby
@@ -1,34 +0,0 @@
1
- require 'capybara'
2
- require 'selenium-webdriver'
3
-
4
- module Makuri::BrowserBuilder
5
- class Chrome < Base
6
- def initialize(options = {})
7
- super
8
- @headless = options.fetch(:headless, true)
9
- end
10
-
11
- def build
12
- Capybara.register_driver :selenium_chrome do |app|
13
- Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
14
- end
15
- Capybara.threadsafe = true
16
- Capybara::Session.new :selenium_chrome
17
- end
18
-
19
- private
20
-
21
- def browser_options
22
- args = %w[
23
- --disable-gpu
24
- --no-sandbox
25
- --disable-translate
26
- --blink-settings=imagesEnabled=false
27
- --ignore-certificate-errors
28
- ]
29
- args << '--headless' if @headless
30
- args << "--user-agent=#{@user_agent}"
31
- Selenium::WebDriver::Chrome::Options.new(args: args)
32
- end
33
- end
34
- end