makuri 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/lib/makuri/browser.rb +35 -25
- data/lib/makuri/browser_builder/ferrum.rb +44 -0
- data/lib/makuri/browser_builder/net_http.rb +8 -1
- data/lib/makuri/spider.rb +29 -9
- data/lib/makuri/version.rb +1 -1
- data/lib/makuri.rb +1 -1
- metadata +16 -30
- data/lib/makuri/browser_builder/chrome.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
|
4
|
+
data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
|
7
|
+
data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
|
data/CHANGELOG.md
CHANGED
@@ -1,10 +1,20 @@
|
|
1
|
+
## 0.4.0 (2024-06-13)
|
2
|
+
- Add Ferrum support
|
3
|
+
- Remove Capybara/chrome support
|
4
|
+
- Bug fixes
|
5
|
+
|
6
|
+
## 0.3.0 (2021-02-07)
|
7
|
+
- Allow scrape from multiple start_urls
|
8
|
+
- Add :engine support to Spider
|
9
|
+
- Add :enable_images support to :chrome engine
|
10
|
+
- Bug fixes and restructure #request_method & #request_body
|
11
|
+
|
1
12
|
## 0.2.0 (2021-01-25)
|
2
13
|
- Add Capybara Support
|
3
14
|
- Add :headless option to Browser object
|
4
15
|
- Bug fixes & restructure Browser #follow, #request methods
|
5
16
|
|
6
17
|
## 0.1.0 (2021-01-05)
|
7
|
-
|
8
18
|
- First Release
|
9
19
|
- Add Basic Browser Builders for crawling web pages
|
10
20
|
- Add Spider Framework to support scraping
|
data/lib/makuri/browser.rb
CHANGED
@@ -1,50 +1,50 @@
|
|
1
1
|
module Makuri
|
2
2
|
class Browser
|
3
|
-
attr_accessor :url, :
|
3
|
+
attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
|
-
@
|
7
|
-
@request_method = options.fetch(:request_method, :get)
|
8
|
-
@request_body = options.fetch(:request_body, {})
|
9
|
-
|
10
|
-
@js = options.fetch(:js, false)
|
6
|
+
@engine = options.fetch(:engine, :net_http)
|
11
7
|
@headless = options.fetch(:headless, true)
|
12
8
|
@user_agent = options.fetch(
|
13
9
|
:user_agent,
|
14
10
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
|
15
11
|
)
|
12
|
+
|
13
|
+
# Defaults
|
14
|
+
@request_method = :get
|
15
|
+
@request_body = {}
|
16
16
|
end
|
17
17
|
|
18
18
|
def browser
|
19
19
|
@browser ||= create_browser
|
20
20
|
end
|
21
21
|
|
22
|
-
#
|
23
|
-
def
|
24
|
-
@url =
|
25
|
-
|
22
|
+
# Fetch +url+ with the configured engine; the :method and :body options are only read for non-browser engines (NetHttp)
|
23
|
+
def request(url, options = {})
|
24
|
+
@url = url
|
25
|
+
validate_url
|
26
26
|
|
27
|
-
|
27
|
+
return follow if ['Ferrum', 'Chrome'].include? browser_engine
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
@request_method = options.fetch(:method, request_method)
|
30
|
+
@request_body = options.fetch(:body, request_body)
|
31
|
+
browser.visit(@url)
|
31
32
|
end
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
@url = current_url unless current_url.empty?
|
36
|
-
raise 'Invalid URL supplied' if url.empty?
|
37
|
-
|
38
|
-
raise 'Invalid request type for js=true. Use #follow in place of #get' if js
|
39
|
-
|
40
|
-
browser.visit(url).body
|
34
|
+
def quit
|
35
|
+
browser.respond_to?(:quit) ? browser.quit : nil
|
41
36
|
end
|
42
37
|
|
43
38
|
private
|
44
39
|
|
40
|
+
# Browser-driven engines (Ferrum) only navigate to the URL; request method and body are not applied
|
41
|
+
def follow
|
42
|
+
browser.visit(@url)
|
43
|
+
browser
|
44
|
+
end
|
45
|
+
|
45
46
|
def create_browser
|
46
|
-
|
47
|
-
builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
|
47
|
+
builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
|
48
48
|
builder.new(browser_params).build
|
49
49
|
end
|
50
50
|
|
@@ -57,8 +57,18 @@ module Makuri
|
|
57
57
|
}
|
58
58
|
end
|
59
59
|
|
60
|
-
def
|
61
|
-
|
60
|
+
def validate_url
|
61
|
+
uri = URI.parse(@url)
|
62
|
+
raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
|
63
|
+
end
|
64
|
+
|
65
|
+
def browser_engine
|
66
|
+
case engine
|
67
|
+
when :ferrum
|
68
|
+
'Ferrum'
|
69
|
+
else
|
70
|
+
'NetHttp'
|
71
|
+
end
|
62
72
|
end
|
63
73
|
end
|
64
74
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'ferrum'
|
2
|
+
|
3
|
+
module Makuri::BrowserBuilder
|
4
|
+
class Ferrum < Base
|
5
|
+
attr_accessor :headless, :html
|
6
|
+
|
7
|
+
def initialize(options = {})
|
8
|
+
super
|
9
|
+
@headless = options.fetch(:headless, true)
|
10
|
+
end
|
11
|
+
|
12
|
+
def build
|
13
|
+
headers = { 'User-Agent': user_agent }
|
14
|
+
@ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
|
15
|
+
self
|
16
|
+
end
|
17
|
+
|
18
|
+
def visit(url)
|
19
|
+
@ferrum.go_to url
|
20
|
+
@html = @ferrum.body
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
def quit
|
25
|
+
@ferrum.quit
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
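# TODO: port the launch flags below to Ferrum (the sketch still uses Selenium options — presumably carried over from the removed Chrome builder)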
|
31
|
+
# def browser_options
|
32
|
+
# args = %w[
|
33
|
+
# --disable-gpu
|
34
|
+
# --no-sandbox
|
35
|
+
# --disable-translate
|
36
|
+
# --ignore-certificate-errors
|
37
|
+
# ]
|
38
|
+
# args << '--headless' if headless
|
39
|
+
# args << "--user-agent=#{user_agent}"
|
40
|
+
# args << "--blink-settings=imagesEnabled=#{enable_images}"
|
41
|
+
# Selenium::WebDriver::Chrome::Options.new(args: args)
|
42
|
+
# end
|
43
|
+
end
|
44
|
+
end
|
@@ -2,6 +2,8 @@ require 'net/http'
|
|
2
2
|
|
3
3
|
module Makuri::BrowserBuilder
|
4
4
|
class NetHttp < Base
|
5
|
+
attr_accessor :response
|
6
|
+
|
5
7
|
def build
|
6
8
|
self
|
7
9
|
end
|
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
|
|
12
14
|
|
13
15
|
request = send("#{request_method}_request", uri.request_uri, headers)
|
14
16
|
|
15
|
-
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
17
|
+
@response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
16
18
|
http.request(request)
|
17
19
|
end
|
20
|
+
self
|
21
|
+
end
|
22
|
+
|
23
|
+
def html
|
24
|
+
@response.body
|
18
25
|
end
|
19
26
|
|
20
27
|
private
|
data/lib/makuri/spider.rb
CHANGED
@@ -18,23 +18,39 @@ module Makuri
|
|
18
18
|
@start_urls = urls
|
19
19
|
end
|
20
20
|
|
21
|
+
def spider_options(**options)
|
22
|
+
@engine = options.fetch(:engine, :net_http)
|
23
|
+
@headless = options.fetch(:headless, true)
|
24
|
+
end
|
25
|
+
|
21
26
|
def run
|
22
27
|
raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
|
23
28
|
|
24
|
-
|
29
|
+
@engine ||= :net_http
|
30
|
+
@headless = defined?(@headless) ? @headless : true
|
31
|
+
|
32
|
+
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
|
25
33
|
end
|
26
34
|
end
|
27
35
|
|
28
|
-
attr_accessor :
|
36
|
+
attr_accessor :engine, :headless, :response
|
29
37
|
|
30
|
-
def initialize(
|
31
|
-
@
|
38
|
+
def initialize(**config)
|
39
|
+
@start_url = config.fetch(:start_url, nil)
|
40
|
+
@engine = config.fetch(:engine, :net_http)
|
41
|
+
@headless = config.fetch(:headless, true)
|
32
42
|
|
33
|
-
update_response(@
|
43
|
+
update_response(@start_url)
|
44
|
+
browser_quit_wrapper { parse }
|
34
45
|
end
|
35
46
|
|
36
47
|
def browser
|
37
|
-
@browser ||= Makuri::Browser.new
|
48
|
+
@browser ||= Makuri::Browser.new(engine: engine, headless: headless)
|
49
|
+
end
|
50
|
+
|
51
|
+
def browser_quit_wrapper
|
52
|
+
yield
|
53
|
+
browser.quit
|
38
54
|
end
|
39
55
|
|
40
56
|
def parse
|
@@ -55,18 +71,22 @@ module Makuri
|
|
55
71
|
end
|
56
72
|
|
57
73
|
def absolute_url(relative_url)
|
58
|
-
Addressable::URI.join(
|
74
|
+
Addressable::URI.join(base_url, relative_url).to_s
|
59
75
|
end
|
60
76
|
|
61
77
|
private
|
62
78
|
|
79
|
+
def base_url
|
80
|
+
@start_url || browser.url.to_s
|
81
|
+
end
|
82
|
+
|
63
83
|
def valid_url?(url)
|
64
84
|
defined?(url) && !url.to_s.empty?
|
65
85
|
end
|
66
86
|
|
67
87
|
def update_response(url)
|
68
|
-
|
69
|
-
@response = Nokogiri::HTML(
|
88
|
+
res = browser.request(absolute_url(url)).html
|
89
|
+
@response = Nokogiri::HTML(res)
|
70
90
|
end
|
71
91
|
end
|
72
92
|
end
|
data/lib/makuri/version.rb
CHANGED
data/lib/makuri.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,84 +16,70 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2.
|
19
|
+
version: '2.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: capybara
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '3.34'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '3.34'
|
26
|
+
version: '2.8'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: nokogiri
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
31
|
- - "~>"
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.
|
33
|
+
version: 1.15.5
|
48
34
|
type: :runtime
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
38
|
- - "~>"
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.
|
40
|
+
version: 1.15.5
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
42
|
+
name: ferrum
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
58
44
|
requirements:
|
59
45
|
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '0.15'
|
62
48
|
type: :runtime
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
52
|
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '0.15'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: bundler
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
59
|
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
61
|
+
version: 2.5.0
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
68
|
+
version: 2.5.0
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: webmock
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
86
72
|
requirements:
|
87
73
|
- - "~>"
|
88
74
|
- !ruby/object:Gem::Version
|
89
|
-
version: '3.
|
75
|
+
version: '3.20'
|
90
76
|
type: :development
|
91
77
|
prerelease: false
|
92
78
|
version_requirements: !ruby/object:Gem::Requirement
|
93
79
|
requirements:
|
94
80
|
- - "~>"
|
95
81
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3.
|
82
|
+
version: '3.20'
|
97
83
|
description:
|
98
84
|
email: lalusaud@gmail.com
|
99
85
|
executables: []
|
@@ -106,7 +92,7 @@ files:
|
|
106
92
|
- lib/makuri.rb
|
107
93
|
- lib/makuri/browser.rb
|
108
94
|
- lib/makuri/browser_builder/base.rb
|
109
|
-
- lib/makuri/browser_builder/
|
95
|
+
- lib/makuri/browser_builder/ferrum.rb
|
110
96
|
- lib/makuri/browser_builder/net_http.rb
|
111
97
|
- lib/makuri/spider.rb
|
112
98
|
- lib/makuri/version.rb
|
@@ -122,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
122
108
|
requirements:
|
123
109
|
- - ">="
|
124
110
|
- !ruby/object:Gem::Version
|
125
|
-
version: '2.
|
111
|
+
version: '2.7'
|
126
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
127
113
|
requirements:
|
128
114
|
- - ">="
|
129
115
|
- !ruby/object:Gem::Version
|
130
116
|
version: '0'
|
131
117
|
requirements: []
|
132
|
-
rubygems_version: 3.
|
118
|
+
rubygems_version: 3.5.11
|
133
119
|
signing_key:
|
134
120
|
specification_version: 4
|
135
121
|
summary: Web-crawling framework for Ruby
|
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'capybara'
|
2
|
-
require 'selenium-webdriver'
|
3
|
-
|
4
|
-
module Makuri::BrowserBuilder
|
5
|
-
class Chrome < Base
|
6
|
-
def initialize(options = {})
|
7
|
-
super
|
8
|
-
@headless = options.fetch(:headless, true)
|
9
|
-
end
|
10
|
-
|
11
|
-
def build
|
12
|
-
Capybara.register_driver :selenium_chrome do |app|
|
13
|
-
Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
|
14
|
-
end
|
15
|
-
Capybara.threadsafe = true
|
16
|
-
Capybara::Session.new :selenium_chrome
|
17
|
-
end
|
18
|
-
|
19
|
-
private
|
20
|
-
|
21
|
-
def browser_options
|
22
|
-
args = %w[
|
23
|
-
--disable-gpu
|
24
|
-
--no-sandbox
|
25
|
-
--disable-translate
|
26
|
-
--blink-settings=imagesEnabled=false
|
27
|
-
--ignore-certificate-errors
|
28
|
-
]
|
29
|
-
args << '--headless' if @headless
|
30
|
-
args << "--user-agent=#{@user_agent}"
|
31
|
-
Selenium::WebDriver::Chrome::Options.new(args: args)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|