makuri 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/makuri/browser.rb +14 -6
- data/lib/makuri/browser_builder/ferrum.rb +44 -0
- data/lib/makuri/spider.rb +12 -4
- data/lib/makuri/version.rb +1 -1
- data/lib/makuri.rb +1 -1
- metadata +16 -44
- data/lib/makuri/browser_builder/chrome.rb +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
|
4
|
+
data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
|
7
|
+
data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
|
data/CHANGELOG.md
CHANGED
data/lib/makuri/browser.rb
CHANGED
@@ -24,23 +24,26 @@ module Makuri
|
|
24
24
|
@url = url
|
25
25
|
validate_url
|
26
26
|
|
27
|
-
return follow if engine_chrome?
|
27
|
+
return follow if ['Ferrum', 'Chrome'].include? browser_engine
|
28
28
|
|
29
29
|
@request_method = options.fetch(:method, request_method)
|
30
30
|
@request_body = options.fetch(:body, request_body)
|
31
31
|
browser.visit(@url)
|
32
32
|
end
|
33
33
|
|
34
|
+
def quit
|
35
|
+
browser.respond_to?(:quit) ? browser.quit : nil
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
34
40
|
# Only allow for capybara get request
|
35
41
|
def follow
|
36
42
|
browser.visit(@url)
|
37
43
|
browser
|
38
44
|
end
|
39
45
|
|
40
|
-
private
|
41
|
-
|
42
46
|
def create_browser
|
43
|
-
browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
|
44
47
|
builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
|
45
48
|
builder.new(browser_params).build
|
46
49
|
end
|
@@ -59,8 +62,13 @@ module Makuri
|
|
59
62
|
raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
|
60
63
|
end
|
61
64
|
|
62
|
-
def engine_chrome?
|
63
|
-
engine
|
65
|
+
def browser_engine
|
66
|
+
case engine
|
67
|
+
when :ferrum
|
68
|
+
'Ferrum'
|
69
|
+
else
|
70
|
+
'NetHttp'
|
71
|
+
end
|
64
72
|
end
|
65
73
|
end
|
66
74
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'ferrum'
|
2
|
+
|
3
|
+
module Makuri::BrowserBuilder
|
4
|
+
class Ferrum < Base
|
5
|
+
attr_accessor :headless, :html
|
6
|
+
|
7
|
+
def initialize(options = {})
|
8
|
+
super
|
9
|
+
@headless = options.fetch(:headless, true)
|
10
|
+
end
|
11
|
+
|
12
|
+
def build
|
13
|
+
headers = { 'User-Agent': user_agent }
|
14
|
+
@ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
|
15
|
+
self
|
16
|
+
end
|
17
|
+
|
18
|
+
def visit(url)
|
19
|
+
@ferrum.go_to url
|
20
|
+
@html = @ferrum.body
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
def quit
|
25
|
+
@ferrum.quit
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# TODO:
|
31
|
+
# def browser_options
|
32
|
+
# args = %w[
|
33
|
+
# --disable-gpu
|
34
|
+
# --no-sandbox
|
35
|
+
# --disable-translate
|
36
|
+
# --ignore-certificate-errors
|
37
|
+
# ]
|
38
|
+
# args << '--headless' if headless
|
39
|
+
# args << "--user-agent=#{user_agent}"
|
40
|
+
# args << "--blink-settings=imagesEnabled=#{enable_images}"
|
41
|
+
# Selenium::WebDriver::Chrome::Options.new(args: args)
|
42
|
+
# end
|
43
|
+
end
|
44
|
+
end
|
data/lib/makuri/spider.rb
CHANGED
@@ -20,28 +20,37 @@ module Makuri
|
|
20
20
|
|
21
21
|
def spider_options(**options)
|
22
22
|
@engine = options.fetch(:engine, :net_http)
|
23
|
+
@headless = options.fetch(:headless, true)
|
23
24
|
end
|
24
25
|
|
25
26
|
def run
|
26
27
|
raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
|
27
28
|
|
28
29
|
@engine ||= :net_http
|
30
|
+
@headless = defined?(@headless) ? @headless : true
|
29
31
|
|
30
|
-
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
|
32
|
+
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
|
-
attr_accessor :engine, :response
|
36
|
+
attr_accessor :engine, :headless, :response
|
35
37
|
|
36
38
|
def initialize(**config)
|
37
39
|
@start_url = config.fetch(:start_url, nil)
|
38
40
|
@engine = config.fetch(:engine, :net_http)
|
41
|
+
@headless = config.fetch(:headless, true)
|
39
42
|
|
40
43
|
update_response(@start_url)
|
44
|
+
browser_quit_wrapper { parse }
|
41
45
|
end
|
42
46
|
|
43
47
|
def browser
|
44
|
-
@browser ||= Makuri::Browser.new(engine: engine)
|
48
|
+
@browser ||= Makuri::Browser.new(engine: engine, headless: headless)
|
49
|
+
end
|
50
|
+
|
51
|
+
def browser_quit_wrapper
|
52
|
+
yield
|
53
|
+
browser.quit
|
45
54
|
end
|
46
55
|
|
47
56
|
def parse
|
@@ -77,7 +86,6 @@ module Makuri
|
|
77
86
|
|
78
87
|
def update_response(url)
|
79
88
|
res = browser.request(absolute_url(url)).html
|
80
|
-
# res = res.html if @engine == :chrome
|
81
89
|
@response = Nokogiri::HTML(res)
|
82
90
|
end
|
83
91
|
end
|
data/lib/makuri/version.rb
CHANGED
data/lib/makuri.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,98 +16,70 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2.
|
19
|
+
version: '2.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: capybara
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '3.34'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '3.34'
|
26
|
+
version: '2.8'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: nokogiri
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
31
|
- - "~>"
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.
|
33
|
+
version: 1.15.5
|
48
34
|
type: :runtime
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
38
|
- - "~>"
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.
|
40
|
+
version: 1.15.5
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
42
|
+
name: ferrum
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
58
44
|
requirements:
|
59
45
|
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '0.15'
|
62
48
|
type: :runtime
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
52
|
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '0.15'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: bundler
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
59
|
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '2.0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: webdrivers
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - "~>"
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '4.0'
|
61
|
+
version: 2.5.0
|
90
62
|
type: :development
|
91
63
|
prerelease: false
|
92
64
|
version_requirements: !ruby/object:Gem::Requirement
|
93
65
|
requirements:
|
94
66
|
- - "~>"
|
95
67
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
68
|
+
version: 2.5.0
|
97
69
|
- !ruby/object:Gem::Dependency
|
98
70
|
name: webmock
|
99
71
|
requirement: !ruby/object:Gem::Requirement
|
100
72
|
requirements:
|
101
73
|
- - "~>"
|
102
74
|
- !ruby/object:Gem::Version
|
103
|
-
version: '3.
|
75
|
+
version: '3.20'
|
104
76
|
type: :development
|
105
77
|
prerelease: false
|
106
78
|
version_requirements: !ruby/object:Gem::Requirement
|
107
79
|
requirements:
|
108
80
|
- - "~>"
|
109
81
|
- !ruby/object:Gem::Version
|
110
|
-
version: '3.
|
82
|
+
version: '3.20'
|
111
83
|
description:
|
112
84
|
email: lalusaud@gmail.com
|
113
85
|
executables: []
|
@@ -120,7 +92,7 @@ files:
|
|
120
92
|
- lib/makuri.rb
|
121
93
|
- lib/makuri/browser.rb
|
122
94
|
- lib/makuri/browser_builder/base.rb
|
123
|
-
- lib/makuri/browser_builder/chrome.rb
|
95
|
+
- lib/makuri/browser_builder/ferrum.rb
|
124
96
|
- lib/makuri/browser_builder/net_http.rb
|
125
97
|
- lib/makuri/spider.rb
|
126
98
|
- lib/makuri/version.rb
|
@@ -136,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
136
108
|
requirements:
|
137
109
|
- - ">="
|
138
110
|
- !ruby/object:Gem::Version
|
139
|
-
version: '2.
|
111
|
+
version: '2.7'
|
140
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
113
|
requirements:
|
142
114
|
- - ">="
|
143
115
|
- !ruby/object:Gem::Version
|
144
116
|
version: '0'
|
145
117
|
requirements: []
|
146
|
-
rubygems_version: 3.
|
118
|
+
rubygems_version: 3.5.11
|
147
119
|
signing_key:
|
148
120
|
specification_version: 4
|
149
121
|
summary: Web-crawling framework for Ruby
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'capybara'
|
2
|
-
require 'selenium-webdriver'
|
3
|
-
|
4
|
-
module Makuri::BrowserBuilder
|
5
|
-
class Chrome < Base
|
6
|
-
attr_accessor :headless, :enable_images
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super
|
10
|
-
@headless = options.fetch(:headless, true)
|
11
|
-
@enable_images = options.fetch(:enable_images, true)
|
12
|
-
end
|
13
|
-
|
14
|
-
def build
|
15
|
-
Capybara.register_driver :selenium_chrome do |app|
|
16
|
-
Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
|
17
|
-
end
|
18
|
-
Capybara.threadsafe = true
|
19
|
-
Capybara::Session.new :selenium_chrome
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def browser_options
|
25
|
-
args = %w[
|
26
|
-
--disable-gpu
|
27
|
-
--no-sandbox
|
28
|
-
--disable-translate
|
29
|
-
--ignore-certificate-errors
|
30
|
-
]
|
31
|
-
args << '--headless' if headless
|
32
|
-
args << "--user-agent=#{user_agent}"
|
33
|
-
args << "--blink-settings=imagesEnabled=#{enable_images}"
|
34
|
-
Selenium::WebDriver::Chrome::Options.new(args: args)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|