makuri 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/makuri/browser.rb +14 -6
- data/lib/makuri/browser_builder/ferrum.rb +44 -0
- data/lib/makuri/spider.rb +12 -4
- data/lib/makuri/version.rb +1 -1
- data/lib/makuri.rb +1 -1
- metadata +16 -44
- data/lib/makuri/browser_builder/chrome.rb +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '08d39eda46b02f5f6894d581eb0d837ac4680f4c2f6239b6f585a4c187ed44d7'
|
4
|
+
data.tar.gz: 433093581b01eb5327e1bb8e6dd6653f0b51fcfdae54276a914339b0bff61b2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: daaf82155b49cc1efe7da7d092c617fb47e8948758108afbea79a5d484a08757781a2f156d066946d821c0c25b5984693b69dbdde764bf2dd14c07a9997d3112
|
7
|
+
data.tar.gz: e8655c684cfa1087d1046a7cf0447f8a4b52ee8d17c6b25028c78d8f50bfa8df0a076ecd38f0cb504311dc1b55bae53b199fea2bc5982b65bf5b9b9d10221e33
|
data/CHANGELOG.md
CHANGED
data/lib/makuri/browser.rb
CHANGED
@@ -24,23 +24,26 @@ module Makuri
|
|
24
24
|
@url = url
|
25
25
|
validate_url
|
26
26
|
|
27
|
-
return follow if
|
27
|
+
return follow if ['Ferrum', 'Chrome'].include? browser_engine
|
28
28
|
|
29
29
|
@request_method = options.fetch(:method, request_method)
|
30
30
|
@request_body = options.fetch(:body, request_body)
|
31
31
|
browser.visit(@url)
|
32
32
|
end
|
33
33
|
|
34
|
+
def quit
|
35
|
+
browser.respond_to?(:quit) ? browser.quit : nil
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
34
40
|
# Only allow for capybara get request
|
35
41
|
def follow
|
36
42
|
browser.visit(@url)
|
37
43
|
browser
|
38
44
|
end
|
39
45
|
|
40
|
-
private
|
41
|
-
|
42
46
|
def create_browser
|
43
|
-
browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
|
44
47
|
builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
|
45
48
|
builder.new(browser_params).build
|
46
49
|
end
|
@@ -59,8 +62,13 @@ module Makuri
|
|
59
62
|
raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
|
60
63
|
end
|
61
64
|
|
62
|
-
def
|
63
|
-
engine
|
65
|
+
def browser_engine
|
66
|
+
case engine
|
67
|
+
when :ferrum
|
68
|
+
'Ferrum'
|
69
|
+
else
|
70
|
+
'NetHttp'
|
71
|
+
end
|
64
72
|
end
|
65
73
|
end
|
66
74
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'ferrum'
|
2
|
+
|
3
|
+
module Makuri::BrowserBuilder
|
4
|
+
class Ferrum < Base
|
5
|
+
attr_accessor :headless, :html
|
6
|
+
|
7
|
+
def initialize(options = {})
|
8
|
+
super
|
9
|
+
@headless = options.fetch(:headless, true)
|
10
|
+
end
|
11
|
+
|
12
|
+
def build
|
13
|
+
headers = { 'User-Agent': user_agent }
|
14
|
+
@ferrum = ::Ferrum::Browser.new(headless: @headless, headers: headers)
|
15
|
+
self
|
16
|
+
end
|
17
|
+
|
18
|
+
def visit(url)
|
19
|
+
@ferrum.go_to url
|
20
|
+
@html = @ferrum.body
|
21
|
+
self
|
22
|
+
end
|
23
|
+
|
24
|
+
def quit
|
25
|
+
@ferrum.quit
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# TODO:
|
31
|
+
# def browser_options
|
32
|
+
# args = %w[
|
33
|
+
# --disable-gpu
|
34
|
+
# --no-sandbox
|
35
|
+
# --disable-translate
|
36
|
+
# --ignore-certificate-errors
|
37
|
+
# ]
|
38
|
+
# args << '--headless' if headless
|
39
|
+
# args << "--user-agent=#{user_agent}"
|
40
|
+
# args << "--blink-settings=imagesEnabled=#{enable_images}"
|
41
|
+
# Selenium::WebDriver::Chrome::Options.new(args: args)
|
42
|
+
# end
|
43
|
+
end
|
44
|
+
end
|
data/lib/makuri/spider.rb
CHANGED
@@ -20,28 +20,37 @@ module Makuri
|
|
20
20
|
|
21
21
|
def spider_options(**options)
|
22
22
|
@engine = options.fetch(:engine, :net_http)
|
23
|
+
@headless = options.fetch(:headless, true)
|
23
24
|
end
|
24
25
|
|
25
26
|
def run
|
26
27
|
raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
|
27
28
|
|
28
29
|
@engine ||= :net_http
|
30
|
+
@headless = defined?(@headless) ? @headless : true
|
29
31
|
|
30
|
-
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
|
32
|
+
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine, headless: @headless).parse }
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
|
-
attr_accessor :engine, :response
|
36
|
+
attr_accessor :engine, :headless, :response
|
35
37
|
|
36
38
|
def initialize(**config)
|
37
39
|
@start_url = config.fetch(:start_url, nil)
|
38
40
|
@engine = config.fetch(:engine, :net_http)
|
41
|
+
@headless = config.fetch(:headless, true)
|
39
42
|
|
40
43
|
update_response(@start_url)
|
44
|
+
browser_quit_wrapper { parse }
|
41
45
|
end
|
42
46
|
|
43
47
|
def browser
|
44
|
-
@browser ||= Makuri::Browser.new(engine: engine)
|
48
|
+
@browser ||= Makuri::Browser.new(engine: engine, headless: headless)
|
49
|
+
end
|
50
|
+
|
51
|
+
def browser_quit_wrapper
|
52
|
+
yield
|
53
|
+
browser.quit
|
45
54
|
end
|
46
55
|
|
47
56
|
def parse
|
@@ -77,7 +86,6 @@ module Makuri
|
|
77
86
|
|
78
87
|
def update_response(url)
|
79
88
|
res = browser.request(absolute_url(url)).html
|
80
|
-
# res = res.html if @engine == :chrome
|
81
89
|
@response = Nokogiri::HTML(res)
|
82
90
|
end
|
83
91
|
end
|
data/lib/makuri/version.rb
CHANGED
data/lib/makuri.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -16,98 +16,70 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '2.
|
19
|
+
version: '2.8'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '2.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: capybara
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '3.34'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '3.34'
|
26
|
+
version: '2.8'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: nokogiri
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
31
|
- - "~>"
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.
|
33
|
+
version: 1.15.5
|
48
34
|
type: :runtime
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
38
|
- - "~>"
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.
|
40
|
+
version: 1.15.5
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
42
|
+
name: ferrum
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
58
44
|
requirements:
|
59
45
|
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '0.15'
|
62
48
|
type: :runtime
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
52
|
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '0.15'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: bundler
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
59
|
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '2.0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: webdrivers
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - "~>"
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '4.0'
|
61
|
+
version: 2.5.0
|
90
62
|
type: :development
|
91
63
|
prerelease: false
|
92
64
|
version_requirements: !ruby/object:Gem::Requirement
|
93
65
|
requirements:
|
94
66
|
- - "~>"
|
95
67
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
68
|
+
version: 2.5.0
|
97
69
|
- !ruby/object:Gem::Dependency
|
98
70
|
name: webmock
|
99
71
|
requirement: !ruby/object:Gem::Requirement
|
100
72
|
requirements:
|
101
73
|
- - "~>"
|
102
74
|
- !ruby/object:Gem::Version
|
103
|
-
version: '3.
|
75
|
+
version: '3.20'
|
104
76
|
type: :development
|
105
77
|
prerelease: false
|
106
78
|
version_requirements: !ruby/object:Gem::Requirement
|
107
79
|
requirements:
|
108
80
|
- - "~>"
|
109
81
|
- !ruby/object:Gem::Version
|
110
|
-
version: '3.
|
82
|
+
version: '3.20'
|
111
83
|
description:
|
112
84
|
email: lalusaud@gmail.com
|
113
85
|
executables: []
|
@@ -120,7 +92,7 @@ files:
|
|
120
92
|
- lib/makuri.rb
|
121
93
|
- lib/makuri/browser.rb
|
122
94
|
- lib/makuri/browser_builder/base.rb
|
123
|
-
- lib/makuri/browser_builder/
|
95
|
+
- lib/makuri/browser_builder/ferrum.rb
|
124
96
|
- lib/makuri/browser_builder/net_http.rb
|
125
97
|
- lib/makuri/spider.rb
|
126
98
|
- lib/makuri/version.rb
|
@@ -136,14 +108,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
136
108
|
requirements:
|
137
109
|
- - ">="
|
138
110
|
- !ruby/object:Gem::Version
|
139
|
-
version: '2.
|
111
|
+
version: '2.7'
|
140
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
113
|
requirements:
|
142
114
|
- - ">="
|
143
115
|
- !ruby/object:Gem::Version
|
144
116
|
version: '0'
|
145
117
|
requirements: []
|
146
|
-
rubygems_version: 3.
|
118
|
+
rubygems_version: 3.5.11
|
147
119
|
signing_key:
|
148
120
|
specification_version: 4
|
149
121
|
summary: Web-crawling framework for Ruby
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'capybara'
|
2
|
-
require 'selenium-webdriver'
|
3
|
-
|
4
|
-
module Makuri::BrowserBuilder
|
5
|
-
class Chrome < Base
|
6
|
-
attr_accessor :headless, :enable_images
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super
|
10
|
-
@headless = options.fetch(:headless, true)
|
11
|
-
@enable_images = options.fetch(:enable_images, true)
|
12
|
-
end
|
13
|
-
|
14
|
-
def build
|
15
|
-
Capybara.register_driver :selenium_chrome do |app|
|
16
|
-
Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
|
17
|
-
end
|
18
|
-
Capybara.threadsafe = true
|
19
|
-
Capybara::Session.new :selenium_chrome
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def browser_options
|
25
|
-
args = %w[
|
26
|
-
--disable-gpu
|
27
|
-
--no-sandbox
|
28
|
-
--disable-translate
|
29
|
-
--ignore-certificate-errors
|
30
|
-
]
|
31
|
-
args << '--headless' if headless
|
32
|
-
args << "--user-agent=#{user_agent}"
|
33
|
-
args << "--blink-settings=imagesEnabled=#{enable_images}"
|
34
|
-
Selenium::WebDriver::Chrome::Options.new(args: args)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|