makuri 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/lib/makuri/browser.rb +27 -25
- data/lib/makuri/browser_builder/chrome.rb +7 -4
- data/lib/makuri/browser_builder/net_http.rb +8 -1
- data/lib/makuri/spider.rb +21 -9
- data/lib/makuri/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
|
4
|
+
data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
|
7
|
+
data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
|
data/CHANGELOG.md
CHANGED
@@ -1,10 +1,15 @@
|
|
1
|
+
## 0.3.0 (2021-02-07)
|
2
|
+
- Allow scrape from multiple start_urls
|
3
|
+
- Add :engine support to Spider
|
4
|
+
- Add :enable_images support to :chrome engine
|
5
|
+
- Bug fixes and restructure #request_method & #request_body
|
6
|
+
|
1
7
|
## 0.2.0 (2021-01-25)
|
2
8
|
- Add Capybara Support
|
3
9
|
- Add :headless option to Browser object
|
4
10
|
- Bug fixes & restructure Browser #follow, #request methods
|
5
11
|
|
6
12
|
## 0.1.0 (2021-01-05)
|
7
|
-
|
8
13
|
- First Release
|
9
14
|
- Add Basic Browser Builders for crawling web pages
|
10
15
|
- Add Spider Framework to support scraping
|
data/lib/makuri/browser.rb
CHANGED
@@ -1,50 +1,47 @@
|
|
1
1
|
module Makuri
|
2
2
|
class Browser
|
3
|
-
attr_accessor :url, :
|
3
|
+
attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
|
-
@
|
7
|
-
@request_method = options.fetch(:request_method, :get)
|
8
|
-
@request_body = options.fetch(:request_body, {})
|
9
|
-
|
10
|
-
@js = options.fetch(:js, false)
|
6
|
+
@engine = options.fetch(:engine, :net_http)
|
11
7
|
@headless = options.fetch(:headless, true)
|
12
8
|
@user_agent = options.fetch(
|
13
9
|
:user_agent,
|
14
10
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
|
15
11
|
)
|
12
|
+
|
13
|
+
# Defaults
|
14
|
+
@request_method = :get
|
15
|
+
@request_body = {}
|
16
16
|
end
|
17
17
|
|
18
18
|
def browser
|
19
19
|
@browser ||= create_browser
|
20
20
|
end
|
21
21
|
|
22
|
-
#
|
23
|
-
def
|
24
|
-
@url =
|
25
|
-
|
22
|
+
# Allow for NetHttp get request
|
23
|
+
def request(url, options = {})
|
24
|
+
@url = url
|
25
|
+
validate_url
|
26
26
|
|
27
|
-
|
27
|
+
return follow if engine_chrome?
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
@request_method = options.fetch(:method, request_method)
|
30
|
+
@request_body = options.fetch(:body, request_body)
|
31
|
+
browser.visit(@url)
|
31
32
|
end
|
32
33
|
|
33
|
-
#
|
34
|
-
def
|
35
|
-
@url
|
36
|
-
|
37
|
-
|
38
|
-
raise 'Invalid request type for js=true. Use #follow in place of #get' if js
|
39
|
-
|
40
|
-
browser.visit(url).body
|
34
|
+
# Only allow for capybara get request
|
35
|
+
def follow
|
36
|
+
browser.visit(@url)
|
37
|
+
browser
|
41
38
|
end
|
42
39
|
|
43
40
|
private
|
44
41
|
|
45
42
|
def create_browser
|
46
|
-
|
47
|
-
builder
|
43
|
+
browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
|
44
|
+
builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
|
48
45
|
builder.new(browser_params).build
|
49
46
|
end
|
50
47
|
|
@@ -57,8 +54,13 @@ module Makuri
|
|
57
54
|
}
|
58
55
|
end
|
59
56
|
|
60
|
-
def
|
61
|
-
|
57
|
+
def validate_url
|
58
|
+
uri = URI.parse(@url)
|
59
|
+
raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
|
60
|
+
end
|
61
|
+
|
62
|
+
def engine_chrome?
|
63
|
+
engine == :chrome
|
62
64
|
end
|
63
65
|
end
|
64
66
|
end
|
@@ -3,9 +3,12 @@ require 'selenium-webdriver'
|
|
3
3
|
|
4
4
|
module Makuri::BrowserBuilder
|
5
5
|
class Chrome < Base
|
6
|
+
attr_accessor :headless, :enable_images
|
7
|
+
|
6
8
|
def initialize(options = {})
|
7
9
|
super
|
8
|
-
@headless
|
10
|
+
@headless = options.fetch(:headless, true)
|
11
|
+
@enable_images = options.fetch(:enable_images, true)
|
9
12
|
end
|
10
13
|
|
11
14
|
def build
|
@@ -23,11 +26,11 @@ module Makuri::BrowserBuilder
|
|
23
26
|
--disable-gpu
|
24
27
|
--no-sandbox
|
25
28
|
--disable-translate
|
26
|
-
--blink-settings=imagesEnabled=false
|
27
29
|
--ignore-certificate-errors
|
28
30
|
]
|
29
|
-
args << '--headless' if
|
30
|
-
args << "--user-agent=#{
|
31
|
+
args << '--headless' if headless
|
32
|
+
args << "--user-agent=#{user_agent}"
|
33
|
+
args << "--blink-settings=imagesEnabled=#{enable_images}"
|
31
34
|
Selenium::WebDriver::Chrome::Options.new(args: args)
|
32
35
|
end
|
33
36
|
end
|
@@ -2,6 +2,8 @@ require 'net/http'
|
|
2
2
|
|
3
3
|
module Makuri::BrowserBuilder
|
4
4
|
class NetHttp < Base
|
5
|
+
attr_accessor :response
|
6
|
+
|
5
7
|
def build
|
6
8
|
self
|
7
9
|
end
|
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
|
|
12
14
|
|
13
15
|
request = send("#{request_method}_request", uri.request_uri, headers)
|
14
16
|
|
15
|
-
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
17
|
+
@response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
16
18
|
http.request(request)
|
17
19
|
end
|
20
|
+
self
|
21
|
+
end
|
22
|
+
|
23
|
+
def html
|
24
|
+
@response.body
|
18
25
|
end
|
19
26
|
|
20
27
|
private
|
data/lib/makuri/spider.rb
CHANGED
@@ -18,23 +18,30 @@ module Makuri
|
|
18
18
|
@start_urls = urls
|
19
19
|
end
|
20
20
|
|
21
|
+
def spider_options(**options)
|
22
|
+
@engine = options.fetch(:engine, :net_http)
|
23
|
+
end
|
24
|
+
|
21
25
|
def run
|
22
26
|
raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
|
23
27
|
|
24
|
-
|
28
|
+
@engine ||= :net_http
|
29
|
+
|
30
|
+
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
|
25
31
|
end
|
26
32
|
end
|
27
33
|
|
28
|
-
attr_accessor :
|
34
|
+
attr_accessor :engine, :response
|
29
35
|
|
30
|
-
def initialize(
|
31
|
-
@
|
36
|
+
def initialize(**config)
|
37
|
+
@start_url = config.fetch(:start_url, nil)
|
38
|
+
@engine = config.fetch(:engine, :net_http)
|
32
39
|
|
33
|
-
update_response(@
|
40
|
+
update_response(@start_url)
|
34
41
|
end
|
35
42
|
|
36
43
|
def browser
|
37
|
-
@browser ||= Makuri::Browser.new
|
44
|
+
@browser ||= Makuri::Browser.new(engine: engine)
|
38
45
|
end
|
39
46
|
|
40
47
|
def parse
|
@@ -55,18 +62,23 @@ module Makuri
|
|
55
62
|
end
|
56
63
|
|
57
64
|
def absolute_url(relative_url)
|
58
|
-
Addressable::URI.join(
|
65
|
+
Addressable::URI.join(base_url, relative_url).to_s
|
59
66
|
end
|
60
67
|
|
61
68
|
private
|
62
69
|
|
70
|
+
def base_url
|
71
|
+
@start_url || browser.url.to_s
|
72
|
+
end
|
73
|
+
|
63
74
|
def valid_url?(url)
|
64
75
|
defined?(url) && !url.to_s.empty?
|
65
76
|
end
|
66
77
|
|
67
78
|
def update_response(url)
|
68
|
-
|
69
|
-
|
79
|
+
res = browser.request(absolute_url(url)).html
|
80
|
+
# res = res.html if @engine == :chrome
|
81
|
+
@response = Nokogiri::HTML(res)
|
70
82
|
end
|
71
83
|
end
|
72
84
|
end
|
data/lib/makuri/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '2.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: webdrivers
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '4.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '4.0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: webmock
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|