makuri 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/lib/makuri/browser.rb +27 -25
- data/lib/makuri/browser_builder/chrome.rb +7 -4
- data/lib/makuri/browser_builder/net_http.rb +8 -1
- data/lib/makuri/spider.rb +21 -9
- data/lib/makuri/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
|
4
|
+
data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
|
7
|
+
data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
|
data/CHANGELOG.md
CHANGED
@@ -1,10 +1,15 @@
|
|
1
|
+
## 0.3.0 (2021-02-07)
|
2
|
+
- Allow scraping from multiple start_urls
|
3
|
+
- Add :engine support to Spider
|
4
|
+
- Add :enable_images support to :chrome engine
|
5
|
+
- Bug fixes and restructuring of #request_method & #request_body
|
6
|
+
|
1
7
|
## 0.2.0 (2021-01-25)
|
2
8
|
- Add Capybara Support
|
3
9
|
- Add :headless option to Browser object
|
4
10
|
- Bug fixes & restructuring of Browser #follow, #request methods
|
5
11
|
|
6
12
|
## 0.1.0 (2021-01-05)
|
7
|
-
|
8
13
|
- First Release
|
9
14
|
- Add Basic Browser Builders for crawling web pages
|
10
15
|
- Add Spider Framework to support scraping
|
data/lib/makuri/browser.rb
CHANGED
@@ -1,50 +1,47 @@
|
|
1
1
|
module Makuri
|
2
2
|
class Browser
|
3
|
-
attr_accessor :url, :
|
3
|
+
attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
|
-
@
|
7
|
-
@request_method = options.fetch(:request_method, :get)
|
8
|
-
@request_body = options.fetch(:request_body, {})
|
9
|
-
|
10
|
-
@js = options.fetch(:js, false)
|
6
|
+
@engine = options.fetch(:engine, :net_http)
|
11
7
|
@headless = options.fetch(:headless, true)
|
12
8
|
@user_agent = options.fetch(
|
13
9
|
:user_agent,
|
14
10
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
|
15
11
|
)
|
12
|
+
|
13
|
+
# Defaults
|
14
|
+
@request_method = :get
|
15
|
+
@request_body = {}
|
16
16
|
end
|
17
17
|
|
18
18
|
def browser
|
19
19
|
@browser ||= create_browser
|
20
20
|
end
|
21
21
|
|
22
|
-
#
|
23
|
-
def
|
24
|
-
@url =
|
25
|
-
|
22
|
+
# Allow for NetHttp get request
|
23
|
+
def request(url, options = {})
|
24
|
+
@url = url
|
25
|
+
validate_url
|
26
26
|
|
27
|
-
|
27
|
+
return follow if engine_chrome?
|
28
28
|
|
29
|
-
|
30
|
-
|
29
|
+
@request_method = options.fetch(:method, request_method)
|
30
|
+
@request_body = options.fetch(:body, request_body)
|
31
|
+
browser.visit(@url)
|
31
32
|
end
|
32
33
|
|
33
|
-
#
|
34
|
-
def
|
35
|
-
@url
|
36
|
-
|
37
|
-
|
38
|
-
raise 'Invalid request type for js=true. Use #follow in place of #get' if js
|
39
|
-
|
40
|
-
browser.visit(url).body
|
34
|
+
# Only allow for capybara get request
|
35
|
+
def follow
|
36
|
+
browser.visit(@url)
|
37
|
+
browser
|
41
38
|
end
|
42
39
|
|
43
40
|
private
|
44
41
|
|
45
42
|
def create_browser
|
46
|
-
|
47
|
-
builder
|
43
|
+
browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
|
44
|
+
builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
|
48
45
|
builder.new(browser_params).build
|
49
46
|
end
|
50
47
|
|
@@ -57,8 +54,13 @@ module Makuri
|
|
57
54
|
}
|
58
55
|
end
|
59
56
|
|
60
|
-
def
|
61
|
-
|
57
|
+
def validate_url
|
58
|
+
uri = URI.parse(@url)
|
59
|
+
raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
|
60
|
+
end
|
61
|
+
|
62
|
+
def engine_chrome?
|
63
|
+
engine == :chrome
|
62
64
|
end
|
63
65
|
end
|
64
66
|
end
|
@@ -3,9 +3,12 @@ require 'selenium-webdriver'
|
|
3
3
|
|
4
4
|
module Makuri::BrowserBuilder
|
5
5
|
class Chrome < Base
|
6
|
+
attr_accessor :headless, :enable_images
|
7
|
+
|
6
8
|
def initialize(options = {})
|
7
9
|
super
|
8
|
-
@headless
|
10
|
+
@headless = options.fetch(:headless, true)
|
11
|
+
@enable_images = options.fetch(:enable_images, true)
|
9
12
|
end
|
10
13
|
|
11
14
|
def build
|
@@ -23,11 +26,11 @@ module Makuri::BrowserBuilder
|
|
23
26
|
--disable-gpu
|
24
27
|
--no-sandbox
|
25
28
|
--disable-translate
|
26
|
-
--blink-settings=imagesEnabled=false
|
27
29
|
--ignore-certificate-errors
|
28
30
|
]
|
29
|
-
args << '--headless' if
|
30
|
-
args << "--user-agent=#{
|
31
|
+
args << '--headless' if headless
|
32
|
+
args << "--user-agent=#{user_agent}"
|
33
|
+
args << "--blink-settings=imagesEnabled=#{enable_images}"
|
31
34
|
Selenium::WebDriver::Chrome::Options.new(args: args)
|
32
35
|
end
|
33
36
|
end
|
@@ -2,6 +2,8 @@ require 'net/http'
|
|
2
2
|
|
3
3
|
module Makuri::BrowserBuilder
|
4
4
|
class NetHttp < Base
|
5
|
+
attr_accessor :response
|
6
|
+
|
5
7
|
def build
|
6
8
|
self
|
7
9
|
end
|
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
|
|
12
14
|
|
13
15
|
request = send("#{request_method}_request", uri.request_uri, headers)
|
14
16
|
|
15
|
-
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
17
|
+
@response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
16
18
|
http.request(request)
|
17
19
|
end
|
20
|
+
self
|
21
|
+
end
|
22
|
+
|
23
|
+
def html
|
24
|
+
@response.body
|
18
25
|
end
|
19
26
|
|
20
27
|
private
|
data/lib/makuri/spider.rb
CHANGED
@@ -18,23 +18,30 @@ module Makuri
|
|
18
18
|
@start_urls = urls
|
19
19
|
end
|
20
20
|
|
21
|
+
def spider_options(**options)
|
22
|
+
@engine = options.fetch(:engine, :net_http)
|
23
|
+
end
|
24
|
+
|
21
25
|
def run
|
22
26
|
raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
|
23
27
|
|
24
|
-
|
28
|
+
@engine ||= :net_http
|
29
|
+
|
30
|
+
@start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
|
25
31
|
end
|
26
32
|
end
|
27
33
|
|
28
|
-
attr_accessor :
|
34
|
+
attr_accessor :engine, :response
|
29
35
|
|
30
|
-
def initialize(
|
31
|
-
@
|
36
|
+
def initialize(**config)
|
37
|
+
@start_url = config.fetch(:start_url, nil)
|
38
|
+
@engine = config.fetch(:engine, :net_http)
|
32
39
|
|
33
|
-
update_response(@
|
40
|
+
update_response(@start_url)
|
34
41
|
end
|
35
42
|
|
36
43
|
def browser
|
37
|
-
@browser ||= Makuri::Browser.new
|
44
|
+
@browser ||= Makuri::Browser.new(engine: engine)
|
38
45
|
end
|
39
46
|
|
40
47
|
def parse
|
@@ -55,18 +62,23 @@ module Makuri
|
|
55
62
|
end
|
56
63
|
|
57
64
|
def absolute_url(relative_url)
|
58
|
-
Addressable::URI.join(
|
65
|
+
Addressable::URI.join(base_url, relative_url).to_s
|
59
66
|
end
|
60
67
|
|
61
68
|
private
|
62
69
|
|
70
|
+
def base_url
|
71
|
+
@start_url || browser.url.to_s
|
72
|
+
end
|
73
|
+
|
63
74
|
def valid_url?(url)
|
64
75
|
defined?(url) && !url.to_s.empty?
|
65
76
|
end
|
66
77
|
|
67
78
|
def update_response(url)
|
68
|
-
|
69
|
-
|
79
|
+
res = browser.request(absolute_url(url)).html
|
80
|
+
# res = res.html if @engine == :chrome
|
81
|
+
@response = Nokogiri::HTML(res)
|
70
82
|
end
|
71
83
|
end
|
72
84
|
end
|
data/lib/makuri/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-02-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '2.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: webdrivers
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '4.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '4.0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: webmock
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|