makuri 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -2
- data/README.md +21 -6
- data/lib/makuri/browser.rb +33 -10
- data/lib/makuri/browser_builder/base.rb +8 -4
- data/lib/makuri/browser_builder/chrome.rb +18 -12
- data/lib/makuri/browser_builder/net_http.rb +5 -1
- data/lib/makuri/spider.rb +2 -2
- data/lib/makuri/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
|
4
|
+
data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
|
7
|
+
data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
-
## 0.
|
1
|
+
## 0.2.0 (2021-01-25)
|
2
|
+
- Add Capybara Support
|
3
|
+
- Add :headless option to Browser object
|
4
|
+
- Bug fixes & restructure Browser #follow, #request methods
|
2
5
|
|
3
|
-
|
6
|
+
## 0.1.0 (2021-01-05)
|
7
|
+
|
8
|
+
- First Release
|
9
|
+
- Add Basic Browser Builders for crawling web pages
|
10
|
+
- Add Spider Framework to support scraping
|
data/README.md
CHANGED
@@ -2,6 +2,21 @@
|
|
2
2
|
|
3
3
|
Makuri is a Web-crawling framework for Ruby.
|
4
4
|
|
5
|
+
# Install
|
6
|
+
|
7
|
+
Add this to your application's Gemfile
|
8
|
+
```ruby
|
9
|
+
gem 'makuri'
|
10
|
+
```
|
11
|
+
And execute
|
12
|
+
```sh
|
13
|
+
$ bundle
|
14
|
+
```
|
15
|
+
Or install it as:
|
16
|
+
```sh
|
17
|
+
$ gem install makuri
|
18
|
+
```
|
19
|
+
|
5
20
|
# Usage
|
6
21
|
In this example, we are going to crawl the [quotes website](https://quotes.toscrape.com) and scrape data as:
|
7
22
|
```ruby
|
@@ -32,8 +47,8 @@ end
|
|
32
47
|
QuotesSpider.run
|
33
48
|
```
|
34
49
|
Now save the file to ```quotes_spider.rb``` file and run it as:
|
35
|
-
```
|
36
|
-
|
50
|
+
```sh
|
51
|
+
$ ruby quotes_spider.rb > quotes.json
|
37
52
|
```
|
38
53
|
When it's done, you will find all the quotes saved to ```quotes.json``` file. It's that easy.
|
39
54
|
|
@@ -53,8 +68,8 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
|
|
53
68
|
To get started with development:
|
54
69
|
|
55
70
|
```sh
|
56
|
-
git clone https://github.com/lalusaud/makuri.git
|
57
|
-
cd
|
58
|
-
bundle install
|
59
|
-
bundle exec rake test
|
71
|
+
$ git clone https://github.com/lalusaud/makuri.git
|
72
|
+
$ cd makuri
|
73
|
+
$ bundle install
|
74
|
+
$ bundle exec rake test
|
60
75
|
```
|
data/lib/makuri/browser.rb
CHANGED
@@ -1,41 +1,64 @@
|
|
1
1
|
module Makuri
|
2
2
|
class Browser
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
|
-
@
|
6
|
+
@url = options.fetch(:url, '')
|
7
7
|
@request_method = options.fetch(:request_method, :get)
|
8
8
|
@request_body = options.fetch(:request_body, {})
|
9
9
|
|
10
|
-
@
|
10
|
+
@js = options.fetch(:js, false)
|
11
|
+
@headless = options.fetch(:headless, true)
|
11
12
|
@user_agent = options.fetch(
|
12
13
|
:user_agent,
|
13
14
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
|
14
15
|
)
|
15
16
|
end
|
16
17
|
|
17
|
-
def
|
18
|
-
@
|
19
|
-
|
18
|
+
def browser
|
19
|
+
@browser ||= create_browser
|
20
|
+
end
|
21
|
+
|
22
|
+
# Only allow for capybara get request
|
23
|
+
def follow(current_url = '')
|
24
|
+
@url = current_url unless current_url.empty?
|
25
|
+
raise 'Invalid URL supplied' if url.empty?
|
26
|
+
|
27
|
+
raise invalid_request_without_get if request_method != :get
|
20
28
|
|
21
|
-
|
22
|
-
|
29
|
+
browser.visit(url)
|
30
|
+
browser
|
31
|
+
end
|
32
|
+
|
33
|
+
# Allow for NetHttp get request
|
34
|
+
def request(current_url = '')
|
35
|
+
@url = current_url unless current_url.empty?
|
36
|
+
raise 'Invalid URL supplied' if url.empty?
|
37
|
+
|
38
|
+
raise 'Invalid request type for js=true. Use #follow in place of #get' if js
|
39
|
+
|
40
|
+
browser.visit(url).body
|
23
41
|
end
|
24
42
|
|
25
43
|
private
|
26
44
|
|
27
45
|
def create_browser
|
28
|
-
engine =
|
46
|
+
engine = js ? 'Chrome' : 'NetHttp'
|
29
47
|
builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
|
30
|
-
builder.new
|
48
|
+
builder.new(browser_params).build
|
31
49
|
end
|
32
50
|
|
33
51
|
def browser_params
|
34
52
|
{
|
35
53
|
user_agent: user_agent,
|
54
|
+
headless: headless,
|
36
55
|
request_method: request_method,
|
37
56
|
request_body: request_body
|
38
57
|
}
|
39
58
|
end
|
59
|
+
|
60
|
+
def invalid_request_without_get
|
61
|
+
"#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
|
62
|
+
end
|
40
63
|
end
|
41
64
|
end
|
@@ -2,10 +2,14 @@ module Makuri::BrowserBuilder
|
|
2
2
|
class Base
|
3
3
|
attr_accessor :user_agent, :request_method, :request_body
|
4
4
|
|
5
|
-
def initialize(options)
|
6
|
-
@user_agent = options.fetch(:user_agent)
|
7
|
-
@request_method = options.fetch(:request_method)
|
8
|
-
@request_body = options.fetch(:request_body)
|
5
|
+
def initialize(options = {})
|
6
|
+
@user_agent = options.fetch(:user_agent, '')
|
7
|
+
@request_method = options.fetch(:request_method, :get)
|
8
|
+
@request_body = options.fetch(:request_body, '')
|
9
|
+
end
|
10
|
+
|
11
|
+
def build
|
12
|
+
raise NotImplementedError
|
9
13
|
end
|
10
14
|
end
|
11
15
|
end
|
@@ -1,28 +1,34 @@
|
|
1
|
+
require 'capybara'
|
1
2
|
require 'selenium-webdriver'
|
2
3
|
|
3
4
|
module Makuri::BrowserBuilder
|
4
5
|
class Chrome < Base
|
5
|
-
def
|
6
|
-
|
6
|
+
def initialize(options = {})
|
7
|
+
super
|
8
|
+
@headless = options.fetch(:headless, true)
|
9
|
+
end
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
+
def build
|
12
|
+
Capybara.register_driver :selenium_chrome do |app|
|
13
|
+
Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
|
14
|
+
end
|
15
|
+
Capybara.threadsafe = true
|
16
|
+
Capybara::Session.new :selenium_chrome
|
11
17
|
end
|
12
18
|
|
13
19
|
private
|
14
20
|
|
15
21
|
def browser_options
|
16
22
|
args = %w[
|
17
|
-
--
|
18
|
-
--
|
23
|
+
--disable-gpu
|
24
|
+
--no-sandbox
|
25
|
+
--disable-translate
|
26
|
+
--blink-settings=imagesEnabled=false
|
27
|
+
--ignore-certificate-errors
|
19
28
|
]
|
20
|
-
args <<
|
29
|
+
args << '--headless' if @headless
|
30
|
+
args << "--user-agent=#{@user_agent}"
|
21
31
|
Selenium::WebDriver::Chrome::Options.new(args: args)
|
22
32
|
end
|
23
|
-
|
24
|
-
def invalid_request_message
|
25
|
-
"#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'render_js=true' argument!"
|
26
|
-
end
|
27
33
|
end
|
28
34
|
end
|
@@ -2,6 +2,10 @@ require 'net/http'
|
|
2
2
|
|
3
3
|
module Makuri::BrowserBuilder
|
4
4
|
class NetHttp < Base
|
5
|
+
def build
|
6
|
+
self
|
7
|
+
end
|
8
|
+
|
5
9
|
def visit(url)
|
6
10
|
uri = URI(url)
|
7
11
|
headers = { 'User-Agent': user_agent }
|
@@ -10,7 +14,7 @@ module Makuri::BrowserBuilder
|
|
10
14
|
|
11
15
|
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
12
16
|
http.request(request)
|
13
|
-
end
|
17
|
+
end
|
14
18
|
end
|
15
19
|
|
16
20
|
private
|
data/lib/makuri/spider.rb
CHANGED
@@ -55,7 +55,7 @@ module Makuri
|
|
55
55
|
end
|
56
56
|
|
57
57
|
def absolute_url(relative_url)
|
58
|
-
Addressable::URI.join(browser.
|
58
|
+
Addressable::URI.join(browser.url, relative_url).to_s
|
59
59
|
end
|
60
60
|
|
61
61
|
private
|
@@ -65,7 +65,7 @@ module Makuri
|
|
65
65
|
end
|
66
66
|
|
67
67
|
def update_response(url)
|
68
|
-
html = browser.
|
68
|
+
html = browser.request absolute_url(url)
|
69
69
|
@response = Nokogiri::HTML(html)
|
70
70
|
end
|
71
71
|
end
|
data/lib/makuri/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-01-
|
11
|
+
date: 2021-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: capybara
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.34'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.34'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: nokogiri
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,14 +58,14 @@ dependencies:
|
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '3.
|
61
|
+
version: '3.5'
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '3.
|
68
|
+
version: '3.5'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: bundler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|