makuri 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -2
- data/README.md +21 -6
- data/lib/makuri/browser.rb +33 -10
- data/lib/makuri/browser_builder/base.rb +8 -4
- data/lib/makuri/browser_builder/chrome.rb +18 -12
- data/lib/makuri/browser_builder/net_http.rb +5 -1
- data/lib/makuri/spider.rb +2 -2
- data/lib/makuri/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
|
4
|
+
data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
|
7
|
+
data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
-
## 0.
|
1
|
+
## 0.2.0 (2021-01-25)
|
2
|
+
- Add Capybara Support
|
3
|
+
- Add :headless option to Browser object
|
4
|
+
- Bug fixes & restructure Browser #follow, #request methods
|
2
5
|
|
3
|
-
|
6
|
+
## 0.1.0 (2021-01-05)
|
7
|
+
|
8
|
+
- First Release
|
9
|
+
- Add Basic Browser Builders for crawling web pages
|
10
|
+
- Add Spider Framework to support scraping
|
data/README.md
CHANGED
@@ -2,6 +2,21 @@
|
|
2
2
|
|
3
3
|
Makuri is a Web-crawling framework for Ruby.
|
4
4
|
|
5
|
+
# Install
|
6
|
+
|
7
|
+
Add this to your application's Gemfile:
|
8
|
+
```ruby
|
9
|
+
gem 'makuri'
|
10
|
+
```
|
11
|
+
And execute:
|
12
|
+
```sh
|
13
|
+
$ bundle
|
14
|
+
```
|
15
|
+
Or install it yourself:
|
16
|
+
```sh
|
17
|
+
$ gem install makuri
|
18
|
+
```
|
19
|
+
|
5
20
|
# Usage
|
6
21
|
In this example, we are going to crawl the [quotes website](https://quotes.toscrape.com) and scrape data as:
|
7
22
|
```ruby
|
@@ -32,8 +47,8 @@ end
|
|
32
47
|
QuotesSpider.run
|
33
48
|
```
|
34
49
|
Now save the file as ```quotes_spider.rb``` and run it:
|
35
|
-
```
|
36
|
-
|
50
|
+
```sh
|
51
|
+
$ ruby quotes_spider.rb > quotes.json
|
37
52
|
```
|
38
53
|
When it's done, you will find all the quotes saved to the ```quotes.json``` file. It's that easy.
|
39
54
|
|
@@ -53,8 +68,8 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
|
|
53
68
|
To get started with development:
|
54
69
|
|
55
70
|
```sh
|
56
|
-
git clone https://github.com/lalusaud/makuri.git
|
57
|
-
cd
|
58
|
-
bundle install
|
59
|
-
bundle exec rake test
|
71
|
+
$ git clone https://github.com/lalusaud/makuri.git
|
72
|
+
$ cd makuri
|
73
|
+
$ bundle install
|
74
|
+
$ bundle exec rake test
|
60
75
|
```
|
data/lib/makuri/browser.rb
CHANGED
@@ -1,41 +1,64 @@
|
|
1
1
|
module Makuri
|
2
2
|
class Browser
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
|
4
4
|
|
5
5
|
def initialize(options = {})
|
6
|
-
@
|
6
|
+
@url = options.fetch(:url, '')
|
7
7
|
@request_method = options.fetch(:request_method, :get)
|
8
8
|
@request_body = options.fetch(:request_body, {})
|
9
9
|
|
10
|
-
@
|
10
|
+
@js = options.fetch(:js, false)
|
11
|
+
@headless = options.fetch(:headless, true)
|
11
12
|
@user_agent = options.fetch(
|
12
13
|
:user_agent,
|
13
14
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
|
14
15
|
)
|
15
16
|
end
|
16
17
|
|
17
|
-
def
|
18
|
-
@
|
19
|
-
|
18
|
+
def browser
|
19
|
+
@browser ||= create_browser
|
20
|
+
end
|
21
|
+
|
22
|
+
# Visit a page through the browser session; only GET requests are allowed here
|
23
|
+
def follow(current_url = '')
|
24
|
+
@url = current_url unless current_url.empty?
|
25
|
+
raise 'Invalid URL supplied' if url.empty?
|
26
|
+
|
27
|
+
raise invalid_request_without_get if request_method != :get
|
20
28
|
|
21
|
-
|
22
|
-
|
29
|
+
browser.visit(url)
|
30
|
+
browser
|
31
|
+
end
|
32
|
+
|
33
|
+
# Allow for NetHttp get request
|
34
|
+
def request(current_url = '')
|
35
|
+
@url = current_url unless current_url.empty?
|
36
|
+
raise 'Invalid URL supplied' if url.empty?
|
37
|
+
|
38
|
+
raise 'Invalid request type for js=true. Use #follow in place of #get' if js
|
39
|
+
|
40
|
+
browser.visit(url).body
|
23
41
|
end
|
24
42
|
|
25
43
|
private
|
26
44
|
|
27
45
|
def create_browser
|
28
|
-
engine =
|
46
|
+
engine = js ? 'Chrome' : 'NetHttp'
|
29
47
|
builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
|
30
|
-
builder.new
|
48
|
+
builder.new(browser_params).build
|
31
49
|
end
|
32
50
|
|
33
51
|
def browser_params
|
34
52
|
{
|
35
53
|
user_agent: user_agent,
|
54
|
+
headless: headless,
|
36
55
|
request_method: request_method,
|
37
56
|
request_body: request_body
|
38
57
|
}
|
39
58
|
end
|
59
|
+
|
60
|
+
def invalid_request_without_get
|
61
|
+
"#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
|
62
|
+
end
|
40
63
|
end
|
41
64
|
end
|
@@ -2,10 +2,14 @@ module Makuri::BrowserBuilder
|
|
2
2
|
class Base
|
3
3
|
attr_accessor :user_agent, :request_method, :request_body
|
4
4
|
|
5
|
-
def initialize(options)
|
6
|
-
@user_agent = options.fetch(:user_agent)
|
7
|
-
@request_method = options.fetch(:request_method)
|
8
|
-
@request_body = options.fetch(:request_body)
|
5
|
+
def initialize(options = {})
|
6
|
+
@user_agent = options.fetch(:user_agent, '')
|
7
|
+
@request_method = options.fetch(:request_method, :get)
|
8
|
+
@request_body = options.fetch(:request_body, '')
|
9
|
+
end
|
10
|
+
|
11
|
+
def build
|
12
|
+
raise NotImplementedError
|
9
13
|
end
|
10
14
|
end
|
11
15
|
end
|
@@ -1,28 +1,34 @@
|
|
1
|
+
require 'capybara'
|
1
2
|
require 'selenium-webdriver'
|
2
3
|
|
3
4
|
module Makuri::BrowserBuilder
|
4
5
|
class Chrome < Base
|
5
|
-
def
|
6
|
-
|
6
|
+
def initialize(options = {})
|
7
|
+
super
|
8
|
+
@headless = options.fetch(:headless, true)
|
9
|
+
end
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
+
def build
|
12
|
+
Capybara.register_driver :selenium_chrome do |app|
|
13
|
+
Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
|
14
|
+
end
|
15
|
+
Capybara.threadsafe = true
|
16
|
+
Capybara::Session.new :selenium_chrome
|
11
17
|
end
|
12
18
|
|
13
19
|
private
|
14
20
|
|
15
21
|
def browser_options
|
16
22
|
args = %w[
|
17
|
-
--
|
18
|
-
--
|
23
|
+
--disable-gpu
|
24
|
+
--no-sandbox
|
25
|
+
--disable-translate
|
26
|
+
--blink-settings=imagesEnabled=false
|
27
|
+
--ignore-certificate-errors
|
19
28
|
]
|
20
|
-
args <<
|
29
|
+
args << '--headless' if @headless
|
30
|
+
args << "--user-agent=#{@user_agent}"
|
21
31
|
Selenium::WebDriver::Chrome::Options.new(args: args)
|
22
32
|
end
|
23
|
-
|
24
|
-
def invalid_request_message
|
25
|
-
"#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'render_js=true' argument!"
|
26
|
-
end
|
27
33
|
end
|
28
34
|
end
|
@@ -2,6 +2,10 @@ require 'net/http'
|
|
2
2
|
|
3
3
|
module Makuri::BrowserBuilder
|
4
4
|
class NetHttp < Base
|
5
|
+
def build
|
6
|
+
self
|
7
|
+
end
|
8
|
+
|
5
9
|
def visit(url)
|
6
10
|
uri = URI(url)
|
7
11
|
headers = { 'User-Agent': user_agent }
|
@@ -10,7 +14,7 @@ module Makuri::BrowserBuilder
|
|
10
14
|
|
11
15
|
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
|
12
16
|
http.request(request)
|
13
|
-
end
|
17
|
+
end
|
14
18
|
end
|
15
19
|
|
16
20
|
private
|
data/lib/makuri/spider.rb
CHANGED
@@ -55,7 +55,7 @@ module Makuri
|
|
55
55
|
end
|
56
56
|
|
57
57
|
def absolute_url(relative_url)
|
58
|
-
Addressable::URI.join(browser.
|
58
|
+
Addressable::URI.join(browser.url, relative_url).to_s
|
59
59
|
end
|
60
60
|
|
61
61
|
private
|
@@ -65,7 +65,7 @@ module Makuri
|
|
65
65
|
end
|
66
66
|
|
67
67
|
def update_response(url)
|
68
|
-
html = browser.
|
68
|
+
html = browser.request absolute_url(url)
|
69
69
|
@response = Nokogiri::HTML(html)
|
70
70
|
end
|
71
71
|
end
|
data/lib/makuri/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: makuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lal Saud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-01-
|
11
|
+
date: 2021-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: capybara
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.34'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.34'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: nokogiri
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,14 +58,14 @@ dependencies:
|
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '3.
|
61
|
+
version: '3.5'
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '3.
|
68
|
+
version: '3.5'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: bundler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|