makuri 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a5ecdfca240407bdcc2554114a82c0f7e66790fed4097d4b5b9cb654c603bbc
4
- data.tar.gz: bfc53dea621f432bd764bdb371e48a8da6aceeb2a3b1c364fa12634f87e5a982
3
+ metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
+ data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
5
5
  SHA512:
6
- metadata.gz: 98763e884eeb48faf3febfb6f2b2872cb59ec4dbe4be31e5dd45c5b1b8a228b317ad5e15b911ccb119d18494eb1e6be7e76864fe490e483ab9d4aba1b12704bb
7
- data.tar.gz: cd8dc00f1ec717983f8ee715ba891a86af53eaebec85333c1b0248096860cf899df1c4ce9b371ca56394c5873a0c00b485e950f6f29ec782b1cc59b741d2c95f
6
+ metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
+ data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
@@ -1,3 +1,10 @@
1
- ## 0.1.0 (2020-12-31)
1
+ ## 0.2.0 (2021-01-25)
2
+ - Add Capybara Support
3
+ - Add :headless option to Browser object
4
+ - Bug fixes & restructure Browser #follow, #request methods
2
5
 
3
- - Unreleased - Work in Progress
6
+ ## 0.1.0 (2021-01-05)
7
+
8
+ - First Release
9
+ - Add Basic Browser Builders for crawling web pages
10
+ - Add Spider Framework to support scraping
data/README.md CHANGED
@@ -2,6 +2,21 @@
2
2
 
3
3
  Makuri is a Web-crawling framework for Ruby.
4
4
 
5
+ # Install
6
+
7
+ Add this line to your application's Gemfile:
8
+ ```ruby
9
+ gem 'makuri'
10
+ ```
11
+ And then execute:
12
+ ```sh
13
+ $ bundle
14
+ ```
15
+ Or install it yourself with:
16
+ ```sh
17
+ $ gem install makuri
18
+ ```
19
+
5
20
  # Usage
6
21
  In this example, we are going to crawl the [quotes website](https://quotes.toscrape.com) and scrape its data as follows:
7
22
  ```ruby
@@ -32,8 +47,8 @@ end
32
47
  QuotesSpider.run
33
48
  ```
34
49
  Now save the code to a file named ```quotes_spider.rb``` and run it as:
35
- ```
36
- $> ruby quotes_spider.rb > quotes.json
50
+ ```sh
51
+ $ ruby quotes_spider.rb > quotes.json
37
52
  ```
38
53
  When it's done, you will find all the quotes saved to the ```quotes.json``` file. It's that easy.
39
54
 
@@ -53,8 +68,8 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
53
68
  To get started with development:
54
69
 
55
70
  ```sh
56
- git clone https://github.com/lalusaud/makuri.git
57
- cd rover
58
- bundle install
59
- bundle exec rake test
71
+ $ git clone https://github.com/lalusaud/makuri.git
72
+ $ cd makuri
73
+ $ bundle install
74
+ $ bundle exec rake test
60
75
  ```
@@ -1,41 +1,64 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :current_url, :render_js, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @current_url = options.fetch(:url, '')
6
+ @url = options.fetch(:url, '')
7
7
  @request_method = options.fetch(:request_method, :get)
8
8
  @request_body = options.fetch(:request_body, {})
9
9
 
10
- @render_js = options.fetch(:render_js, false)
10
+ @js = options.fetch(:js, false)
11
+ @headless = options.fetch(:headless, true)
11
12
  @user_agent = options.fetch(
12
13
  :user_agent,
13
14
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
14
15
  )
15
16
  end
16
17
 
17
- def follow(url = '')
18
- @current_url = url unless url.empty?
19
- raise 'Invalid URL supplied' if @current_url.empty?
18
+ def browser
19
+ @browser ||= create_browser
20
+ end
21
+
22
+ # Visit the URL and return the browser; only GET requests are allowed
23
+ def follow(current_url = '')
24
+ @url = current_url unless current_url.empty?
25
+ raise 'Invalid URL supplied' if url.empty?
26
+
27
+ raise invalid_request_without_get if request_method != :get
20
28
 
21
- @browser = create_browser
22
- @browser.visit(@current_url)
29
+ browser.visit(url)
30
+ browser
31
+ end
32
+
33
+ # Plain NetHttp request returning the response body; not available when js is enabled
34
+ def request(current_url = '')
35
+ @url = current_url unless current_url.empty?
36
+ raise 'Invalid URL supplied' if url.empty?
37
+
38
+ raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
+
40
+ browser.visit(url).body
23
41
  end
24
42
 
25
43
  private
26
44
 
27
45
  def create_browser
28
- engine = render_js ? 'Chrome' : 'NetHttp'
46
+ engine = js ? 'Chrome' : 'NetHttp'
29
47
  builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
30
- builder.new browser_params
48
+ builder.new(browser_params).build
31
49
  end
32
50
 
33
51
  def browser_params
34
52
  {
35
53
  user_agent: user_agent,
54
+ headless: headless,
36
55
  request_method: request_method,
37
56
  request_body: request_body
38
57
  }
39
58
  end
59
+
60
+ def invalid_request_without_get
61
+ "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
62
+ end
40
63
  end
41
64
  end
@@ -2,10 +2,14 @@ module Makuri::BrowserBuilder
2
2
  class Base
3
3
  attr_accessor :user_agent, :request_method, :request_body
4
4
 
5
- def initialize(options)
6
- @user_agent = options.fetch(:user_agent)
7
- @request_method = options.fetch(:request_method)
8
- @request_body = options.fetch(:request_body)
5
+ def initialize(options = {})
6
+ @user_agent = options.fetch(:user_agent, '')
7
+ @request_method = options.fetch(:request_method, :get)
8
+ @request_body = options.fetch(:request_body, '')
9
+ end
10
+
11
+ def build
12
+ raise NotImplementedError
9
13
  end
10
14
  end
11
15
  end
@@ -1,28 +1,34 @@
1
+ require 'capybara'
1
2
  require 'selenium-webdriver'
2
3
 
3
4
  module Makuri::BrowserBuilder
4
5
  class Chrome < Base
5
- def visit(url)
6
- raise invalid_request_message if request_method != :get
6
+ def initialize(options = {})
7
+ super
8
+ @headless = options.fetch(:headless, true)
9
+ end
7
10
 
8
- @browser = Selenium::WebDriver.for :chrome, options: browser_options
9
- @browser.get url
10
- @browser.page_source
11
+ def build
12
+ Capybara.register_driver :selenium_chrome do |app|
13
+ Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
14
+ end
15
+ Capybara.threadsafe = true
16
+ Capybara::Session.new :selenium_chrome
11
17
  end
12
18
 
13
19
  private
14
20
 
15
21
  def browser_options
16
22
  args = %w[
17
- --headless --disable-gpu --no-sandbox --disable-translate
18
- --blink-settings=imagesEnabled=false --ignore-certificate-errors
23
+ --disable-gpu
24
+ --no-sandbox
25
+ --disable-translate
26
+ --blink-settings=imagesEnabled=false
27
+ --ignore-certificate-errors
19
28
  ]
20
- args << "--user-agent=#{user_agent}"
29
+ args << '--headless' if @headless
30
+ args << "--user-agent=#{@user_agent}"
21
31
  Selenium::WebDriver::Chrome::Options.new(args: args)
22
32
  end
23
-
24
- def invalid_request_message
25
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'render_js=true' argument!"
26
- end
27
33
  end
28
34
  end
@@ -2,6 +2,10 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ def build
6
+ self
7
+ end
8
+
5
9
  def visit(url)
6
10
  uri = URI(url)
7
11
  headers = { 'User-Agent': user_agent }
@@ -10,7 +14,7 @@ module Makuri::BrowserBuilder
10
14
 
11
15
  Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
12
16
  http.request(request)
13
- end.body
17
+ end
14
18
  end
15
19
 
16
20
  private
@@ -55,7 +55,7 @@ module Makuri
55
55
  end
56
56
 
57
57
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.current_url, relative_url).to_s
58
+ Addressable::URI.join(browser.url, relative_url).to_s
59
59
  end
60
60
 
61
61
  private
@@ -65,7 +65,7 @@ module Makuri
65
65
  end
66
66
 
67
67
  def update_response(url)
68
- html = browser.follow absolute_url(url)
68
+ html = browser.request absolute_url(url)
69
69
  @response = Nokogiri::HTML(html)
70
70
  end
71
71
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.2.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-06 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: capybara
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.34'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.34'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: nokogiri
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,14 +58,14 @@ dependencies:
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: '3.142'
61
+ version: '3.5'
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: '3.142'
68
+ version: '3.5'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: bundler
57
71
  requirement: !ruby/object:Gem::Requirement