makuri 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a5ecdfca240407bdcc2554114a82c0f7e66790fed4097d4b5b9cb654c603bbc
4
- data.tar.gz: bfc53dea621f432bd764bdb371e48a8da6aceeb2a3b1c364fa12634f87e5a982
3
+ metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
+ data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
5
5
  SHA512:
6
- metadata.gz: 98763e884eeb48faf3febfb6f2b2872cb59ec4dbe4be31e5dd45c5b1b8a228b317ad5e15b911ccb119d18494eb1e6be7e76864fe490e483ab9d4aba1b12704bb
7
- data.tar.gz: cd8dc00f1ec717983f8ee715ba891a86af53eaebec85333c1b0248096860cf899df1c4ce9b371ca56394c5873a0c00b485e950f6f29ec782b1cc59b741d2c95f
6
+ metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
+ data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
@@ -1,3 +1,10 @@
1
- ## 0.1.0 (2020-12-31)
1
+ ## 0.2.0 (2021-01-25)
2
+ - Add Capybara Support
3
+ - Add :headless option to Browser object
4
+ - Fix bugs and restructure the Browser #follow and #request methods
2
5
 
3
- - Unreleased - Work in Progress
6
+ ## 0.1.0 (2021-01-05)
7
+
8
+ - First Release
9
+ - Add Basic Browser Builders for crawling web pages
10
+ - Add Spider Framework to support scraping
data/README.md CHANGED
@@ -2,6 +2,21 @@
2
2
 
3
3
  Makuri is a Web-crawling framework for Ruby.
4
4
 
5
+ # Install
6
+
7
+ Add this to your application's Gemfile:
8
+ ```ruby
9
+ gem 'makuri'
10
+ ```
11
+ And execute:
12
+ ```sh
13
+ $ bundle
14
+ ```
15
+ Or install it as:
16
+ ```sh
17
+ $ gem install makuri
18
+ ```
19
+
5
20
  # Usage
6
21
  In this example, we are going to crawl the [quotes website](https://quotes.toscrape.com) and scrape data as:
7
22
  ```ruby
@@ -32,8 +47,8 @@ end
32
47
  QuotesSpider.run
33
48
  ```
34
49
  Now save this code to a file named ```quotes_spider.rb``` and run it as:
35
- ```
36
- $> ruby quotes_spider.rb > quotes.json
50
+ ```sh
51
+ $ ruby quotes_spider.rb > quotes.json
37
52
  ```
38
53
  When it's done, you will find all the quotes saved to the ```quotes.json``` file. It's that easy.
39
54
 
@@ -53,8 +68,8 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
53
68
  To get started with development:
54
69
 
55
70
  ```sh
56
- git clone https://github.com/lalusaud/makuri.git
57
- cd rover
58
- bundle install
59
- bundle exec rake test
71
+ $ git clone https://github.com/lalusaud/makuri.git
72
+ $ cd makuri
73
+ $ bundle install
74
+ $ bundle exec rake test
60
75
  ```
@@ -1,41 +1,64 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :current_url, :render_js, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @current_url = options.fetch(:url, '')
6
+ @url = options.fetch(:url, '')
7
7
  @request_method = options.fetch(:request_method, :get)
8
8
  @request_body = options.fetch(:request_body, {})
9
9
 
10
- @render_js = options.fetch(:render_js, false)
10
+ @js = options.fetch(:js, false)
11
+ @headless = options.fetch(:headless, true)
11
12
  @user_agent = options.fetch(
12
13
  :user_agent,
13
14
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
14
15
  )
15
16
  end
16
17
 
17
- def follow(url = '')
18
- @current_url = url unless url.empty?
19
- raise 'Invalid URL supplied' if @current_url.empty?
18
+ def browser
19
+ @browser ||= create_browser
20
+ end
21
+
22
+ # Visits the URL and returns the browser object; only GET requests are allowed
23
+ def follow(current_url = '')
24
+ @url = current_url unless current_url.empty?
25
+ raise 'Invalid URL supplied' if url.empty?
26
+
27
+ raise invalid_request_without_get if request_method != :get
20
28
 
21
- @browser = create_browser
22
- @browser.visit(@current_url)
29
+ browser.visit(url)
30
+ browser
31
+ end
32
+
33
+ # Visits the URL and returns the response body; not available when js is enabled (use #follow instead)
34
+ def request(current_url = '')
35
+ @url = current_url unless current_url.empty?
36
+ raise 'Invalid URL supplied' if url.empty?
37
+
38
+ raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
+
40
+ browser.visit(url).body
23
41
  end
24
42
 
25
43
  private
26
44
 
27
45
  def create_browser
28
- engine = render_js ? 'Chrome' : 'NetHttp'
46
+ engine = js ? 'Chrome' : 'NetHttp'
29
47
  builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
30
- builder.new browser_params
48
+ builder.new(browser_params).build
31
49
  end
32
50
 
33
51
  def browser_params
34
52
  {
35
53
  user_agent: user_agent,
54
+ headless: headless,
36
55
  request_method: request_method,
37
56
  request_body: request_body
38
57
  }
39
58
  end
59
+
60
+ def invalid_request_without_get
61
+ "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
62
+ end
40
63
  end
41
64
  end
@@ -2,10 +2,14 @@ module Makuri::BrowserBuilder
2
2
  class Base
3
3
  attr_accessor :user_agent, :request_method, :request_body
4
4
 
5
- def initialize(options)
6
- @user_agent = options.fetch(:user_agent)
7
- @request_method = options.fetch(:request_method)
8
- @request_body = options.fetch(:request_body)
5
+ def initialize(options = {})
6
+ @user_agent = options.fetch(:user_agent, '')
7
+ @request_method = options.fetch(:request_method, :get)
8
+ @request_body = options.fetch(:request_body, '')
9
+ end
10
+
11
+ def build
12
+ raise NotImplementedError
9
13
  end
10
14
  end
11
15
  end
@@ -1,28 +1,34 @@
1
+ require 'capybara'
1
2
  require 'selenium-webdriver'
2
3
 
3
4
  module Makuri::BrowserBuilder
4
5
  class Chrome < Base
5
- def visit(url)
6
- raise invalid_request_message if request_method != :get
6
+ def initialize(options = {})
7
+ super
8
+ @headless = options.fetch(:headless, true)
9
+ end
7
10
 
8
- @browser = Selenium::WebDriver.for :chrome, options: browser_options
9
- @browser.get url
10
- @browser.page_source
11
+ def build
12
+ Capybara.register_driver :selenium_chrome do |app|
13
+ Capybara::Selenium::Driver.new app, browser: :chrome, options: browser_options
14
+ end
15
+ Capybara.threadsafe = true
16
+ Capybara::Session.new :selenium_chrome
11
17
  end
12
18
 
13
19
  private
14
20
 
15
21
  def browser_options
16
22
  args = %w[
17
- --headless --disable-gpu --no-sandbox --disable-translate
18
- --blink-settings=imagesEnabled=false --ignore-certificate-errors
23
+ --disable-gpu
24
+ --no-sandbox
25
+ --disable-translate
26
+ --blink-settings=imagesEnabled=false
27
+ --ignore-certificate-errors
19
28
  ]
20
- args << "--user-agent=#{user_agent}"
29
+ args << '--headless' if @headless
30
+ args << "--user-agent=#{@user_agent}"
21
31
  Selenium::WebDriver::Chrome::Options.new(args: args)
22
32
  end
23
-
24
- def invalid_request_message
25
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'render_js=true' argument!"
26
- end
27
33
  end
28
34
  end
@@ -2,6 +2,10 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ def build
6
+ self
7
+ end
8
+
5
9
  def visit(url)
6
10
  uri = URI(url)
7
11
  headers = { 'User-Agent': user_agent }
@@ -10,7 +14,7 @@ module Makuri::BrowserBuilder
10
14
 
11
15
  Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
12
16
  http.request(request)
13
- end.body
17
+ end
14
18
  end
15
19
 
16
20
  private
@@ -55,7 +55,7 @@ module Makuri
55
55
  end
56
56
 
57
57
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.current_url, relative_url).to_s
58
+ Addressable::URI.join(browser.url, relative_url).to_s
59
59
  end
60
60
 
61
61
  private
@@ -65,7 +65,7 @@ module Makuri
65
65
  end
66
66
 
67
67
  def update_response(url)
68
- html = browser.follow absolute_url(url)
68
+ html = browser.request absolute_url(url)
69
69
  @response = Nokogiri::HTML(html)
70
70
  end
71
71
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.2.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-06 00:00:00.000000000 Z
11
+ date: 2021-01-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: capybara
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.34'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.34'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: nokogiri
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,14 +58,14 @@ dependencies:
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: '3.142'
61
+ version: '3.5'
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: '3.142'
68
+ version: '3.5'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: bundler
57
71
  requirement: !ruby/object:Gem::Requirement