makuri 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
- data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
3
+ metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
4
+ data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
5
5
  SHA512:
6
- metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
- data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
6
+ metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
7
+ data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
data/CHANGELOG.md CHANGED
@@ -1,10 +1,15 @@
1
+ ## 0.3.0 (2021-02-07)
2
+ - Allow scrape from multiple start_urls
3
+ - Add :engine support to Spider
4
+ - Add :enable_images support to :chrome engine
5
+ - Bug fixes and restructure #request_method & #request_body
6
+
1
7
  ## 0.2.0 (2021-01-25)
2
8
  - Add Capybara Support
3
9
  - Add :headless option to Browser object
4
10
  - Bug fixes & restructure Browser #follow, #request methods
5
11
 
6
12
  ## 0.1.0 (2021-01-05)
7
-
8
13
  - First Release
9
14
  - Add Basic Browser Builders for crawling web pages
10
15
  - Add Spider Framework to support scraping
@@ -1,50 +1,47 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @url = options.fetch(:url, '')
7
- @request_method = options.fetch(:request_method, :get)
8
- @request_body = options.fetch(:request_body, {})
9
-
10
- @js = options.fetch(:js, false)
6
+ @engine = options.fetch(:engine, :net_http)
11
7
  @headless = options.fetch(:headless, true)
12
8
  @user_agent = options.fetch(
13
9
  :user_agent,
14
10
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
15
11
  )
12
+
13
+ # Defaults
14
+ @request_method = :get
15
+ @request_body = {}
16
16
  end
17
17
 
18
18
  def browser
19
19
  @browser ||= create_browser
20
20
  end
21
21
 
22
- # Only allow for capybara get request
23
- def follow(current_url = '')
24
- @url = current_url unless current_url.empty?
25
- raise 'Invalid URL supplied' if url.empty?
22
+ # Allow for NetHttp get request
23
+ def request(url, options = {})
24
+ @url = url
25
+ validate_url
26
26
 
27
- raise invalid_request_without_get if request_method != :get
27
+ return follow if engine_chrome?
28
28
 
29
- browser.visit(url)
30
- browser
29
+ @request_method = options.fetch(:method, request_method)
30
+ @request_body = options.fetch(:body, request_body)
31
+ browser.visit(@url)
31
32
  end
32
33
 
33
- # Allow for NetHttp get request
34
- def request(current_url = '')
35
- @url = current_url unless current_url.empty?
36
- raise 'Invalid URL supplied' if url.empty?
37
-
38
- raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
-
40
- browser.visit(url).body
34
+ # Only allow for capybara get request
35
+ def follow
36
+ browser.visit(@url)
37
+ browser
41
38
  end
42
39
 
43
40
  private
44
41
 
45
42
  def create_browser
46
- engine = js ? 'Chrome' : 'NetHttp'
47
- builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
43
+ browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
44
+ builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
48
45
  builder.new(browser_params).build
49
46
  end
50
47
 
@@ -57,8 +54,13 @@ module Makuri
57
54
  }
58
55
  end
59
56
 
60
- def invalid_request_without_get
61
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
57
+ def validate_url
58
+ uri = URI.parse(@url)
59
+ raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
60
+ end
61
+
62
+ def engine_chrome?
63
+ engine == :chrome
62
64
  end
63
65
  end
64
66
  end
@@ -3,9 +3,12 @@ require 'selenium-webdriver'
3
3
 
4
4
  module Makuri::BrowserBuilder
5
5
  class Chrome < Base
6
+ attr_accessor :headless, :enable_images
7
+
6
8
  def initialize(options = {})
7
9
  super
8
- @headless = options.fetch(:headless, true)
10
+ @headless = options.fetch(:headless, true)
11
+ @enable_images = options.fetch(:enable_images, true)
9
12
  end
10
13
 
11
14
  def build
@@ -23,11 +26,11 @@ module Makuri::BrowserBuilder
23
26
  --disable-gpu
24
27
  --no-sandbox
25
28
  --disable-translate
26
- --blink-settings=imagesEnabled=false
27
29
  --ignore-certificate-errors
28
30
  ]
29
- args << '--headless' if @headless
30
- args << "--user-agent=#{@user_agent}"
31
+ args << '--headless' if headless
32
+ args << "--user-agent=#{user_agent}"
33
+ args << "--blink-settings=imagesEnabled=#{enable_images}"
31
34
  Selenium::WebDriver::Chrome::Options.new(args: args)
32
35
  end
33
36
  end
@@ -2,6 +2,8 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ attr_accessor :response
6
+
5
7
  def build
6
8
  self
7
9
  end
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
12
14
 
13
15
  request = send("#{request_method}_request", uri.request_uri, headers)
14
16
 
15
- Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
17
+ @response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
16
18
  http.request(request)
17
19
  end
20
+ self
21
+ end
22
+
23
+ def html
24
+ @response.body
18
25
  end
19
26
 
20
27
  private
data/lib/makuri/spider.rb CHANGED
@@ -18,23 +18,30 @@ module Makuri
18
18
  @start_urls = urls
19
19
  end
20
20
 
21
+ def spider_options(**options)
22
+ @engine = options.fetch(:engine, :net_http)
23
+ end
24
+
21
25
  def run
22
26
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
23
27
 
24
- new(@start_urls).parse
28
+ @engine ||= :net_http
29
+
30
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
25
31
  end
26
32
  end
27
33
 
28
- attr_accessor :start_urls, :response
34
+ attr_accessor :engine, :response
29
35
 
30
- def initialize(start_urls)
31
- @start_urls = start_urls
36
+ def initialize(**config)
37
+ @start_url = config.fetch(:start_url, nil)
38
+ @engine = config.fetch(:engine, :net_http)
32
39
 
33
- update_response(@start_urls[0])
40
+ update_response(@start_url)
34
41
  end
35
42
 
36
43
  def browser
37
- @browser ||= Makuri::Browser.new
44
+ @browser ||= Makuri::Browser.new(engine: engine)
38
45
  end
39
46
 
40
47
  def parse
@@ -55,18 +62,23 @@ module Makuri
55
62
  end
56
63
 
57
64
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.url, relative_url).to_s
65
+ Addressable::URI.join(base_url, relative_url).to_s
59
66
  end
60
67
 
61
68
  private
62
69
 
70
+ def base_url
71
+ @start_url || browser.url.to_s
72
+ end
73
+
63
74
  def valid_url?(url)
64
75
  defined?(url) && !url.to_s.empty?
65
76
  end
66
77
 
67
78
  def update_response(url)
68
- html = browser.request absolute_url(url)
69
- @response = Nokogiri::HTML(html)
79
+ res = browser.request(absolute_url(url)).html
80
+ # res = res.html if @engine == :chrome
81
+ @response = Nokogiri::HTML(res)
70
82
  end
71
83
  end
72
84
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.2.0'.freeze
2
+ VERSION = '0.3.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-25 00:00:00.000000000 Z
11
+ date: 2021-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webdrivers
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: webmock
85
99
  requirement: !ruby/object:Gem::Requirement