makuri 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1106d01d72194894872b6fa5409c9f6951f27ff462a1ad81c793dd10c043974
4
- data.tar.gz: 2d8ae6ef66759cdcdcf980e9e3b44f840f954db9ea3536b1a9ad3d656fec5e17
3
+ metadata.gz: efda242d320f75d0c7d483bd71f65f638a62624d0319543386b085914610010e
4
+ data.tar.gz: 12a830ef4e4a72926ee91e50f378fc0170acfd491bbc6c6a16dee08c0116288e
5
5
  SHA512:
6
- metadata.gz: ddf5f382fccbcbfff68fa26f3bed25c07accaee86d2d3379b1471ea4690ff431b1f70330e9a29be1d65af64bc03652e06938dc5b31b3d5f506c96a375438c473
7
- data.tar.gz: '0046904279c80dd1c9e6db0621dddcbcf8796ed8977940fc79dd9f500287128591cc2206f289ae271bbb0114c38d03ebe4d9a2ad894f4ceb34f99a6768493b6b'
6
+ metadata.gz: aaa1e7fb93b2cd25892dea2d34ea0e9edce13a6ac31aed7dc44b62b49b8b5080d4515eb903cba916d2a9634101f37980a872035f41a5b3632f2a608dd6e936b7
7
+ data.tar.gz: e32c252ed9940070600ce4f08ff9faa1ff93f5e225c674a5ad501ae5d4f9a6c3c02cdfd935d49284e981651fa86126298dcd1b06783ced0e85ef699f6d5dddad
data/CHANGELOG.md CHANGED
@@ -1,10 +1,15 @@
1
+ ## 0.3.0 (2021-02-07)
2
+ - Allow scrape from multiple start_urls
3
+ - Add :engine support to Spider
4
+ - Add :enable_images support to :chrome engine
5
+ - Bug fixes and restructure #request_method & #request_body
6
+
1
7
  ## 0.2.0 (2021-01-25)
2
8
  - Add Capybara Support
3
9
  - Add :headless option to Browser object
4
10
  - Bug fixes & restructure Browser #follow, #request methods
5
11
 
6
12
  ## 0.1.0 (2021-01-05)
7
-
8
13
  - First Release
9
14
  - Add Basic Browser Builders for crawling web pages
10
15
  - Add Spider Framework to support scraping
@@ -1,50 +1,47 @@
1
1
  module Makuri
2
2
  class Browser
3
- attr_accessor :url, :js, :headless, :user_agent, :request_method, :request_body
3
+ attr_accessor :url, :engine, :headless, :user_agent, :request_method, :request_body
4
4
 
5
5
  def initialize(options = {})
6
- @url = options.fetch(:url, '')
7
- @request_method = options.fetch(:request_method, :get)
8
- @request_body = options.fetch(:request_body, {})
9
-
10
- @js = options.fetch(:js, false)
6
+ @engine = options.fetch(:engine, :net_http)
11
7
  @headless = options.fetch(:headless, true)
12
8
  @user_agent = options.fetch(
13
9
  :user_agent,
14
10
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
15
11
  )
12
+
13
+ # Defaults
14
+ @request_method = :get
15
+ @request_body = {}
16
16
  end
17
17
 
18
18
  def browser
19
19
  @browser ||= create_browser
20
20
  end
21
21
 
22
- # Only allow for capybara get request
23
- def follow(current_url = '')
24
- @url = current_url unless current_url.empty?
25
- raise 'Invalid URL supplied' if url.empty?
22
+ # Allow for NetHttp get request
23
+ def request(url, options = {})
24
+ @url = url
25
+ validate_url
26
26
 
27
- raise invalid_request_without_get if request_method != :get
27
+ return follow if engine_chrome?
28
28
 
29
- browser.visit(url)
30
- browser
29
+ @request_method = options.fetch(:method, request_method)
30
+ @request_body = options.fetch(:body, request_body)
31
+ browser.visit(@url)
31
32
  end
32
33
 
33
- # Allow for NetHttp get request
34
- def request(current_url = '')
35
- @url = current_url unless current_url.empty?
36
- raise 'Invalid URL supplied' if url.empty?
37
-
38
- raise 'Invalid request type for js=true. Use #follow in place of #get' if js
39
-
40
- browser.visit(url).body
34
+ # Only allow for capybara get request
35
+ def follow
36
+ browser.visit(@url)
37
+ browser
41
38
  end
42
39
 
43
40
  private
44
41
 
45
42
  def create_browser
46
- engine = js ? 'Chrome' : 'NetHttp'
47
- builder = Object.const_get "Makuri::BrowserBuilder::#{engine}"
43
+ browser_engine = engine_chrome? ? 'Chrome' : 'NetHttp'
44
+ builder = Object.const_get "Makuri::BrowserBuilder::#{browser_engine}"
48
45
  builder.new(browser_params).build
49
46
  end
50
47
 
@@ -57,8 +54,13 @@ module Makuri
57
54
  }
58
55
  end
59
56
 
60
- def invalid_request_without_get
61
- "#{request_method.to_s.upcase} request not allowed for JS Engine. Try without 'js=true' argument!"
57
+ def validate_url
58
+ uri = URI.parse(@url)
59
+ raise 'Invalid URL supplied' unless uri.is_a?(URI::HTTP) && !uri.host.nil?
60
+ end
61
+
62
+ def engine_chrome?
63
+ engine == :chrome
62
64
  end
63
65
  end
64
66
  end
@@ -3,9 +3,12 @@ require 'selenium-webdriver'
3
3
 
4
4
  module Makuri::BrowserBuilder
5
5
  class Chrome < Base
6
+ attr_accessor :headless, :enable_images
7
+
6
8
  def initialize(options = {})
7
9
  super
8
- @headless = options.fetch(:headless, true)
10
+ @headless = options.fetch(:headless, true)
11
+ @enable_images = options.fetch(:enable_images, true)
9
12
  end
10
13
 
11
14
  def build
@@ -23,11 +26,11 @@ module Makuri::BrowserBuilder
23
26
  --disable-gpu
24
27
  --no-sandbox
25
28
  --disable-translate
26
- --blink-settings=imagesEnabled=false
27
29
  --ignore-certificate-errors
28
30
  ]
29
- args << '--headless' if @headless
30
- args << "--user-agent=#{@user_agent}"
31
+ args << '--headless' if headless
32
+ args << "--user-agent=#{user_agent}"
33
+ args << "--blink-settings=imagesEnabled=#{enable_images}"
31
34
  Selenium::WebDriver::Chrome::Options.new(args: args)
32
35
  end
33
36
  end
@@ -2,6 +2,8 @@ require 'net/http'
2
2
 
3
3
  module Makuri::BrowserBuilder
4
4
  class NetHttp < Base
5
+ attr_accessor :response
6
+
5
7
  def build
6
8
  self
7
9
  end
@@ -12,9 +14,14 @@ module Makuri::BrowserBuilder
12
14
 
13
15
  request = send("#{request_method}_request", uri.request_uri, headers)
14
16
 
15
- Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
17
+ @response = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == 'https')) do |http|
16
18
  http.request(request)
17
19
  end
20
+ self
21
+ end
22
+
23
+ def html
24
+ @response.body
18
25
  end
19
26
 
20
27
  private
data/lib/makuri/spider.rb CHANGED
@@ -18,23 +18,30 @@ module Makuri
18
18
  @start_urls = urls
19
19
  end
20
20
 
21
+ def spider_options(**options)
22
+ @engine = options.fetch(:engine, :net_http)
23
+ end
24
+
21
25
  def run
22
26
  raise "Start URLs not found. Define start_urls for #{self}." unless defined? @start_urls
23
27
 
24
- new(@start_urls).parse
28
+ @engine ||= :net_http
29
+
30
+ @start_urls.each { |start_url| new(start_url: start_url, engine: @engine).parse }
25
31
  end
26
32
  end
27
33
 
28
- attr_accessor :start_urls, :response
34
+ attr_accessor :engine, :response
29
35
 
30
- def initialize(start_urls)
31
- @start_urls = start_urls
36
+ def initialize(**config)
37
+ @start_url = config.fetch(:start_url, nil)
38
+ @engine = config.fetch(:engine, :net_http)
32
39
 
33
- update_response(@start_urls[0])
40
+ update_response(@start_url)
34
41
  end
35
42
 
36
43
  def browser
37
- @browser ||= Makuri::Browser.new
44
+ @browser ||= Makuri::Browser.new(engine: engine)
38
45
  end
39
46
 
40
47
  def parse
@@ -55,18 +62,23 @@ module Makuri
55
62
  end
56
63
 
57
64
  def absolute_url(relative_url)
58
- Addressable::URI.join(browser.url, relative_url).to_s
65
+ Addressable::URI.join(base_url, relative_url).to_s
59
66
  end
60
67
 
61
68
  private
62
69
 
70
+ def base_url
71
+ @start_url || browser.url.to_s
72
+ end
73
+
63
74
  def valid_url?(url)
64
75
  defined?(url) && !url.to_s.empty?
65
76
  end
66
77
 
67
78
  def update_response(url)
68
- html = browser.request absolute_url(url)
69
- @response = Nokogiri::HTML(html)
79
+ res = browser.request(absolute_url(url)).html
80
+ # res = res.html if @engine == :chrome
81
+ @response = Nokogiri::HTML(res)
70
82
  end
71
83
  end
72
84
  end
@@ -1,3 +1,3 @@
1
1
  module Makuri
2
- VERSION = '0.2.0'.freeze
2
+ VERSION = '0.3.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: makuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lal Saud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-25 00:00:00.000000000 Z
11
+ date: 2021-02-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webdrivers
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '4.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '4.0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: webmock
85
99
  requirement: !ruby/object:Gem::Requirement