web_stat 0.3.5 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b35413482a93f316bbd3d99a037f0e97102544a4e04604b5add670ac1a1500a
4
- data.tar.gz: 4517b7754b2096901b005c26497fa2addad572b02f08a7aed385f6dc7de2e55a
3
+ metadata.gz: 29a2c3d0e37310f75c009e696044495e1f1b1f3d9ee8840d08aec711b67b4a11
4
+ data.tar.gz: 1214f85ed921b82f964cb77f5b882c3f41a47db36188c7ce789208644918fcf7
5
5
  SHA512:
6
- metadata.gz: a54c666953b0c51e1e5ea8d230069bf608e6284629740070f77f3816468553037852c0cbc6c39c35cd53000435fbced4acee6a7b0ca855f2c241e0fb769d32da
7
- data.tar.gz: 3fef9c1c48f272e27c877a4a588ce45f841ad316429bb78fc3aa6de5748d6471b9e5aaed2ee076af0d3fa0779b5696c452c1b0fc2f77644c7914b171377def8e
6
+ metadata.gz: bad407ab3233b5a3fc11e5bb770862bd6346a42759830c66f04b9b525773c9459972d5ab78d8b86281914dfd78051898ad14c6a77d04ed38392b75dc4a3760e3
7
+ data.tar.gz: 2854708ae4aec9833f288ca3bece3a03a548aeaf5177582fb1ed569c85c27fd97b7897a9d080d81b8b670590e52d5b1a666eb918606c5643cc9286593ea9fd12
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.5)
4
+ web_stat (0.3.10)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
@@ -5,14 +5,29 @@ module WebStat
5
5
  # @param [String] url
6
6
  # @param [Integer] delay
7
7
  def get_last_url(url, delay=nil)
8
+ driver = get_driver(url, delay)
9
+ last_url = driver.current_url
10
+ driver.quit
11
+ last_url
12
+ end
13
+ # Get the HTML source of the page
14
+ # @param [String] url
15
+ # @param [Integer] delay
16
+ def get_source(url, delay=nil)
17
+ driver = get_driver(url, delay)
18
+ source = driver.page_source
19
+ driver.quit
20
+ source
21
+ end
22
+
23
+ private
24
+ def get_driver(url, delay=nil)
8
25
  Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
9
26
  Selenium::WebDriver.logger.level = :info
10
27
  options = Selenium::WebDriver::Chrome::Options.new(args: [
11
28
  'headless',
12
29
  'no-sandbox',
13
- 'disable-gpu',
14
- 'start-maximized',
15
- 'window-size=1920,1080'
30
+ 'disable-gpu'
16
31
  ])
17
32
  driver = Selenium::WebDriver.for(:chrome, options: options)
18
33
  driver.manage.timeouts.implicit_wait = 10
@@ -21,9 +36,7 @@ module WebStat
21
36
  if delay.is_a?(Integer)
22
37
  sleep delay
23
38
  end
24
- last_url = driver.current_url
25
- driver.quit
26
- last_url
39
+ driver
27
40
  end
28
41
  end
29
42
  end
@@ -47,6 +47,9 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
+ if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
51
+ path = @nokogiri.at('body').xpath('//img').first.attr('src')
52
+ end
50
53
  if ! path.nil? && path.match(/^\//)
51
54
  "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
52
55
  else
@@ -75,17 +78,25 @@ module WebStat
75
78
  # @param [String] url
76
79
  # @param [String] body
77
80
  def get_url(url)
78
- agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
81
+ mech = Mechanize.new { |_mech| _mech.user_agent = WebStat::Configure.get["user_agent"] }
79
82
  # Enable to read Robots.txt
80
- agent.robots = true
83
+ mech.robots = true
81
84
  begin
82
- document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
83
- if document.class == Mechanize::File
84
- body = document.body
85
+ if mech.agent.robots_disallowed?(url)
86
+ raise Mechanize::RobotsDisallowedError.new(url)
87
+ end
88
+ if WebStat::Configure.get["use_chromedirver"]
89
+ body = WebStat::WebDriverHelper.get_source(url)
90
+ @status = 200
85
91
  else
86
- body = document.body.encode('UTF-8', document.encoding)
92
+ document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
93
+ if document.class == Mechanize::File
94
+ body = document.body
95
+ else
96
+ body = document.body.encode('UTF-8', document.encoding)
97
+ end
98
+ @status = document.code
87
99
  end
88
- @status = document.code
89
100
  rescue Mechanize::ResponseCodeError => e
90
101
  body = e.page.body
91
102
  @status = e.page.code
@@ -96,7 +107,7 @@ module WebStat
96
107
  # Get the information about @url
97
108
  # @param [Hash] userdics Specify a dictionary for each language code, e.g. {"ja": /***/**.dic, "other": /***/***.dic}
98
109
  def stat(userdics: nil)
99
- clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
110
+ clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
100
111
  language_code = CLD.detect_language(clean_content)[:code]
101
112
  if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
102
113
  tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.5"
2
+ VERSION = "0.3.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-13 00:00:00.000000000 Z
11
+ date: 2020-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler