web_stat 0.3.5 → 0.3.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b35413482a93f316bbd3d99a037f0e97102544a4e04604b5add670ac1a1500a
4
- data.tar.gz: 4517b7754b2096901b005c26497fa2addad572b02f08a7aed385f6dc7de2e55a
3
+ metadata.gz: 29a2c3d0e37310f75c009e696044495e1f1b1f3d9ee8840d08aec711b67b4a11
4
+ data.tar.gz: 1214f85ed921b82f964cb77f5b882c3f41a47db36188c7ce789208644918fcf7
5
5
  SHA512:
6
- metadata.gz: a54c666953b0c51e1e5ea8d230069bf608e6284629740070f77f3816468553037852c0cbc6c39c35cd53000435fbced4acee6a7b0ca855f2c241e0fb769d32da
7
- data.tar.gz: 3fef9c1c48f272e27c877a4a588ce45f841ad316429bb78fc3aa6de5748d6471b9e5aaed2ee076af0d3fa0779b5696c452c1b0fc2f77644c7914b171377def8e
6
+ metadata.gz: bad407ab3233b5a3fc11e5bb770862bd6346a42759830c66f04b9b525773c9459972d5ab78d8b86281914dfd78051898ad14c6a77d04ed38392b75dc4a3760e3
7
+ data.tar.gz: 2854708ae4aec9833f288ca3bece3a03a548aeaf5177582fb1ed569c85c27fd97b7897a9d080d81b8b670590e52d5b1a666eb918606c5643cc9286593ea9fd12
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.5)
4
+ web_stat (0.3.10)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
@@ -5,14 +5,29 @@ module WebStat
5
5
  # @param [String] url
6
6
  # @param [Integer] delay
7
7
  def get_last_url(url, delay=nil)
8
+ driver = get_driver(url, delay)
9
+ last_url = driver.current_url
10
+ driver.quit
11
+ last_url
12
+ end
13
+ # Get source of html
14
+ # @param [String] url
15
+ # @param [Integer] delay
16
+ def get_source(url, delay=nil)
17
+ driver = get_driver(url, delay)
18
+ source = driver.page_source
19
+ driver.quit
20
+ source
21
+ end
22
+
23
+ private
24
+ def get_driver(url, delay=nil)
8
25
  Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
9
26
  Selenium::WebDriver.logger.level = :info
10
27
  options = Selenium::WebDriver::Chrome::Options.new(args: [
11
28
  'headless',
12
29
  'no-sandbox',
13
- 'disable-gpu',
14
- 'start-maximized',
15
- 'window-size=1920,1080'
30
+ 'disable-gpu'
16
31
  ])
17
32
  driver = Selenium::WebDriver.for(:chrome, options: options)
18
33
  driver.manage.timeouts.implicit_wait = 10
@@ -21,9 +36,7 @@ module WebStat
21
36
  if delay.is_a?(Integer)
22
37
  sleep delay
23
38
  end
24
- last_url = driver.current_url
25
- driver.quit
26
- last_url
39
+ driver
27
40
  end
28
41
  end
29
42
  end
@@ -47,6 +47,9 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
+ if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
51
+ path = @nokogiri.at('body').xpath('//img').first.attr('src')
52
+ end
50
53
  if ! path.nil? && path.match(/^\//)
51
54
  "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
52
55
  else
@@ -75,17 +78,25 @@ module WebStat
75
78
  # @param [String] url
76
79
  # @param [String] body
77
80
  def get_url(url)
78
- agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
81
+ mech = Mechanize.new { |_mech| _mech.user_agent = WebStat::Configure.get["user_agent"] }
79
82
  # Enable to read Robots.txt
80
- agent.robots = true
83
+ mech.robots = true
81
84
  begin
82
- document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
83
- if document.class == Mechanize::File
84
- body = document.body
85
+ if mech.agent.robots_disallowed?(url)
86
+ raise Mechanize::RobotsDisallowedError.new(url)
87
+ end
88
+ if WebStat::Configure.get["use_chromedirver"]
89
+ body = WebStat::WebDriverHelper.get_source(url)
90
+ @status = 200
85
91
  else
86
- body = document.body.encode('UTF-8', document.encoding)
92
+ document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
93
+ if document.class == Mechanize::File
94
+ body = document.body
95
+ else
96
+ body = document.body.encode('UTF-8', document.encoding)
97
+ end
98
+ @status = document.code
87
99
  end
88
- @status = document.code
89
100
  rescue Mechanize::ResponseCodeError => e
90
101
  body = e.page.body
91
102
  @status = e.page.code
@@ -96,7 +107,7 @@ module WebStat
96
107
  # Get the informations of @url
97
108
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
98
109
  def stat(userdics: nil)
99
- clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
110
+ clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
100
111
  language_code = CLD.detect_language(clean_content)[:code]
101
112
  if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
102
113
  tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.5"
2
+ VERSION = "0.3.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-13 00:00:00.000000000 Z
11
+ date: 2020-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler