web_stat 0.3.4 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 746fe3666bc5a47c315e0499fd23dc84319142a8d850b872e5118553b1e711b6
4
- data.tar.gz: d2335c2d0324fc81d5b891e7cf383646f8573ef0147ac4b34672add61ecdb6c6
3
+ metadata.gz: ff258b9a50e224dfb465ad698871b2dc95dc2aaf028f5e966b0b920ca127f5f6
4
+ data.tar.gz: 0fe37dff66de85ac77137f95e94ef2e3a2e42ae1931b3b138017481c3cda9734
5
5
  SHA512:
6
- metadata.gz: d23e332cb9114aec4c7302aafb1ed4ca6fc081b3b71a21586779ec5d67de74a196508c38dd70bb8c1e5af3828ced4bdeda7f7c86ea84336fbd6fc476ba87de31
7
- data.tar.gz: 9c4e7955432b59b8f66edf71d814412e6eb7720c81f1c647e9f9420f7592febbcfe349c7ceb7a2b6d371c1b7d1de7d98148f7c015d5cc1f4ef52cf08a84afcbe
6
+ metadata.gz: c2158946ad78a192e902e6b74b9fd72ecfc001afe2c3b531c2e8cea9d7b37239d280174066329e3f908d843c0173ddc03181bf8af529870326f29bd19bf5ee06
7
+ data.tar.gz: eea91e8addd528e53989c94385c9d339895d138fea4fa911383b6c8157ade28a9c1c75566f2f40ca40d0f5c3b73267188b3695a573bc155a14150b22e3bebb47
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.4)
4
+ web_stat (0.3.7)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
@@ -83,7 +83,7 @@ GEM
83
83
  nokogiri (>= 1.6.0)
84
84
  rubyzip (2.3.0)
85
85
  safe_yaml (1.0.5)
86
- sanitize (5.2.0)
86
+ sanitize (5.2.1)
87
87
  crass (~> 1.0.2)
88
88
  nokogiri (>= 1.8.0)
89
89
  nokogumbo (~> 2.0)
@@ -10,9 +10,7 @@ module WebStat
10
10
  options = Selenium::WebDriver::Chrome::Options.new(args: [
11
11
  'headless',
12
12
  'no-sandbox',
13
- 'disable-gpu',
14
- 'start-maximized',
15
- 'window-size=1920,1080'
13
+ 'disable-gpu'
16
14
  ])
17
15
  driver = Selenium::WebDriver.for(:chrome, options: options)
18
16
  driver.manage.timeouts.implicit_wait = 10
@@ -47,6 +47,9 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
+ if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
51
+ path = @nokogiri.at('body').xpath('//img').first.attr('src')
52
+ end
50
53
  if ! path.nil? && path.match(/^\//)
51
54
  "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
52
55
  else
@@ -57,7 +60,7 @@ module WebStat
57
60
  # Get local path to save url
58
61
  # @param [String] url
59
62
  def save_local_path(url)
60
- return nil if url.nil? || url.empty?
63
+ return nil if url.nil? || ! url.match(%{^http})
61
64
  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
62
65
  agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
63
66
  image = agent.get(url)
@@ -75,17 +78,25 @@ module WebStat
75
78
  # @param [String] url
76
79
  # @param [String] body
77
80
  def get_url(url)
78
- agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
81
+ mech = Mechanize.new { |_mech| _mech.user_agent = WebStat::Configure.get["user_agent"] }
79
82
  # Enable to read Robots.txt
80
- agent.robots = true
83
+ mech.robots = true
81
84
  begin
82
- document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
83
- if document.class == Mechanize::File
84
- body = document.body
85
+ if mech.agent.robots_disallowed?(url)
86
+ raise Mechanize::RobotsDisallowedError.new(url)
87
+ end
88
+ if WebStat::Configure.get["use_chromedirver"]
89
+ document = WebStat::WebDriverHelper.get_last_url(url)
90
+ @status = 200
85
91
  else
86
- body = document.body.encode('UTF-8', document.encoding)
92
+ document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
93
+ if document.class == Mechanize::File
94
+ body = document.body
95
+ else
96
+ body = document.body.encode('UTF-8', document.encoding)
97
+ end
98
+ @status = document.code
87
99
  end
88
- @status = document.code
89
100
  rescue Mechanize::ResponseCodeError => e
90
101
  body = e.page.body
91
102
  @status = e.page.code
@@ -96,7 +107,7 @@ module WebStat
96
107
  # Get the informations of @url
97
108
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
98
109
  def stat(userdics: nil)
99
- clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
110
+ clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
100
111
  language_code = CLD.detect_language(clean_content)[:code]
101
112
  if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
102
113
  tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.4"
2
+ VERSION = "0.3.9"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-13 00:00:00.000000000 Z
11
+ date: 2020-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler