web_stat 0.3.0 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/helpers/web_drive_helper.rb +26 -24
- data/lib/web_stat/config/web_stat.yml +21 -14
- data/lib/web_stat/configure.rb +30 -24
- data/lib/web_stat/fetch.rb +2 -2
- data/lib/web_stat/final_redirect_url.rb +47 -43
- data/lib/web_stat/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b35413482a93f316bbd3d99a037f0e97102544a4e04604b5add670ac1a1500a
|
4
|
+
data.tar.gz: 4517b7754b2096901b005c26497fa2addad572b02f08a7aed385f6dc7de2e55a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a54c666953b0c51e1e5ea8d230069bf608e6284629740070f77f3816468553037852c0cbc6c39c35cd53000435fbced4acee6a7b0ca855f2c241e0fb769d32da
|
7
|
+
data.tar.gz: 3fef9c1c48f272e27c877a4a588ce45f841ad316429bb78fc3aa6de5748d6471b9e5aaed2ee076af0d3fa0779b5696c452c1b0fc2f77644c7914b171377def8e
|
data/Gemfile.lock
CHANGED
data/lib/helpers/web_drive_helper.rb
CHANGED
@@ -1,28 +1,30 @@
|
|
1
|
-
|
2
|
-
class
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
1
|
+
module WebStat
|
2
|
+
class WebDriverHelper
|
3
|
+
class << self
|
4
|
+
# Get last url
|
5
|
+
# @param [String] url
|
6
|
+
# @param [Integer] delay
|
7
|
+
def get_last_url(url, delay=nil)
|
8
|
+
Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
|
9
|
+
Selenium::WebDriver.logger.level = :info
|
10
|
+
options = Selenium::WebDriver::Chrome::Options.new(args: [
|
11
|
+
'headless',
|
12
|
+
'no-sandbox',
|
13
|
+
'disable-gpu',
|
14
|
+
'start-maximized',
|
15
|
+
'window-size=1920,1080'
|
16
|
+
])
|
17
|
+
driver = Selenium::WebDriver.for(:chrome, options: options)
|
18
|
+
driver.manage.timeouts.implicit_wait = 10
|
19
|
+
Selenium::WebDriver::Wait.new(timeout: 10)
|
20
|
+
driver.get(url)
|
21
|
+
if delay.is_a?(Integer)
|
22
|
+
sleep delay
|
23
|
+
end
|
24
|
+
last_url = driver.current_url
|
25
|
+
driver.quit
|
26
|
+
last_url
|
22
27
|
end
|
23
|
-
last_url = driver.current_url
|
24
|
-
driver.quit
|
25
|
-
last_url
|
26
28
|
end
|
27
29
|
end
|
28
30
|
end
|
data/lib/web_stat/config/web_stat.yml
CHANGED
@@ -1,14 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
1
|
+
development: &development
|
2
|
+
# Minimum number of characters to detect meta title
|
3
|
+
min_length_of_meta_title: 10
|
4
|
+
# Split regular expression for titles
|
5
|
+
regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
|
6
|
+
# User Agent
|
7
|
+
user_agent: "web_stat gem agent"
|
8
|
+
# Eyecatch image xpaths
|
9
|
+
eyecatch_image_xpaths:
|
10
|
+
- '/html/head/meta[@property="twitter:image"]/@content'
|
11
|
+
- '/html/head/meta[@property="og:image"]/@content'
|
12
|
+
- '//img[@class="attachment-post-thumbnail"]/@src'
|
13
|
+
- '//div[@id="content"]//img/@src'
|
14
|
+
- '//img/@src'
|
15
|
+
userdic: ""
|
16
|
+
use_chromedirver: false
|
17
|
+
test:
|
18
|
+
<<: *development
|
19
|
+
production:
|
20
|
+
<<: *development
|
21
|
+
use_chromedirver: true
|
data/lib/web_stat/configure.rb
CHANGED
@@ -3,31 +3,37 @@ module WebStat
|
|
3
3
|
class Configure
|
4
4
|
DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
self.get_custom_configure_path
|
15
|
-
else
|
16
|
-
self.get_default_configure_path
|
6
|
+
class << self
|
7
|
+
# Get yaml
|
8
|
+
def get
|
9
|
+
if defined? Rails
|
10
|
+
YAML.load_file(get_configure_path)[Rails.env]
|
11
|
+
else
|
12
|
+
YAML.load_file(get_configure_path)["production"]
|
13
|
+
end
|
17
14
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
File.join(
|
15
|
+
|
16
|
+
# Get configure path
|
17
|
+
def get_configure_path
|
18
|
+
if File.exists?(get_custom_configure_path)
|
19
|
+
get_custom_configure_path
|
20
|
+
else
|
21
|
+
get_default_configure_path
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Get default configure path
|
26
|
+
def get_default_configure_path
|
27
|
+
File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Get custom configure path
|
31
|
+
def get_custom_configure_path
|
32
|
+
if defined? Rails
|
33
|
+
File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
|
34
|
+
else
|
35
|
+
File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
|
36
|
+
end
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
data/lib/web_stat/fetch.rb
CHANGED
@@ -57,7 +57,7 @@ module WebStat
|
|
57
57
|
# Get local path to save url
|
58
58
|
# @param [String] url
|
59
59
|
def save_local_path(url)
|
60
|
-
return nil if url.nil?
|
60
|
+
return nil if url.nil? || ! url.match(%{^http})
|
61
61
|
tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
|
62
62
|
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
63
63
|
image = agent.get(url)
|
@@ -122,7 +122,7 @@ module WebStat
|
|
122
122
|
# Get original url
|
123
123
|
# @param [String] url
|
124
124
|
def original_url(url)
|
125
|
-
last_url = FinalRedirectUrl.final_redirect_url(url)
|
125
|
+
last_url = WebStat::FinalRedirectUrl.final_redirect_url(url)
|
126
126
|
unless last_url.nil? || last_url.scrub('').empty?
|
127
127
|
last_url
|
128
128
|
else
|
data/lib/web_stat/final_redirect_url.rb
CHANGED
@@ -1,50 +1,54 @@
|
|
1
1
|
# ref) https://github.com/indyarocks/final_redirect_url
|
2
2
|
# customize
|
3
3
|
# Changed
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
4
|
+
module WebStat
|
5
|
+
class FinalRedirectUrl
|
6
|
+
class << self
|
7
|
+
def final_redirect_url(url, options={})
|
8
|
+
final_url = ''
|
9
|
+
if is_valid_url?(url)
|
10
|
+
begin
|
11
|
+
redirect_lookup_depth = options[:depth].to_i > 0 ? options[:depth].to_i : 10
|
12
|
+
response_uri = get_final_redirect_url(url, redirect_lookup_depth)
|
13
|
+
final_url = url_string_from_uri(response_uri)
|
14
|
+
rescue Exception => ex
|
15
|
+
# nothing
|
16
|
+
end
|
17
|
+
end
|
18
|
+
final_url
|
16
19
|
end
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
20
|
+
|
21
|
+
private
|
22
|
+
def is_valid_url?(url)
|
23
|
+
url.to_s.match? URI::regexp(['http', 'https'])
|
24
|
+
end
|
25
|
+
def get_final_redirect_url(url, limit = 10)
|
26
|
+
return url if limit <= 0
|
27
|
+
uri = URI.parse(url)
|
28
|
+
response = ::Net::HTTP.get_response(uri)
|
29
|
+
if response.class == Net::HTTPOK
|
30
|
+
if WebStat::Configure.get["use_chromedirver"]
|
31
|
+
return URI.parse(WebStat::WebDriverHelper.get_last_url(uri))
|
32
|
+
else
|
33
|
+
return URI.parse(uri)
|
34
|
+
end
|
35
|
+
else
|
36
|
+
redirect_location = response['location']
|
37
|
+
location_uri = URI.parse(redirect_location)
|
38
|
+
if location_uri.host.nil?
|
39
|
+
redirect_location = uri.scheme + '://' + uri.host + redirect_location
|
40
|
+
end
|
41
|
+
warn "redirected to #{redirect_location}"
|
42
|
+
get_final_redirect_url(redirect_location, limit - 1)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
def url_string_from_uri(uri)
|
46
|
+
url_str = "#{uri.scheme}://#{uri.host}#{uri.request_uri}"
|
47
|
+
if uri.fragment
|
48
|
+
url_str = url_str + "##{uri.fragment}"
|
49
|
+
end
|
50
|
+
url_str
|
37
51
|
end
|
38
|
-
warn "redirected to #{redirect_location}"
|
39
|
-
get_final_redirect_url(redirect_location, limit - 1)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def self.url_string_from_uri(uri)
|
44
|
-
url_str = "#{uri.scheme}://#{uri.host}#{uri.request_uri}"
|
45
|
-
if uri.fragment
|
46
|
-
url_str = url_str + "##{uri.fragment}"
|
47
52
|
end
|
48
|
-
url_str
|
49
53
|
end
|
50
|
-
end
|
54
|
+
end
|
data/lib/web_stat/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|