web_stat 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/helpers/web_drive_helper.rb +26 -24
- data/lib/web_stat/config/web_stat.yml +21 -14
- data/lib/web_stat/configure.rb +30 -24
- data/lib/web_stat/fetch.rb +2 -2
- data/lib/web_stat/final_redirect_url.rb +47 -43
- data/lib/web_stat/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b35413482a93f316bbd3d99a037f0e97102544a4e04604b5add670ac1a1500a
|
4
|
+
data.tar.gz: 4517b7754b2096901b005c26497fa2addad572b02f08a7aed385f6dc7de2e55a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a54c666953b0c51e1e5ea8d230069bf608e6284629740070f77f3816468553037852c0cbc6c39c35cd53000435fbced4acee6a7b0ca855f2c241e0fb769d32da
|
7
|
+
data.tar.gz: 3fef9c1c48f272e27c877a4a588ce45f841ad316429bb78fc3aa6de5748d6471b9e5aaed2ee076af0d3fa0779b5696c452c1b0fc2f77644c7914b171377def8e
|
data/Gemfile.lock
CHANGED
data/lib/helpers/web_drive_helper.rb
CHANGED
@@ -1,28 +1,30 @@
|
|
1
|
-
|
2
|
-
class
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
1
|
+
module WebStat
|
2
|
+
class WebDriverHelper
|
3
|
+
class << self
|
4
|
+
# Get last url
|
5
|
+
# @param [String] url
|
6
|
+
# @param [Integer] delay
|
7
|
+
def get_last_url(url, delay=nil)
|
8
|
+
Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
|
9
|
+
Selenium::WebDriver.logger.level = :info
|
10
|
+
options = Selenium::WebDriver::Chrome::Options.new(args: [
|
11
|
+
'headless',
|
12
|
+
'no-sandbox',
|
13
|
+
'disable-gpu',
|
14
|
+
'start-maximized',
|
15
|
+
'window-size=1920,1080'
|
16
|
+
])
|
17
|
+
driver = Selenium::WebDriver.for(:chrome, options: options)
|
18
|
+
driver.manage.timeouts.implicit_wait = 10
|
19
|
+
Selenium::WebDriver::Wait.new(timeout: 10)
|
20
|
+
driver.get(url)
|
21
|
+
if delay.is_a?(Integer)
|
22
|
+
sleep delay
|
23
|
+
end
|
24
|
+
last_url = driver.current_url
|
25
|
+
driver.quit
|
26
|
+
last_url
|
22
27
|
end
|
23
|
-
last_url = driver.current_url
|
24
|
-
driver.quit
|
25
|
-
last_url
|
26
28
|
end
|
27
29
|
end
|
28
30
|
end
|
data/lib/web_stat/config/web_stat.yml
CHANGED
@@ -1,14 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
1
|
+
development: &development
|
2
|
+
# Minimum number of characters to detect meta title
|
3
|
+
min_length_of_meta_title: 10
|
4
|
+
# Split regular expression for titles
|
5
|
+
regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
|
6
|
+
# User Agent
|
7
|
+
user_agent: "web_stat gem agent"
|
8
|
+
# Eyecatch image xpaths
|
9
|
+
eyecatch_image_xpaths:
|
10
|
+
- '/html/head/meta[@property="twitter:image"]/@content'
|
11
|
+
- '/html/head/meta[@property="og:image"]/@content'
|
12
|
+
- '//img[@class="attachment-post-thumbnail"]/@src'
|
13
|
+
- '//div[@id="content"]//img/@src'
|
14
|
+
- '//img/@src'
|
15
|
+
userdic: ""
|
16
|
+
use_chromedirver: false
|
17
|
+
test:
|
18
|
+
<<: *development
|
19
|
+
production:
|
20
|
+
<<: *development
|
21
|
+
use_chromedirver: true
|
data/lib/web_stat/configure.rb
CHANGED
@@ -3,31 +3,37 @@ module WebStat
|
|
3
3
|
class Configure
|
4
4
|
DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
self.get_custom_configure_path
|
15
|
-
else
|
16
|
-
self.get_default_configure_path
|
6
|
+
class << self
|
7
|
+
# Get yaml
|
8
|
+
def get
|
9
|
+
if defined? Rails
|
10
|
+
YAML.load_file(get_configure_path)[Rails.env]
|
11
|
+
else
|
12
|
+
YAML.load_file(get_configure_path)["production"]
|
13
|
+
end
|
17
14
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
File.join(
|
15
|
+
|
16
|
+
# Get configure path
|
17
|
+
def get_configure_path
|
18
|
+
if File.exists?(get_custom_configure_path)
|
19
|
+
get_custom_configure_path
|
20
|
+
else
|
21
|
+
get_default_configure_path
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Get default configure path
|
26
|
+
def get_default_configure_path
|
27
|
+
File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Get custom configure path
|
31
|
+
def get_custom_configure_path
|
32
|
+
if defined? Rails
|
33
|
+
File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
|
34
|
+
else
|
35
|
+
File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
|
36
|
+
end
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
data/lib/web_stat/fetch.rb
CHANGED
@@ -57,7 +57,7 @@ module WebStat
|
|
57
57
|
# Get local path to save url
|
58
58
|
# @param [String] url
|
59
59
|
def save_local_path(url)
|
60
|
-
return nil if url.nil?
|
60
|
+
return nil if url.nil? || ! url.match(%{^http})
|
61
61
|
tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
|
62
62
|
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
63
63
|
image = agent.get(url)
|
@@ -122,7 +122,7 @@ module WebStat
|
|
122
122
|
# Get original url
|
123
123
|
# @param [String] url
|
124
124
|
def original_url(url)
|
125
|
-
last_url = FinalRedirectUrl.final_redirect_url(url)
|
125
|
+
last_url = WebStat::FinalRedirectUrl.final_redirect_url(url)
|
126
126
|
unless last_url.nil? || last_url.scrub('').empty?
|
127
127
|
last_url
|
128
128
|
else
|
data/lib/web_stat/final_redirect_url.rb
CHANGED
@@ -1,50 +1,54 @@
|
|
1
1
|
# ref) https://github.com/indyarocks/final_redirect_url
|
2
2
|
# customize
|
3
3
|
# Changed
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
4
|
+
module WebStat
|
5
|
+
class FinalRedirectUrl
|
6
|
+
class << self
|
7
|
+
def final_redirect_url(url, options={})
|
8
|
+
final_url = ''
|
9
|
+
if is_valid_url?(url)
|
10
|
+
begin
|
11
|
+
redirect_lookup_depth = options[:depth].to_i > 0 ? options[:depth].to_i : 10
|
12
|
+
response_uri = get_final_redirect_url(url, redirect_lookup_depth)
|
13
|
+
final_url = url_string_from_uri(response_uri)
|
14
|
+
rescue Exception => ex
|
15
|
+
# nothing
|
16
|
+
end
|
17
|
+
end
|
18
|
+
final_url
|
16
19
|
end
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
20
|
+
|
21
|
+
private
|
22
|
+
def is_valid_url?(url)
|
23
|
+
url.to_s.match? URI::regexp(['http', 'https'])
|
24
|
+
end
|
25
|
+
def get_final_redirect_url(url, limit = 10)
|
26
|
+
return url if limit <= 0
|
27
|
+
uri = URI.parse(url)
|
28
|
+
response = ::Net::HTTP.get_response(uri)
|
29
|
+
if response.class == Net::HTTPOK
|
30
|
+
if WebStat::Configure.get["use_chromedirver"]
|
31
|
+
return URI.parse(WebStat::WebDriverHelper.get_last_url(uri))
|
32
|
+
else
|
33
|
+
return URI.parse(uri)
|
34
|
+
end
|
35
|
+
else
|
36
|
+
redirect_location = response['location']
|
37
|
+
location_uri = URI.parse(redirect_location)
|
38
|
+
if location_uri.host.nil?
|
39
|
+
redirect_location = uri.scheme + '://' + uri.host + redirect_location
|
40
|
+
end
|
41
|
+
warn "redirected to #{redirect_location}"
|
42
|
+
get_final_redirect_url(redirect_location, limit - 1)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
def url_string_from_uri(uri)
|
46
|
+
url_str = "#{uri.scheme}://#{uri.host}#{uri.request_uri}"
|
47
|
+
if uri.fragment
|
48
|
+
url_str = url_str + "##{uri.fragment}"
|
49
|
+
end
|
50
|
+
url_str
|
37
51
|
end
|
38
|
-
warn "redirected to #{redirect_location}"
|
39
|
-
get_final_redirect_url(redirect_location, limit - 1)
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def self.url_string_from_uri(uri)
|
44
|
-
url_str = "#{uri.scheme}://#{uri.host}#{uri.request_uri}"
|
45
|
-
if uri.fragment
|
46
|
-
url_str = url_str + "##{uri.fragment}"
|
47
52
|
end
|
48
|
-
url_str
|
49
53
|
end
|
50
|
-
end
|
54
|
+
end
|
data/lib/web_stat/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|