web_stat 0.3.4 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/lib/helpers/web_drive_helper.rb +1 -3
- data/lib/web_stat/fetch.rb +20 -9
- data/lib/web_stat/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ff258b9a50e224dfb465ad698871b2dc95dc2aaf028f5e966b0b920ca127f5f6
|
4
|
+
data.tar.gz: 0fe37dff66de85ac77137f95e94ef2e3a2e42ae1931b3b138017481c3cda9734
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c2158946ad78a192e902e6b74b9fd72ecfc001afe2c3b531c2e8cea9d7b37239d280174066329e3f908d843c0173ddc03181bf8af529870326f29bd19bf5ee06
|
7
|
+
data.tar.gz: eea91e8addd528e53989c94385c9d339895d138fea4fa911383b6c8157ade28a9c1c75566f2f40ca40d0f5c3b73267188b3695a573bc155a14150b22e3bebb47
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
web_stat (0.3.
|
4
|
+
web_stat (0.3.7)
|
5
5
|
bundler (>= 2.0.2)
|
6
6
|
cld (>= 0.8.0)
|
7
7
|
mechanize (>= 2.7)
|
@@ -83,7 +83,7 @@ GEM
|
|
83
83
|
nokogiri (>= 1.6.0)
|
84
84
|
rubyzip (2.3.0)
|
85
85
|
safe_yaml (1.0.5)
|
86
|
-
sanitize (5.2.
|
86
|
+
sanitize (5.2.1)
|
87
87
|
crass (~> 1.0.2)
|
88
88
|
nokogiri (>= 1.8.0)
|
89
89
|
nokogumbo (~> 2.0)
|
@@ -10,9 +10,7 @@ module WebStat
|
|
10
10
|
options = Selenium::WebDriver::Chrome::Options.new(args: [
|
11
11
|
'headless',
|
12
12
|
'no-sandbox',
|
13
|
-
'disable-gpu'
|
14
|
-
'start-maximized',
|
15
|
-
'window-size=1920,1080'
|
13
|
+
'disable-gpu'
|
16
14
|
])
|
17
15
|
driver = Selenium::WebDriver.for(:chrome, options: options)
|
18
16
|
driver.manage.timeouts.implicit_wait = 10
|
data/lib/web_stat/fetch.rb
CHANGED
@@ -47,6 +47,9 @@ module WebStat
|
|
47
47
|
break
|
48
48
|
end
|
49
49
|
end
|
50
|
+
if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
|
51
|
+
path = @nokogiri.at('body').xpath('//img').first.attr('src')
|
52
|
+
end
|
50
53
|
if ! path.nil? && path.match(/^\//)
|
51
54
|
"#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
|
52
55
|
else
|
@@ -57,7 +60,7 @@ module WebStat
|
|
57
60
|
# Get local path to save url
|
58
61
|
# @param [String] url
|
59
62
|
def save_local_path(url)
|
60
|
-
return nil if url.nil? || url.
|
63
|
+
return nil if url.nil? || ! url.match(%{^http})
|
61
64
|
tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
|
62
65
|
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
63
66
|
image = agent.get(url)
|
@@ -75,17 +78,25 @@ module WebStat
|
|
75
78
|
# @param [String] url
|
76
79
|
# @param [String] body
|
77
80
|
def get_url(url)
|
78
|
-
|
81
|
+
mech = Mechanize.new { |_mech| _mech.user_agent = WebStat::Configure.get["user_agent"] }
|
79
82
|
# Enable to read Robots.txt
|
80
|
-
|
83
|
+
mech.robots = true
|
81
84
|
begin
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
+
if mech.agent.robots_disallowed?(url)
|
86
|
+
raise Mechanize::RobotsDisallowedError.new(url)
|
87
|
+
end
|
88
|
+
if WebStat::Configure.get["use_chromedirver"]
|
89
|
+
document = WebStat::WebDriverHelper.get_last_url(url)
|
90
|
+
@status = 200
|
85
91
|
else
|
86
|
-
|
92
|
+
document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
|
93
|
+
if document.class == Mechanize::File
|
94
|
+
body = document.body
|
95
|
+
else
|
96
|
+
body = document.body.encode('UTF-8', document.encoding)
|
97
|
+
end
|
98
|
+
@status = document.code
|
87
99
|
end
|
88
|
-
@status = document.code
|
89
100
|
rescue Mechanize::ResponseCodeError => e
|
90
101
|
body = e.page.body
|
91
102
|
@status = e.page.code
|
@@ -96,7 +107,7 @@ module WebStat
|
|
96
107
|
# Get the informations of @url
|
97
108
|
# @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
|
98
109
|
def stat(userdics: nil)
|
99
|
-
clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
|
110
|
+
clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
|
100
111
|
language_code = CLD.detect_language(clean_content)[:code]
|
101
112
|
if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
|
102
113
|
tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
|
data/lib/web_stat/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|