web_stat 0.2.6 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +23 -23
- data/README.md +11 -0
- data/lib/web_stat.rb +8 -6
- data/lib/web_stat/fetch.rb +49 -15
- data/lib/web_stat/version.rb +2 -2
- data/spec/spec_helper.rb +2 -1
- data/spec/web_stat/fetch_spec.rb +1 -0
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bcfaeb202076ea30cae6205877d0b0ad9060eb84116b84ef7bd3580cc4349aac
|
|
4
|
+
data.tar.gz: 98e586440c8f3aed29e38a003f6d1afc96b26b2ae4c21e43d47e1dfaf6aaa16a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dfbe6264256f08550ebb42244d92bd81c976dd5dd72a6a4dabffdf6a1366a8f010e8a20527d0bbe7a8334cdb028e86ecae0c63d6cc4368741abe815f1fcb3092
|
|
7
|
+
data.tar.gz: 9098d904f26dfdfe14c87352cb47ca3f0333f5424ef26d2cd8232c088c2cf8d7dcbeb07012a9294a5c4e4949b51fe71e94d0995465400d87cf7c037fd09ba978
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
GIT
|
|
2
|
-
remote: git@github.com:yubele/final_redirect_url
|
|
3
|
-
revision:
|
|
2
|
+
remote: git@github.com:yubele/final_redirect_url
|
|
3
|
+
revision: 45df878ec9495ebbfa06dc0a60cc5043c2519e16
|
|
4
4
|
specs:
|
|
5
|
-
final_redirect_url (0.1.
|
|
5
|
+
final_redirect_url (0.1.1)
|
|
6
6
|
|
|
7
7
|
PATH
|
|
8
8
|
remote: .
|
|
9
9
|
specs:
|
|
10
|
-
web_stat (0.2.
|
|
10
|
+
web_stat (0.2.11)
|
|
11
11
|
bundler (>= 2.0.2)
|
|
12
12
|
cld (>= 0.8.0)
|
|
13
13
|
mechanize (>= 2.7)
|
|
@@ -27,13 +27,13 @@ GEM
|
|
|
27
27
|
connection_pool (2.2.2)
|
|
28
28
|
crack (0.4.3)
|
|
29
29
|
safe_yaml (~> 1.0.0)
|
|
30
|
-
crass (1.0.
|
|
30
|
+
crass (1.0.6)
|
|
31
31
|
diff-lcs (1.3)
|
|
32
32
|
domain_name (0.5.20190701)
|
|
33
33
|
unf (>= 0.0.5, < 1.0.0)
|
|
34
|
-
ffi (1.
|
|
34
|
+
ffi (1.12.2)
|
|
35
35
|
guess_html_encoding (0.0.11)
|
|
36
|
-
hashdiff (1.0.
|
|
36
|
+
hashdiff (1.0.1)
|
|
37
37
|
http-cookie (1.0.3)
|
|
38
38
|
domain_name (~> 0.5)
|
|
39
39
|
mechanize (2.7.6)
|
|
@@ -45,39 +45,39 @@ GEM
|
|
|
45
45
|
nokogiri (~> 1.6)
|
|
46
46
|
ntlm-http (~> 0.1, >= 0.1.1)
|
|
47
47
|
webrobots (>= 0.0.9, < 0.2)
|
|
48
|
-
method_source (0.
|
|
48
|
+
method_source (1.0.0)
|
|
49
49
|
mime-types (3.3.1)
|
|
50
50
|
mime-types-data (~> 3.2015)
|
|
51
|
-
mime-types-data (3.
|
|
51
|
+
mime-types-data (3.2020.0425)
|
|
52
52
|
mini_portile2 (2.4.0)
|
|
53
|
-
natto (1.
|
|
53
|
+
natto (1.2.0)
|
|
54
54
|
ffi (>= 1.9.0)
|
|
55
55
|
net-http-digest_auth (1.4.1)
|
|
56
|
-
net-http-persistent (
|
|
56
|
+
net-http-persistent (4.0.0)
|
|
57
57
|
connection_pool (~> 2.2)
|
|
58
|
-
nokogiri (1.10.
|
|
58
|
+
nokogiri (1.10.9)
|
|
59
59
|
mini_portile2 (~> 2.4.0)
|
|
60
60
|
nokogumbo (2.0.2)
|
|
61
61
|
nokogiri (~> 1.8, >= 1.8.4)
|
|
62
62
|
ntlm-http (0.1.1)
|
|
63
|
-
pry (0.
|
|
64
|
-
coderay (~> 1.1
|
|
65
|
-
method_source (~>
|
|
66
|
-
public_suffix (4.0.
|
|
63
|
+
pry (0.13.1)
|
|
64
|
+
coderay (~> 1.1)
|
|
65
|
+
method_source (~> 1.0)
|
|
66
|
+
public_suffix (4.0.4)
|
|
67
67
|
rake (13.0.1)
|
|
68
68
|
rspec (3.9.0)
|
|
69
69
|
rspec-core (~> 3.9.0)
|
|
70
70
|
rspec-expectations (~> 3.9.0)
|
|
71
71
|
rspec-mocks (~> 3.9.0)
|
|
72
|
-
rspec-core (3.9.
|
|
73
|
-
rspec-support (~> 3.9.
|
|
74
|
-
rspec-expectations (3.9.
|
|
72
|
+
rspec-core (3.9.2)
|
|
73
|
+
rspec-support (~> 3.9.3)
|
|
74
|
+
rspec-expectations (3.9.1)
|
|
75
75
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
76
76
|
rspec-support (~> 3.9.0)
|
|
77
77
|
rspec-mocks (3.9.1)
|
|
78
78
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
79
79
|
rspec-support (~> 3.9.0)
|
|
80
|
-
rspec-support (3.9.
|
|
80
|
+
rspec-support (3.9.3)
|
|
81
81
|
ruby-readability (0.7.0)
|
|
82
82
|
guess_html_encoding (>= 0.0.4)
|
|
83
83
|
nokogiri (>= 1.6.0)
|
|
@@ -88,8 +88,8 @@ GEM
|
|
|
88
88
|
nokogumbo (~> 2.0)
|
|
89
89
|
unf (0.1.4)
|
|
90
90
|
unf_ext
|
|
91
|
-
unf_ext (0.0.7.
|
|
92
|
-
webmock (3.
|
|
91
|
+
unf_ext (0.0.7.7)
|
|
92
|
+
webmock (3.8.3)
|
|
93
93
|
addressable (>= 2.3.6)
|
|
94
94
|
crack (>= 0.3.2)
|
|
95
95
|
hashdiff (>= 0.4.0, < 2.0.0)
|
|
@@ -107,4 +107,4 @@ DEPENDENCIES
|
|
|
107
107
|
webmock (>= 3.6.0)
|
|
108
108
|
|
|
109
109
|
BUNDLED WITH
|
|
110
|
-
2.1.
|
|
110
|
+
2.1.4
|
data/README.md
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
# !!!!! Precautions when using with Rails !!!!!
|
|
2
|
+
|
|
3
|
+
Add this line to your Gemfile.
|
|
4
|
+
```
|
|
5
|
+
gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
|
|
6
|
+
```
|
|
7
|
+
|
|
1
8
|
# WebStat
|
|
2
9
|
|
|
3
10
|
Fetch the web pages and stat.
|
|
@@ -18,6 +25,10 @@ Fetch the web pages and stat.
|
|
|
18
25
|
- "rake", "~> 10.0"
|
|
19
26
|
- "rspec", "~> 3.0"
|
|
20
27
|
|
|
28
|
+
### Install mecab
|
|
29
|
+
|
|
30
|
+
$ sudo apt install mecab-ipadic-utf8 libmecab
|
|
31
|
+
|
|
21
32
|
## Installation
|
|
22
33
|
|
|
23
34
|
Add this line to your application's Gemfile:
|
data/lib/web_stat.rb
CHANGED
|
@@ -21,24 +21,26 @@ require "web_stat/fetch/fetch_as_web"
|
|
|
21
21
|
module WebStat
|
|
22
22
|
class << self
|
|
23
23
|
# Get web page's stat by url
|
|
24
|
-
|
|
24
|
+
# @param [Hash] userdics User dictionary file path for each language code, with an optional "other" fallback. Example: {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
|
25
|
+
def stat_by_web(url, userdics: nil)
|
|
25
26
|
web_stat = WebStat::FetchAsWeb.new(url)
|
|
26
|
-
web_stat.stat
|
|
27
|
+
web_stat.stat(userdics: userdics)
|
|
27
28
|
end
|
|
28
29
|
|
|
29
30
|
# Get web page's stat by url
|
|
30
31
|
# @param [String] url
|
|
31
|
-
def stat_by_url(url)
|
|
32
|
-
stat_by_web(url)
|
|
32
|
+
def stat_by_url(url, userdics: nil)
|
|
33
|
+
stat_by_web(url, userdics: userdics)
|
|
33
34
|
end
|
|
34
35
|
|
|
35
36
|
# Get web page's stat by html
|
|
36
37
|
# @param String html
|
|
37
38
|
# @param [String] url
|
|
38
|
-
|
|
39
|
+
# @param [Hash] userdics User dictionary file path for each language code, with an optional "other" fallback. Example: {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
|
40
|
+
def stat_by_html(html, url=nil, userdics: nil)
|
|
39
41
|
web_stat = WebStat::FetchAsHtml.new(html)
|
|
40
42
|
web_stat.url = url unless url.nil?
|
|
41
|
-
web_stat.stat
|
|
43
|
+
web_stat.stat(userdics: userdics)
|
|
42
44
|
end
|
|
43
45
|
end
|
|
44
46
|
end
|
data/lib/web_stat/fetch.rb
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
module WebStat
|
|
2
2
|
class Fetch
|
|
3
|
-
attr_accessor :url, :html, :nokogiri, :userdic
|
|
3
|
+
attr_accessor :url, :html, :nokogiri, :userdic, :status
|
|
4
4
|
|
|
5
5
|
# Get title
|
|
6
6
|
# @return [String] title
|
|
@@ -13,7 +13,11 @@ module WebStat
|
|
|
13
13
|
rescue
|
|
14
14
|
title = @nokogiri.title
|
|
15
15
|
end
|
|
16
|
-
title.
|
|
16
|
+
if title.nil?
|
|
17
|
+
"No Title"
|
|
18
|
+
else
|
|
19
|
+
title.strip
|
|
20
|
+
end
|
|
17
21
|
end
|
|
18
22
|
|
|
19
23
|
# Get name of domain
|
|
@@ -23,7 +27,11 @@ module WebStat
|
|
|
23
27
|
rescue
|
|
24
28
|
site_name = @nokogiri.title
|
|
25
29
|
end
|
|
26
|
-
site_name.
|
|
30
|
+
if site_name.nil?
|
|
31
|
+
"No Sitename"
|
|
32
|
+
else
|
|
33
|
+
site_name.strip
|
|
34
|
+
end
|
|
27
35
|
end
|
|
28
36
|
[]
|
|
29
37
|
# Get main section
|
|
@@ -41,7 +49,7 @@ module WebStat
|
|
|
41
49
|
break
|
|
42
50
|
end
|
|
43
51
|
end
|
|
44
|
-
if path.match(/^\//)
|
|
52
|
+
if ! path.nil? && path.match(/^\//)
|
|
45
53
|
"#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
|
|
46
54
|
else
|
|
47
55
|
path
|
|
@@ -53,9 +61,13 @@ module WebStat
|
|
|
53
61
|
def save_local_path(url)
|
|
54
62
|
return nil if url.nil?
|
|
55
63
|
tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
64
|
+
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
|
65
|
+
image = agent.get(url)
|
|
66
|
+
File.open(tmp_file, "w+b") do |_file|
|
|
67
|
+
if image.class == Mechanize::File
|
|
68
|
+
_file.puts(image.body)
|
|
69
|
+
else
|
|
70
|
+
_file.puts(image.body_io.read)
|
|
59
71
|
end
|
|
60
72
|
end
|
|
61
73
|
tmp_file
|
|
@@ -63,23 +75,44 @@ module WebStat
|
|
|
63
75
|
|
|
64
76
|
# Get url
|
|
65
77
|
# @param [String] url
|
|
78
|
+
# @return [String] body
|
|
66
79
|
def get_url(url)
|
|
67
80
|
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
|
68
81
|
# Enable to read Robots.txt
|
|
69
82
|
agent.robots = true
|
|
70
|
-
|
|
71
|
-
|
|
83
|
+
begin
|
|
84
|
+
document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
|
|
85
|
+
if document.class == Mechanize::File
|
|
86
|
+
body = document.body
|
|
87
|
+
else
|
|
88
|
+
body = document.body.encode('UTF-8', document.encoding)
|
|
89
|
+
end
|
|
90
|
+
@status = document.code
|
|
91
|
+
rescue Mechanize::ResponseCodeError => e
|
|
92
|
+
body = e.page.body
|
|
93
|
+
@status = e.page.code
|
|
94
|
+
end
|
|
95
|
+
body
|
|
72
96
|
end
|
|
73
97
|
|
|
74
98
|
# Get the information for @url
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
99
|
+
# @param [Hash] userdics User dictionary file path for each language code, with an optional "other" fallback. Example: {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
|
100
|
+
def stat(userdics: nil)
|
|
101
|
+
clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
|
|
102
|
+
language_code = CLD.detect_language(clean_content)[:code]
|
|
103
|
+
if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
|
|
104
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
|
|
105
|
+
elsif userdics && userdics.has_key?("other") && File.exists?(userdics["other"])
|
|
106
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics["other"])
|
|
107
|
+
else
|
|
108
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: WebStat::Configure.get["userdic"])
|
|
109
|
+
end
|
|
78
110
|
{
|
|
79
111
|
title: title,
|
|
80
112
|
site_name: site_name,
|
|
81
113
|
content: clean_content,
|
|
82
|
-
language_code:
|
|
114
|
+
language_code: language_code,
|
|
115
|
+
status: @status,
|
|
83
116
|
url: @url,
|
|
84
117
|
eyecatch_image_path: save_local_path(eyecatch_image_path),
|
|
85
118
|
tags: tag.nouns
|
|
@@ -91,8 +124,9 @@ module WebStat
|
|
|
91
124
|
# Get original url
|
|
92
125
|
# @param [String] url
|
|
93
126
|
def original_url(url)
|
|
94
|
-
|
|
95
|
-
|
|
127
|
+
last_url = FinalRedirectUrl.final_redirect_url(url)
|
|
128
|
+
unless last_url.nil? || last_url.scrub('').empty?
|
|
129
|
+
last_url
|
|
96
130
|
else
|
|
97
131
|
url
|
|
98
132
|
end
|
data/lib/web_stat/version.rb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
module WebStat
|
|
2
|
-
VERSION = "0.2.
|
|
3
|
-
end
|
|
2
|
+
VERSION = "0.2.11"
|
|
3
|
+
end
|
data/spec/spec_helper.rb
CHANGED
|
@@ -59,9 +59,10 @@ end
|
|
|
59
59
|
|
|
60
60
|
# Set webmock
|
|
61
61
|
WebStatTestHelper.scheme_and_files.each do |url|
|
|
62
|
+
status = [200, 404, 503].sample
|
|
62
63
|
WebMock.stub_request(:get, url)
|
|
63
64
|
.to_return(
|
|
64
|
-
status:
|
|
65
|
+
status: status,
|
|
65
66
|
body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
|
|
66
67
|
headers: {content_type: 'application/html; charset=utf-8'})
|
|
67
68
|
end
|
data/spec/web_stat/fetch_spec.rb
CHANGED
|
@@ -83,6 +83,7 @@ RSpec.describe WebStat::Fetch do
|
|
|
83
83
|
expect(web_stat[:title]).to eq "gem作成でついまずいたところ"
|
|
84
84
|
expect(web_stat[:site_name]).to eq "newsdict.blog"
|
|
85
85
|
expect(web_stat[:content]).not_to eq nil
|
|
86
|
+
expect(web_stat[:status]).to eq("200").or eq("404").or eq("503")
|
|
86
87
|
expect(Sanitize.clean(web_stat[:content]).length).to eq web_stat[:content].length
|
|
87
88
|
expect(web_stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
|
88
89
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: web_stat
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- yusuke abe
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-
|
|
11
|
+
date: 2020-06-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -224,7 +224,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
224
224
|
- !ruby/object:Gem::Version
|
|
225
225
|
version: '0'
|
|
226
226
|
requirements: []
|
|
227
|
-
rubygems_version: 3.
|
|
227
|
+
rubygems_version: 3.0.3
|
|
228
228
|
signing_key:
|
|
229
229
|
specification_version: 4
|
|
230
230
|
summary: Get the status of the web pages.
|