web_stat 0.2.6 → 0.2.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +23 -23
- data/README.md +11 -0
- data/lib/web_stat.rb +8 -6
- data/lib/web_stat/fetch.rb +49 -15
- data/lib/web_stat/version.rb +2 -2
- data/spec/spec_helper.rb +2 -1
- data/spec/web_stat/fetch_spec.rb +1 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bcfaeb202076ea30cae6205877d0b0ad9060eb84116b84ef7bd3580cc4349aac
|
4
|
+
data.tar.gz: 98e586440c8f3aed29e38a003f6d1afc96b26b2ae4c21e43d47e1dfaf6aaa16a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dfbe6264256f08550ebb42244d92bd81c976dd5dd72a6a4dabffdf6a1366a8f010e8a20527d0bbe7a8334cdb028e86ecae0c63d6cc4368741abe815f1fcb3092
|
7
|
+
data.tar.gz: 9098d904f26dfdfe14c87352cb47ca3f0333f5424ef26d2cd8232c088c2cf8d7dcbeb07012a9294a5c4e4949b51fe71e94d0995465400d87cf7c037fd09ba978
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
GIT
|
2
|
-
remote: git@github.com:yubele/final_redirect_url
|
3
|
-
revision:
|
2
|
+
remote: git@github.com:yubele/final_redirect_url
|
3
|
+
revision: 45df878ec9495ebbfa06dc0a60cc5043c2519e16
|
4
4
|
specs:
|
5
|
-
final_redirect_url (0.1.
|
5
|
+
final_redirect_url (0.1.1)
|
6
6
|
|
7
7
|
PATH
|
8
8
|
remote: .
|
9
9
|
specs:
|
10
|
-
web_stat (0.2.
|
10
|
+
web_stat (0.2.11)
|
11
11
|
bundler (>= 2.0.2)
|
12
12
|
cld (>= 0.8.0)
|
13
13
|
mechanize (>= 2.7)
|
@@ -27,13 +27,13 @@ GEM
|
|
27
27
|
connection_pool (2.2.2)
|
28
28
|
crack (0.4.3)
|
29
29
|
safe_yaml (~> 1.0.0)
|
30
|
-
crass (1.0.
|
30
|
+
crass (1.0.6)
|
31
31
|
diff-lcs (1.3)
|
32
32
|
domain_name (0.5.20190701)
|
33
33
|
unf (>= 0.0.5, < 1.0.0)
|
34
|
-
ffi (1.
|
34
|
+
ffi (1.12.2)
|
35
35
|
guess_html_encoding (0.0.11)
|
36
|
-
hashdiff (1.0.
|
36
|
+
hashdiff (1.0.1)
|
37
37
|
http-cookie (1.0.3)
|
38
38
|
domain_name (~> 0.5)
|
39
39
|
mechanize (2.7.6)
|
@@ -45,39 +45,39 @@ GEM
|
|
45
45
|
nokogiri (~> 1.6)
|
46
46
|
ntlm-http (~> 0.1, >= 0.1.1)
|
47
47
|
webrobots (>= 0.0.9, < 0.2)
|
48
|
-
method_source (0.
|
48
|
+
method_source (1.0.0)
|
49
49
|
mime-types (3.3.1)
|
50
50
|
mime-types-data (~> 3.2015)
|
51
|
-
mime-types-data (3.
|
51
|
+
mime-types-data (3.2020.0425)
|
52
52
|
mini_portile2 (2.4.0)
|
53
|
-
natto (1.
|
53
|
+
natto (1.2.0)
|
54
54
|
ffi (>= 1.9.0)
|
55
55
|
net-http-digest_auth (1.4.1)
|
56
|
-
net-http-persistent (
|
56
|
+
net-http-persistent (4.0.0)
|
57
57
|
connection_pool (~> 2.2)
|
58
|
-
nokogiri (1.10.
|
58
|
+
nokogiri (1.10.9)
|
59
59
|
mini_portile2 (~> 2.4.0)
|
60
60
|
nokogumbo (2.0.2)
|
61
61
|
nokogiri (~> 1.8, >= 1.8.4)
|
62
62
|
ntlm-http (0.1.1)
|
63
|
-
pry (0.
|
64
|
-
coderay (~> 1.1
|
65
|
-
method_source (~>
|
66
|
-
public_suffix (4.0.
|
63
|
+
pry (0.13.1)
|
64
|
+
coderay (~> 1.1)
|
65
|
+
method_source (~> 1.0)
|
66
|
+
public_suffix (4.0.4)
|
67
67
|
rake (13.0.1)
|
68
68
|
rspec (3.9.0)
|
69
69
|
rspec-core (~> 3.9.0)
|
70
70
|
rspec-expectations (~> 3.9.0)
|
71
71
|
rspec-mocks (~> 3.9.0)
|
72
|
-
rspec-core (3.9.
|
73
|
-
rspec-support (~> 3.9.
|
74
|
-
rspec-expectations (3.9.
|
72
|
+
rspec-core (3.9.2)
|
73
|
+
rspec-support (~> 3.9.3)
|
74
|
+
rspec-expectations (3.9.1)
|
75
75
|
diff-lcs (>= 1.2.0, < 2.0)
|
76
76
|
rspec-support (~> 3.9.0)
|
77
77
|
rspec-mocks (3.9.1)
|
78
78
|
diff-lcs (>= 1.2.0, < 2.0)
|
79
79
|
rspec-support (~> 3.9.0)
|
80
|
-
rspec-support (3.9.
|
80
|
+
rspec-support (3.9.3)
|
81
81
|
ruby-readability (0.7.0)
|
82
82
|
guess_html_encoding (>= 0.0.4)
|
83
83
|
nokogiri (>= 1.6.0)
|
@@ -88,8 +88,8 @@ GEM
|
|
88
88
|
nokogumbo (~> 2.0)
|
89
89
|
unf (0.1.4)
|
90
90
|
unf_ext
|
91
|
-
unf_ext (0.0.7.
|
92
|
-
webmock (3.
|
91
|
+
unf_ext (0.0.7.7)
|
92
|
+
webmock (3.8.3)
|
93
93
|
addressable (>= 2.3.6)
|
94
94
|
crack (>= 0.3.2)
|
95
95
|
hashdiff (>= 0.4.0, < 2.0.0)
|
@@ -107,4 +107,4 @@ DEPENDENCIES
|
|
107
107
|
webmock (>= 3.6.0)
|
108
108
|
|
109
109
|
BUNDLED WITH
|
110
|
-
2.1.
|
110
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
# !!!!! Precautions when using with Rails !!!!!
|
2
|
+
|
3
|
+
Add this line to your Gemfile.
|
4
|
+
```
|
5
|
+
gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
|
6
|
+
```
|
7
|
+
|
1
8
|
# WebStat
|
2
9
|
|
3
10
|
Fetch the web pages and stat.
|
@@ -18,6 +25,10 @@ Fetch the web pages and stat.
|
|
18
25
|
- "rake", "~> 10.0"
|
19
26
|
- "rspec", "~> 3.0"
|
20
27
|
|
28
|
+
### Install mecab
|
29
|
+
|
30
|
+
$ sudo apt install mecab-ipadic-utf8 libmecab
|
31
|
+
|
21
32
|
## Installation
|
22
33
|
|
23
34
|
Add this line to your application's Gemfile:
|
data/lib/web_stat.rb
CHANGED
@@ -21,24 +21,26 @@ require "web_stat/fetch/fetch_as_web"
|
|
21
21
|
module WebStat
|
22
22
|
class << self
|
23
23
|
# Get web page's stat by url
|
24
|
-
|
24
|
+
# @param [Hash] userdics Dictionary path for each language code (string keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
25
|
+
def stat_by_web(url, userdics: nil)
|
25
26
|
web_stat = WebStat::FetchAsWeb.new(url)
|
26
|
-
web_stat.stat
|
27
|
+
web_stat.stat(userdics: userdics)
|
27
28
|
end
|
28
29
|
|
29
30
|
# Get web page's stat by url
|
30
31
|
# @param String url
|
31
|
-
def stat_by_url(url)
|
32
|
-
stat_by_web(url)
|
32
|
+
def stat_by_url(url, userdics: nil)
|
33
|
+
stat_by_web(url, userdics: userdics)
|
33
34
|
end
|
34
35
|
|
35
36
|
# Get web page's stat by html
|
36
37
|
# @param String html
|
37
38
|
# @param [String] url
|
38
|
-
|
39
|
+
# @param [Hash] userdics Dictionary path for each language code (string keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
40
|
+
def stat_by_html(html, url=nil, userdics: nil)
|
39
41
|
web_stat = WebStat::FetchAsHtml.new(html)
|
40
42
|
web_stat.url = url unless url.nil?
|
41
|
-
web_stat.stat
|
43
|
+
web_stat.stat(userdics: userdics)
|
42
44
|
end
|
43
45
|
end
|
44
46
|
end
|
data/lib/web_stat/fetch.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module WebStat
|
2
2
|
class Fetch
|
3
|
-
attr_accessor :url, :html, :nokogiri, :userdic
|
3
|
+
attr_accessor :url, :html, :nokogiri, :userdic, :status
|
4
4
|
|
5
5
|
# Get title
|
6
6
|
# @return [String] title
|
@@ -13,7 +13,11 @@ module WebStat
|
|
13
13
|
rescue
|
14
14
|
title = @nokogiri.title
|
15
15
|
end
|
16
|
-
title.
|
16
|
+
if title.nil?
|
17
|
+
"No Title"
|
18
|
+
else
|
19
|
+
title.strip
|
20
|
+
end
|
17
21
|
end
|
18
22
|
|
19
23
|
# Get name of domain
|
@@ -23,7 +27,11 @@ module WebStat
|
|
23
27
|
rescue
|
24
28
|
site_name = @nokogiri.title
|
25
29
|
end
|
26
|
-
site_name.
|
30
|
+
if site_name.nil?
|
31
|
+
"No Sitename"
|
32
|
+
else
|
33
|
+
site_name.strip
|
34
|
+
end
|
27
35
|
end
|
28
36
|
[]
|
29
37
|
# Get main section
|
@@ -41,7 +49,7 @@ module WebStat
|
|
41
49
|
break
|
42
50
|
end
|
43
51
|
end
|
44
|
-
if path.match(/^\//)
|
52
|
+
if ! path.nil? && path.match(/^\//)
|
45
53
|
"#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
|
46
54
|
else
|
47
55
|
path
|
@@ -53,9 +61,13 @@ module WebStat
|
|
53
61
|
def save_local_path(url)
|
54
62
|
return nil if url.nil?
|
55
63
|
tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
|
56
|
-
|
57
|
-
|
58
|
-
|
64
|
+
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
65
|
+
image = agent.get(url)
|
66
|
+
File.open(tmp_file, "w+b") do |_file|
|
67
|
+
if image.class == Mechanize::File
|
68
|
+
_file.puts(image.body)
|
69
|
+
else
|
70
|
+
_file.puts(image.body_io.read)
|
59
71
|
end
|
60
72
|
end
|
61
73
|
tmp_file
|
@@ -63,23 +75,44 @@ module WebStat
|
|
63
75
|
|
64
76
|
# Get url
|
65
77
|
# @param [String] url
|
78
|
+
# @return [String] body
|
66
79
|
def get_url(url)
|
67
80
|
agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
|
68
81
|
# Enable to read Robots.txt
|
69
82
|
agent.robots = true
|
70
|
-
|
71
|
-
|
83
|
+
begin
|
84
|
+
document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
|
85
|
+
if document.class == Mechanize::File
|
86
|
+
body = document.body
|
87
|
+
else
|
88
|
+
body = document.body.encode('UTF-8', document.encoding)
|
89
|
+
end
|
90
|
+
@status = document.code
|
91
|
+
rescue Mechanize::ResponseCodeError => e
|
92
|
+
body = e.page.body
|
93
|
+
@status = e.page.code
|
94
|
+
end
|
95
|
+
body
|
72
96
|
end
|
73
97
|
|
74
98
|
# Get the information of @url
|
75
|
-
|
76
|
-
|
77
|
-
|
99
|
+
# @param [Hash] userdics Dictionary path for each language code (string keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
|
100
|
+
def stat(userdics: nil)
|
101
|
+
clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
|
102
|
+
language_code = CLD.detect_language(clean_content)[:code]
|
103
|
+
if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
|
104
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
|
105
|
+
elsif userdics && userdics.has_key?("other") && File.exists?(userdics["other"])
|
106
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics["other"])
|
107
|
+
else
|
108
|
+
tag = WebStat::Tag.new("#{title} #{content}", userdic: WebStat::Configure.get["userdic"])
|
109
|
+
end
|
78
110
|
{
|
79
111
|
title: title,
|
80
112
|
site_name: site_name,
|
81
113
|
content: clean_content,
|
82
|
-
language_code:
|
114
|
+
language_code: language_code,
|
115
|
+
status: @status,
|
83
116
|
url: @url,
|
84
117
|
eyecatch_image_path: save_local_path(eyecatch_image_path),
|
85
118
|
tags: tag.nouns
|
@@ -91,8 +124,9 @@ module WebStat
|
|
91
124
|
# Get original url
|
92
125
|
# @param [String] url
|
93
126
|
def original_url(url)
|
94
|
-
|
95
|
-
|
127
|
+
last_url = FinalRedirectUrl.final_redirect_url(url)
|
128
|
+
unless last_url.nil? || last_url.scrub('').empty?
|
129
|
+
last_url
|
96
130
|
else
|
97
131
|
url
|
98
132
|
end
|
data/lib/web_stat/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module WebStat
|
2
|
-
VERSION = "0.2.
|
3
|
-
end
|
2
|
+
VERSION = "0.2.11"
|
3
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -59,9 +59,10 @@ end
|
|
59
59
|
|
60
60
|
# Set webmock
|
61
61
|
WebStatTestHelper.scheme_and_files.each do |url|
|
62
|
+
status = [200, 404, 503].sample
|
62
63
|
WebMock.stub_request(:get, url)
|
63
64
|
.to_return(
|
64
|
-
status:
|
65
|
+
status: status,
|
65
66
|
body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
|
66
67
|
headers: {content_type: 'application/html; charset=utf-8'})
|
67
68
|
end
|
data/spec/web_stat/fetch_spec.rb
CHANGED
@@ -83,6 +83,7 @@ RSpec.describe WebStat::Fetch do
|
|
83
83
|
expect(web_stat[:title]).to eq "gem作成でついまずいたところ"
|
84
84
|
expect(web_stat[:site_name]).to eq "newsdict.blog"
|
85
85
|
expect(web_stat[:content]).not_to eq nil
|
86
|
+
expect(web_stat[:status]).to eq("200").or eq("404").or eq("503")
|
86
87
|
expect(Sanitize.clean(web_stat[:content]).length).to eq web_stat[:content].length
|
87
88
|
expect(web_stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
88
89
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-06-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -224,7 +224,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
224
224
|
- !ruby/object:Gem::Version
|
225
225
|
version: '0'
|
226
226
|
requirements: []
|
227
|
-
rubygems_version: 3.
|
227
|
+
rubygems_version: 3.0.3
|
228
228
|
signing_key:
|
229
229
|
specification_version: 4
|
230
230
|
summary: Get the status of the web pages.
|