web_stat 0.3.17 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Dockerfile +1 -1
- data/Gemfile.lock +15 -13
- data/README.md +0 -23
- data/lib/web_stat/config/web_stat.yml +5 -1
- data/lib/web_stat/fetch.rb +14 -11
- data/lib/web_stat/version.rb +1 -1
- data/spec/web_stat/configure_spec.rb +17 -2
- data/web_stat.gemspec +2 -0
- metadata +31 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 871e9eb97dc238635bd6a46571e86f6a9548c0ca32b6bf3576017ad71c81394f
|
|
4
|
+
data.tar.gz: 7ed7e9750fc2030f486c14f4de9441ba2c2c98a8357565bf708e6d43f7ae620e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 963dca0c991935d084ba40d2ff35de87c55d1584f9d39f3c54c7df0d89aa701ddef21014c1ea6b4a0a83d118b9abd6daf98cd084663992d0a3ae09c9b92e4267
|
|
7
|
+
data.tar.gz: df4c99e70693d004d312219dd2550cd7a3b39e4eb64ac101cd0654a886ee0d6a4d667a82dbdb15574f825aa4cc71fb3d34d9e10f080a977fe026c6f7b7d6006e
|
data/.ruby-version
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
3.0.0
|
data/Dockerfile
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# Define base image, you can use --build-arg
|
|
2
|
-
ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.
|
|
2
|
+
ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
|
|
3
3
|
FROM $base_image
|
|
4
4
|
|
|
5
5
|
# Set locale
|
data/Gemfile.lock
CHANGED
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
web_stat (0.
|
|
4
|
+
web_stat (0.4.2)
|
|
5
5
|
bundler (>= 2.0.2)
|
|
6
6
|
cld (>= 0.8.0)
|
|
7
7
|
mechanize (>= 2.7)
|
|
8
8
|
natto (>= 1.1.2)
|
|
9
9
|
nokogiri (>= 1.10.4)
|
|
10
10
|
pdf-reader (= 2.4.0)
|
|
11
|
+
rexml (>= 3.2.4)
|
|
11
12
|
ruby-readability (>= 0.7)
|
|
12
13
|
sanitize (>= 5.0.0)
|
|
13
14
|
selenium-webdriver (= 3.142.7)
|
|
15
|
+
webrick (>= 1.7.0)
|
|
14
16
|
|
|
15
17
|
GEM
|
|
16
18
|
remote: https://rubygems.org/
|
|
@@ -24,14 +26,13 @@ GEM
|
|
|
24
26
|
cld (0.8.0)
|
|
25
27
|
ffi
|
|
26
28
|
coderay (1.1.3)
|
|
27
|
-
connection_pool (2.2.3)
|
|
28
29
|
crack (0.4.3)
|
|
29
30
|
safe_yaml (~> 1.0.0)
|
|
30
31
|
crass (1.0.6)
|
|
31
32
|
diff-lcs (1.3)
|
|
32
33
|
domain_name (0.5.20190701)
|
|
33
34
|
unf (>= 0.0.5, < 1.0.0)
|
|
34
|
-
ffi (1.
|
|
35
|
+
ffi (1.14.2)
|
|
35
36
|
guess_html_encoding (0.0.11)
|
|
36
37
|
hashdiff (1.0.1)
|
|
37
38
|
hashery (2.1.2)
|
|
@@ -49,16 +50,14 @@ GEM
|
|
|
49
50
|
method_source (1.0.0)
|
|
50
51
|
mime-types (3.3.1)
|
|
51
52
|
mime-types-data (~> 3.2015)
|
|
52
|
-
mime-types-data (3.2020.
|
|
53
|
-
mini_portile2 (2.4.0)
|
|
53
|
+
mime-types-data (3.2020.1104)
|
|
54
54
|
natto (1.2.0)
|
|
55
55
|
ffi (>= 1.9.0)
|
|
56
56
|
net-http-digest_auth (1.4.1)
|
|
57
|
-
net-http-persistent (
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
nokogumbo (2.0.2)
|
|
57
|
+
net-http-persistent (2.9.4)
|
|
58
|
+
nokogiri (1.11.1-x86_64-linux)
|
|
59
|
+
racc (~> 1.4)
|
|
60
|
+
nokogumbo (2.0.4)
|
|
62
61
|
nokogiri (~> 1.8, >= 1.8.4)
|
|
63
62
|
ntlm-http (0.1.1)
|
|
64
63
|
pdf-reader (2.4.0)
|
|
@@ -74,7 +73,9 @@ GEM
|
|
|
74
73
|
byebug (~> 11.0)
|
|
75
74
|
pry (~> 0.13.0)
|
|
76
75
|
public_suffix (4.0.5)
|
|
76
|
+
racc (1.5.2)
|
|
77
77
|
rake (13.0.1)
|
|
78
|
+
rexml (3.2.4)
|
|
78
79
|
rspec (3.9.0)
|
|
79
80
|
rspec-core (~> 3.9.0)
|
|
80
81
|
rspec-expectations (~> 3.9.0)
|
|
@@ -94,14 +95,14 @@ GEM
|
|
|
94
95
|
nokogiri (>= 1.6.0)
|
|
95
96
|
rubyzip (2.3.0)
|
|
96
97
|
safe_yaml (1.0.5)
|
|
97
|
-
sanitize (5.2.
|
|
98
|
+
sanitize (5.2.2)
|
|
98
99
|
crass (~> 1.0.2)
|
|
99
100
|
nokogiri (>= 1.8.0)
|
|
100
101
|
nokogumbo (~> 2.0)
|
|
101
102
|
selenium-webdriver (3.142.7)
|
|
102
103
|
childprocess (>= 0.5, < 4.0)
|
|
103
104
|
rubyzip (>= 1.2.2)
|
|
104
|
-
ttfunk (1.
|
|
105
|
+
ttfunk (1.7.0)
|
|
105
106
|
unf (0.1.4)
|
|
106
107
|
unf_ext
|
|
107
108
|
unf_ext (0.0.7.7)
|
|
@@ -109,6 +110,7 @@ GEM
|
|
|
109
110
|
addressable (>= 2.3.6)
|
|
110
111
|
crack (>= 0.3.2)
|
|
111
112
|
hashdiff (>= 0.4.0, < 2.0.0)
|
|
113
|
+
webrick (1.7.0)
|
|
112
114
|
webrobots (0.1.2)
|
|
113
115
|
|
|
114
116
|
PLATFORMS
|
|
@@ -123,4 +125,4 @@ DEPENDENCIES
|
|
|
123
125
|
webmock (>= 3.8.3)
|
|
124
126
|
|
|
125
127
|
BUNDLED WITH
|
|
126
|
-
2.
|
|
128
|
+
2.2.4
|
data/README.md
CHANGED
|
@@ -1,30 +1,7 @@
|
|
|
1
|
-
# !!!!! Precautions when using with Rails !!!!!
|
|
2
|
-
|
|
3
|
-
Write this line your Gemfile.
|
|
4
|
-
```
|
|
5
|
-
gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
|
|
6
|
-
```
|
|
7
|
-
|
|
8
1
|
# WebStat
|
|
9
2
|
|
|
10
3
|
Fetch the web pages and stat.
|
|
11
4
|
|
|
12
|
-
## Requirements
|
|
13
|
-
|
|
14
|
-
- [MeCab _0.996_](http://taku910.github.io/mecab/#download)
|
|
15
|
-
- add runtime dependency
|
|
16
|
-
- "bundler", "~> 2.0"
|
|
17
|
-
- "nokogiri", "~> 1.10"
|
|
18
|
-
- "mechanize", "~> 2.7"
|
|
19
|
-
- "ruby-readability", "~> 0.7"
|
|
20
|
-
- "final_redirect_url", "~> 0.1.0"
|
|
21
|
-
- "natto", "~> 1.1.2"
|
|
22
|
-
- add development dependency
|
|
23
|
-
- "rake", "~> 10.0"
|
|
24
|
-
- "rspec", "~> 3.0"
|
|
25
|
-
- "rake", "~> 10.0"
|
|
26
|
-
- "rspec", "~> 3.0"
|
|
27
|
-
|
|
28
5
|
### Install mecab
|
|
29
6
|
|
|
30
7
|
$ sudo apt install mecab-ipadic-utf8 libmecab
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
development: &development
|
|
2
2
|
# Minimum number of characters to detect meta title
|
|
3
3
|
min_length_of_meta_title: 10
|
|
4
|
-
# Split regular expression for titles
|
|
4
|
+
# Split regular expression for titles
|
|
5
5
|
regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
|
|
6
6
|
# User Agent
|
|
7
7
|
user_agent: "web_stat gem agent"
|
|
@@ -14,6 +14,10 @@ development: &development
|
|
|
14
14
|
- '//img/@src'
|
|
15
15
|
userdic: ""
|
|
16
16
|
use_chromedirver: false
|
|
17
|
+
thumbnail_regex:
|
|
18
|
+
youtube:
|
|
19
|
+
- '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
|
|
20
|
+
- 'http://img.youtube.com/vi/\1/default.jpg'
|
|
17
21
|
test:
|
|
18
22
|
<<: *development
|
|
19
23
|
production:
|
data/lib/web_stat/fetch.rb
CHANGED
|
@@ -1,11 +1,5 @@
|
|
|
1
1
|
module WebStat
|
|
2
2
|
class Fetch
|
|
3
|
-
THUMBNAIL_REGEXS = {
|
|
4
|
-
:youtube => [
|
|
5
|
-
%r{^https://www.youtube.com/watch\?v=([^&]+)},
|
|
6
|
-
'http://img.youtube.com/vi/\1/default.jpg'
|
|
7
|
-
]
|
|
8
|
-
}
|
|
9
3
|
attr_accessor :url, :html, :nokogiri, :userdic, :status
|
|
10
4
|
# Get title
|
|
11
5
|
# @return [String] title
|
|
@@ -40,7 +34,7 @@ module WebStat
|
|
|
40
34
|
end
|
|
41
35
|
# Get main section
|
|
42
36
|
def content
|
|
43
|
-
Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
|
|
37
|
+
Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
|
|
44
38
|
end
|
|
45
39
|
|
|
46
40
|
# Get temporary path of image
|
|
@@ -54,12 +48,12 @@ module WebStat
|
|
|
54
48
|
end
|
|
55
49
|
end
|
|
56
50
|
# If there is a thumbnail rule, apply it.
|
|
57
|
-
|
|
51
|
+
WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
|
|
58
52
|
if @url.match(v[0])
|
|
59
53
|
return @url.gsub(v[0], v[1])
|
|
60
54
|
end
|
|
61
55
|
end
|
|
62
|
-
readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
|
|
56
|
+
readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
|
|
63
57
|
if (path.nil? || path.empty?) && readability_content.xpath('//img').first
|
|
64
58
|
path = readability_content.xpath('//img').first.attr('src')
|
|
65
59
|
end
|
|
@@ -83,7 +77,7 @@ module WebStat
|
|
|
83
77
|
File.open(tmp_file, "w+b") do |_file|
|
|
84
78
|
if image.class == Mechanize::File
|
|
85
79
|
_file.puts(image.body)
|
|
86
|
-
elsif image.respond_to?(:
|
|
80
|
+
elsif image.respond_to?(:body_io)
|
|
87
81
|
_file.puts(image.body_io.read)
|
|
88
82
|
end
|
|
89
83
|
end
|
|
@@ -102,7 +96,16 @@ module WebStat
|
|
|
102
96
|
raise Mechanize::RobotsDisallowedError.new(url)
|
|
103
97
|
end
|
|
104
98
|
if WebStat::Configure.get["use_chromedirver"]
|
|
105
|
-
|
|
99
|
+
begin
|
|
100
|
+
body = WebStat::WebDriverHelper.get_source(url)
|
|
101
|
+
rescue Selenium::WebDriver::Error::UnknownError => e
|
|
102
|
+
document = mech.agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
|
|
103
|
+
if document.class == Mechanize::File
|
|
104
|
+
body = document.body
|
|
105
|
+
else
|
|
106
|
+
body = document.body.encode('UTF-8', document.encoding)
|
|
107
|
+
end
|
|
108
|
+
end
|
|
106
109
|
@status = 200
|
|
107
110
|
else
|
|
108
111
|
document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
|
data/lib/web_stat/version.rb
CHANGED
|
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
|
|
|
3
3
|
configure = WebStat::Configure.get
|
|
4
4
|
expect(configure).not_to eq nil
|
|
5
5
|
end
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
it "Readable Config" do
|
|
8
8
|
config = WebStat::Configure.get
|
|
9
|
-
|
|
9
|
+
|
|
10
10
|
expect(config["min_length_of_meta_title"]).to eq 10
|
|
11
11
|
expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
|
|
12
12
|
end
|
|
13
|
+
|
|
14
|
+
it "Get thumbnail_regex.youtube." do
|
|
15
|
+
config = WebStat::Configure.get
|
|
16
|
+
expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
|
|
17
|
+
expect(config["thumbnail_regex"]["youtube"].count).to eq 2
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
it "Match youtube url." do
|
|
21
|
+
sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
|
|
22
|
+
WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
|
|
23
|
+
if sample_url.match(v[0])
|
|
24
|
+
expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
13
28
|
end
|
data/web_stat.gemspec
CHANGED
|
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
|
|
|
29
29
|
spec.add_runtime_dependency "cld", ">= 0.8.0"
|
|
30
30
|
spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
|
|
31
31
|
spec.add_runtime_dependency "pdf-reader", "2.4.0"
|
|
32
|
+
spec.add_runtime_dependency "webrick", ">= 1.7.0"
|
|
33
|
+
spec.add_runtime_dependency "rexml", ">= 3.2.4"
|
|
32
34
|
|
|
33
35
|
spec.add_development_dependency "rake", ">= 10.0"
|
|
34
36
|
spec.add_development_dependency "rspec", ">= 3.0"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: web_stat
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- yusuke abe
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-01-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -136,6 +136,34 @@ dependencies:
|
|
|
136
136
|
- - '='
|
|
137
137
|
- !ruby/object:Gem::Version
|
|
138
138
|
version: 2.4.0
|
|
139
|
+
- !ruby/object:Gem::Dependency
|
|
140
|
+
name: webrick
|
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
|
142
|
+
requirements:
|
|
143
|
+
- - ">="
|
|
144
|
+
- !ruby/object:Gem::Version
|
|
145
|
+
version: 1.7.0
|
|
146
|
+
type: :runtime
|
|
147
|
+
prerelease: false
|
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
+
requirements:
|
|
150
|
+
- - ">="
|
|
151
|
+
- !ruby/object:Gem::Version
|
|
152
|
+
version: 1.7.0
|
|
153
|
+
- !ruby/object:Gem::Dependency
|
|
154
|
+
name: rexml
|
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
|
156
|
+
requirements:
|
|
157
|
+
- - ">="
|
|
158
|
+
- !ruby/object:Gem::Version
|
|
159
|
+
version: 3.2.4
|
|
160
|
+
type: :runtime
|
|
161
|
+
prerelease: false
|
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
163
|
+
requirements:
|
|
164
|
+
- - ">="
|
|
165
|
+
- !ruby/object:Gem::Version
|
|
166
|
+
version: 3.2.4
|
|
139
167
|
- !ruby/object:Gem::Dependency
|
|
140
168
|
name: rake
|
|
141
169
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
274
302
|
- !ruby/object:Gem::Version
|
|
275
303
|
version: '0'
|
|
276
304
|
requirements: []
|
|
277
|
-
rubygems_version: 3.
|
|
305
|
+
rubygems_version: 3.2.3
|
|
278
306
|
signing_key:
|
|
279
307
|
specification_version: 4
|
|
280
308
|
summary: Get the status of the web pages.
|