web_stat 0.4.7 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +97 -16
- data/bin/{fetch_as_html → fetch_as_url} +0 -0
- data/lib/web_stat.rb +2 -0
- data/lib/web_stat/config/web_stat.yml +5 -3
- data/lib/web_stat/fetch.rb +20 -4
- data/lib/web_stat/version.rb +1 -1
- data/spec/web_stat/configure_spec.rb +5 -4
- data/web_stat.gemspec +1 -0
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d3f6f4f9692846a2c50a365125efb42dfd4a0b44fddef1c108621d94fd6dcde4
|
4
|
+
data.tar.gz: 2f6d400bce014144c940edcafbad9dc4e8c164fce356c70bfd7dbd1ad0c2f862
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6518aaed72267de7612257c43067a762b933d2e1a64d04defb07d7471eacd8bd1a4ca1cadc96dad5b279cc09a4bbc5a3453abfe10715bde5dc614d08d1953098
|
7
|
+
data.tar.gz: 3cd03188f34030da0c9ead6633bd1c67550524c22368b5c52dd69132e54be69edff6a28d7a91b6ff76ad02a6f3bb2becebb2df1e03db72fd7f3f16262d8137e2
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
web_stat (0.4.7)
|
4
|
+
web_stat (0.5.0)
|
5
5
|
bundler (>= 2.0.2)
|
6
6
|
cld (>= 0.8.0)
|
7
|
+
google-api-client (>= 0.53.0)
|
7
8
|
mechanize (>= 2.7.7)
|
8
9
|
natto (>= 1.1.2)
|
9
10
|
nokogiri (>= 1.10.4)
|
@@ -18,6 +19,12 @@ GEM
|
|
18
19
|
remote: https://rubygems.org/
|
19
20
|
specs:
|
20
21
|
Ascii85 (1.0.3)
|
22
|
+
activesupport (6.1.3.2)
|
23
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
24
|
+
i18n (>= 1.6, < 2)
|
25
|
+
minitest (>= 5.1)
|
26
|
+
tzinfo (~> 2.0)
|
27
|
+
zeitwerk (~> 2.3)
|
21
28
|
addressable (2.7.0)
|
22
29
|
public_suffix (>= 2.0.2, < 5.0)
|
23
30
|
afm (0.2.2)
|
@@ -26,43 +33,99 @@ GEM
|
|
26
33
|
cld (0.8.0)
|
27
34
|
ffi
|
28
35
|
coderay (1.1.3)
|
29
|
-
|
36
|
+
concurrent-ruby (1.1.9)
|
37
|
+
connection_pool (2.2.5)
|
30
38
|
crack (0.4.5)
|
31
39
|
rexml
|
32
40
|
crass (1.0.6)
|
41
|
+
declarative (0.0.20)
|
33
42
|
diff-lcs (1.4.4)
|
34
43
|
domain_name (0.5.20190701)
|
35
44
|
unf (>= 0.0.5, < 1.0.0)
|
36
|
-
|
45
|
+
faraday (1.4.2)
|
46
|
+
faraday-em_http (~> 1.0)
|
47
|
+
faraday-em_synchrony (~> 1.0)
|
48
|
+
faraday-excon (~> 1.1)
|
49
|
+
faraday-net_http (~> 1.0)
|
50
|
+
faraday-net_http_persistent (~> 1.1)
|
51
|
+
multipart-post (>= 1.2, < 3)
|
52
|
+
ruby2_keywords (>= 0.0.4)
|
53
|
+
faraday-em_http (1.0.0)
|
54
|
+
faraday-em_synchrony (1.0.0)
|
55
|
+
faraday-excon (1.1.0)
|
56
|
+
faraday-net_http (1.0.1)
|
57
|
+
faraday-net_http_persistent (1.1.0)
|
58
|
+
ffi (1.15.3)
|
59
|
+
gems (1.2.0)
|
60
|
+
google-api-client (0.53.0)
|
61
|
+
google-apis-core (~> 0.1)
|
62
|
+
google-apis-generator (~> 0.1)
|
63
|
+
google-apis-core (0.3.0)
|
64
|
+
addressable (~> 2.5, >= 2.5.1)
|
65
|
+
googleauth (~> 0.14)
|
66
|
+
httpclient (>= 2.8.1, < 3.0)
|
67
|
+
mini_mime (~> 1.0)
|
68
|
+
representable (~> 3.0)
|
69
|
+
retriable (>= 2.0, < 4.0)
|
70
|
+
rexml
|
71
|
+
signet (~> 0.14)
|
72
|
+
webrick
|
73
|
+
google-apis-discovery_v1 (0.4.0)
|
74
|
+
google-apis-core (~> 0.1)
|
75
|
+
google-apis-generator (0.3.0)
|
76
|
+
activesupport (>= 5.0)
|
77
|
+
gems (~> 1.2)
|
78
|
+
google-apis-core (~> 0.1)
|
79
|
+
google-apis-discovery_v1 (~> 0.0)
|
80
|
+
thor (>= 0.20, < 2.a)
|
81
|
+
googleauth (0.16.2)
|
82
|
+
faraday (>= 0.17.3, < 2.0)
|
83
|
+
jwt (>= 1.4, < 3.0)
|
84
|
+
memoist (~> 0.16)
|
85
|
+
multi_json (~> 1.11)
|
86
|
+
os (>= 0.9, < 2.0)
|
87
|
+
signet (~> 0.14)
|
37
88
|
guess_html_encoding (0.0.11)
|
38
89
|
hashdiff (1.0.1)
|
39
90
|
hashery (2.1.2)
|
40
|
-
http-cookie (1.0.
|
91
|
+
http-cookie (1.0.4)
|
41
92
|
domain_name (~> 0.5)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
93
|
+
httpclient (2.8.3)
|
94
|
+
i18n (1.8.10)
|
95
|
+
concurrent-ruby (~> 1.0)
|
96
|
+
jwt (2.2.3)
|
97
|
+
mechanize (2.8.1)
|
98
|
+
addressable (~> 2.7)
|
99
|
+
domain_name (~> 0.5, >= 0.5.20190701)
|
100
|
+
http-cookie (~> 1.0, >= 1.0.3)
|
101
|
+
mime-types (~> 3.0)
|
102
|
+
net-http-digest_auth (~> 1.4, >= 1.4.1)
|
103
|
+
net-http-persistent (>= 2.5.2, < 5.0.dev)
|
104
|
+
nokogiri (~> 1.11, >= 1.11.2)
|
105
|
+
rubyntlm (~> 0.6, >= 0.6.3)
|
50
106
|
webrick (~> 1.7)
|
51
|
-
webrobots (
|
107
|
+
webrobots (~> 0.1.2)
|
108
|
+
memoist (0.16.2)
|
52
109
|
method_source (1.0.0)
|
53
110
|
mime-types (3.3.1)
|
54
111
|
mime-types-data (~> 3.2015)
|
55
112
|
mime-types-data (3.2021.0225)
|
113
|
+
mini_mime (1.1.0)
|
114
|
+
mini_portile2 (2.5.3)
|
115
|
+
minitest (5.14.4)
|
116
|
+
multi_json (1.15.0)
|
117
|
+
multipart-post (2.1.1)
|
56
118
|
natto (1.2.0)
|
57
119
|
ffi (>= 1.9.0)
|
58
120
|
net-http-digest_auth (1.4.1)
|
59
121
|
net-http-persistent (4.0.1)
|
60
122
|
connection_pool (~> 2.2)
|
61
|
-
nokogiri (1.11.
|
123
|
+
nokogiri (1.11.7)
|
124
|
+
mini_portile2 (~> 2.5.0)
|
62
125
|
racc (~> 1.4)
|
63
|
-
nokogumbo (2.0.
|
126
|
+
nokogumbo (2.0.5)
|
64
127
|
nokogiri (~> 1.8, >= 1.8.4)
|
65
|
-
|
128
|
+
os (1.1.1)
|
66
129
|
pdf-reader (2.4.0)
|
67
130
|
Ascii85 (~> 1.0.0)
|
68
131
|
afm (~> 0.2.1)
|
@@ -78,6 +141,11 @@ GEM
|
|
78
141
|
public_suffix (4.0.6)
|
79
142
|
racc (1.5.2)
|
80
143
|
rake (13.0.3)
|
144
|
+
representable (3.1.1)
|
145
|
+
declarative (< 0.1.0)
|
146
|
+
trailblazer-option (>= 0.1.1, < 0.2.0)
|
147
|
+
uber (< 0.2.0)
|
148
|
+
retriable (3.1.2)
|
81
149
|
rexml (3.2.4)
|
82
150
|
rspec (3.10.0)
|
83
151
|
rspec-core (~> 3.10.0)
|
@@ -96,6 +164,8 @@ GEM
|
|
96
164
|
ruby-readability (0.7.0)
|
97
165
|
guess_html_encoding (>= 0.0.4)
|
98
166
|
nokogiri (>= 1.6.0)
|
167
|
+
ruby2_keywords (0.0.4)
|
168
|
+
rubyntlm (0.6.3)
|
99
169
|
rubyzip (2.3.0)
|
100
170
|
sanitize (5.2.3)
|
101
171
|
crass (~> 1.0.2)
|
@@ -104,7 +174,17 @@ GEM
|
|
104
174
|
selenium-webdriver (3.142.7)
|
105
175
|
childprocess (>= 0.5, < 4.0)
|
106
176
|
rubyzip (>= 1.2.2)
|
177
|
+
signet (0.15.0)
|
178
|
+
addressable (~> 2.3)
|
179
|
+
faraday (>= 0.17.3, < 2.0)
|
180
|
+
jwt (>= 1.5, < 3.0)
|
181
|
+
multi_json (~> 1.10)
|
182
|
+
thor (1.1.0)
|
183
|
+
trailblazer-option (0.1.1)
|
107
184
|
ttfunk (1.7.0)
|
185
|
+
tzinfo (2.0.4)
|
186
|
+
concurrent-ruby (~> 1.0)
|
187
|
+
uber (0.1.0)
|
108
188
|
unf (0.1.4)
|
109
189
|
unf_ext
|
110
190
|
unf_ext (0.0.7.7)
|
@@ -114,6 +194,7 @@ GEM
|
|
114
194
|
hashdiff (>= 0.4.0, < 2.0.0)
|
115
195
|
webrick (1.7.0)
|
116
196
|
webrobots (0.1.2)
|
197
|
+
zeitwerk (2.4.2)
|
117
198
|
|
118
199
|
PLATFORMS
|
119
200
|
ruby
|
File without changes
|
data/lib/web_stat.rb
CHANGED
@@ -11,6 +11,7 @@ require 'net/http'
|
|
11
11
|
require 'pdf/reader'
|
12
12
|
require 'ruby-readability'
|
13
13
|
require 'selenium-webdriver'
|
14
|
+
require 'google/apis/youtube_v3'
|
14
15
|
|
15
16
|
require "helpers/web_drive_helper"
|
16
17
|
require "web_stat/final_redirect_url"
|
@@ -18,6 +19,7 @@ require "web_stat/categorize"
|
|
18
19
|
require "web_stat/configure"
|
19
20
|
require "web_stat/errors"
|
20
21
|
require "web_stat/fetch"
|
22
|
+
|
21
23
|
require "web_stat/tag"
|
22
24
|
require "web_stat/version"
|
23
25
|
require "web_stat/fetch/fetch_as_html"
|
@@ -14,10 +14,12 @@ development: &development
|
|
14
14
|
- '//img/@src'
|
15
15
|
userdic: ""
|
16
16
|
use_chromedirver: false
|
17
|
+
id_extraction_regexs:
|
18
|
+
youtube: '^https://www.youtube.com/watch\?v=([^&]+)'
|
17
19
|
thumbnail_regex:
|
18
|
-
youtube:
|
19
|
-
|
20
|
-
|
20
|
+
youtube: 'http://img.youtube.com/vi/\1/default.jpg'
|
21
|
+
api_keys:
|
22
|
+
youtube: "dummy-key"
|
21
23
|
test:
|
22
24
|
<<: *development
|
23
25
|
production:
|
data/lib/web_stat/fetch.rb
CHANGED
@@ -34,7 +34,23 @@ module WebStat
|
|
34
34
|
end
|
35
35
|
# Get main section
|
36
36
|
def content
|
37
|
-
|
37
|
+
if @url.match(WebStat::Configure.get["id_extraction_regexs"]["youtube"])
|
38
|
+
youtube_decscription
|
39
|
+
else
|
40
|
+
Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Get describe of youtube movie.
|
45
|
+
def youtube_decscription
|
46
|
+
regex_string = WebStat::Configure.get["id_extraction_regexs"]["youtube"]
|
47
|
+
if @url.match(regex_string)
|
48
|
+
id = @url.gsub(%r{#{regex_string}}, '\1')
|
49
|
+
youtube = Google::Apis::YoutubeV3::YouTubeService.new
|
50
|
+
youtube.key = WebStat::Configure.get["api_keys"]["youtube"]
|
51
|
+
response = youtube.list_videos(:snippet, id: id)
|
52
|
+
response.items.first.snippet.description
|
53
|
+
end
|
38
54
|
end
|
39
55
|
|
40
56
|
# Get temporary path of image
|
@@ -48,9 +64,9 @@ module WebStat
|
|
48
64
|
end
|
49
65
|
end
|
50
66
|
# If there is a thumbnail rule, apply it.
|
51
|
-
WebStat::Configure.get["
|
52
|
-
if @url.match(
|
53
|
-
return @url.gsub(
|
67
|
+
WebStat::Configure.get["id_extraction_regexs"].each do |provider, regex_string|
|
68
|
+
if @url.match(regex_string)
|
69
|
+
return @url.gsub(%r{#{regex_string}}, WebStat::Configure.get["thumbnail_regex"][provider])
|
54
70
|
end
|
55
71
|
end
|
56
72
|
readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
|
data/lib/web_stat/version.rb
CHANGED
@@ -14,14 +14,15 @@ RSpec.describe WebStat::Configure do
|
|
14
14
|
it "Get thumbnail_regex.youtube." do
|
15
15
|
config = WebStat::Configure.get
|
16
16
|
expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
|
17
|
-
expect(config["
|
17
|
+
expect(config["id_extraction_regexs"]["youtube"]).to be_a String
|
18
|
+
expect(config["thumbnail_regex"]["youtube"]).to be_a String
|
18
19
|
end
|
19
20
|
|
20
21
|
it "Match youtube url." do
|
21
22
|
sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
|
22
|
-
WebStat::Configure.get["
|
23
|
-
if sample_url.match(
|
24
|
-
expect(sample_url.gsub(
|
23
|
+
WebStat::Configure.get["id_extraction_regexs"].each do |provider, regex_string|
|
24
|
+
if sample_url.match(regex_string)
|
25
|
+
expect(sample_url.gsub(%r{#{regex_string}}, WebStat::Configure.get["thumbnail_regex"][provider])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
|
25
26
|
end
|
26
27
|
end
|
27
28
|
end
|
data/web_stat.gemspec
CHANGED
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_runtime_dependency "pdf-reader", "2.4.0"
|
32
32
|
spec.add_runtime_dependency "webrick", ">= 1.7.0"
|
33
33
|
spec.add_runtime_dependency "rexml", ">= 3.2.4"
|
34
|
+
spec.add_runtime_dependency "google-api-client", ">= 0.53.0"
|
34
35
|
|
35
36
|
spec.add_development_dependency "rake", ">= 10.0"
|
36
37
|
spec.add_development_dependency "rspec", ">= 3.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.7
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-06-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,6 +164,20 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: 3.2.4
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: google-api-client
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: 0.53.0
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: 0.53.0
|
167
181
|
- !ruby/object:Gem::Dependency
|
168
182
|
name: rake
|
169
183
|
requirement: !ruby/object:Gem::Requirement
|
@@ -238,7 +252,7 @@ description: Fetch the web pages and stat.
|
|
238
252
|
email:
|
239
253
|
- yube@newsdict.jp
|
240
254
|
executables:
|
241
|
-
-
|
255
|
+
- fetch_as_url
|
242
256
|
extensions: []
|
243
257
|
extra_rdoc_files: []
|
244
258
|
files:
|
@@ -253,7 +267,7 @@ files:
|
|
253
267
|
- LICENSE.txt
|
254
268
|
- README.md
|
255
269
|
- Rakefile
|
256
|
-
- bin/
|
270
|
+
- bin/fetch_as_url
|
257
271
|
- docker-compose.yml
|
258
272
|
- docker/exec
|
259
273
|
- docker/start
|