web_stat 0.3.14 → 0.3.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +15 -3
- data/lib/web_stat.rb +1 -0
- data/lib/web_stat/fetch/fetch_as_web.rb +25 -1
- data/lib/web_stat/version.rb +1 -1
- data/spec/fixtures/pdfs/newsdict.blog.pdf +0 -0
- data/spec/fixtures/pdfs/rfc2616.pdf +0 -0
- data/spec/spec_helper.rb +17 -2
- data/spec/web_stat/fetch_spec.rb +101 -1
- data/web_stat.gemspec +2 -1
- metadata +22 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82732f779004a5a2ef1f7259ce40126de8f51dfb07e46cd4972aafd3ab386ac3
|
4
|
+
data.tar.gz: 17c2b99bdeb5db8c134d107fc9d4957dab985c7e680b1c92060bf7090315477b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6691a57b0498fcfbb609042cdb7e12b4ac453b5cedaf6dd39671090c97fc19958a4162ba50947392116320ba78c9a21491acab27be2a41287825d3d4d2194d1
|
7
|
+
data.tar.gz: 1e2e3d33f5c232532f442bbd16540c47810c93caef436a351124165b5294ddc7354d766ea66aa24ebc052c69432a639a6d5ef7b61a7cde896f6dc4676f95650d
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
web_stat (0.3.14)
|
4
|
+
web_stat (0.3.15)
|
5
5
|
bundler (>= 2.0.2)
|
6
6
|
cld (>= 0.8.0)
|
7
7
|
mechanize (>= 2.7)
|
8
8
|
natto (>= 1.1.2)
|
9
9
|
nokogiri (>= 1.10.4)
|
10
|
+
pdf-reader (= 2.4.0)
|
10
11
|
ruby-readability (>= 0.7)
|
11
12
|
sanitize (>= 5.0.0)
|
12
13
|
selenium-webdriver (= 3.142.7)
|
@@ -14,8 +15,10 @@ PATH
|
|
14
15
|
GEM
|
15
16
|
remote: https://rubygems.org/
|
16
17
|
specs:
|
18
|
+
Ascii85 (1.0.3)
|
17
19
|
addressable (2.7.0)
|
18
20
|
public_suffix (>= 2.0.2, < 5.0)
|
21
|
+
afm (0.2.2)
|
19
22
|
byebug (11.1.3)
|
20
23
|
childprocess (3.0.0)
|
21
24
|
cld (0.8.0)
|
@@ -31,6 +34,7 @@ GEM
|
|
31
34
|
ffi (1.13.1)
|
32
35
|
guess_html_encoding (0.0.11)
|
33
36
|
hashdiff (1.0.1)
|
37
|
+
hashery (2.1.2)
|
34
38
|
http-cookie (1.0.3)
|
35
39
|
domain_name (~> 0.5)
|
36
40
|
mechanize (2.7.6)
|
@@ -57,6 +61,12 @@ GEM
|
|
57
61
|
nokogumbo (2.0.2)
|
58
62
|
nokogiri (~> 1.8, >= 1.8.4)
|
59
63
|
ntlm-http (0.1.1)
|
64
|
+
pdf-reader (2.4.0)
|
65
|
+
Ascii85 (~> 1.0.0)
|
66
|
+
afm (~> 0.2.1)
|
67
|
+
hashery (~> 2.0)
|
68
|
+
ruby-rc4
|
69
|
+
ttfunk
|
60
70
|
pry (0.13.1)
|
61
71
|
coderay (~> 1.1)
|
62
72
|
method_source (~> 1.0)
|
@@ -78,18 +88,20 @@ GEM
|
|
78
88
|
diff-lcs (>= 1.2.0, < 2.0)
|
79
89
|
rspec-support (~> 3.9.0)
|
80
90
|
rspec-support (3.9.3)
|
91
|
+
ruby-rc4 (0.1.5)
|
81
92
|
ruby-readability (0.7.0)
|
82
93
|
guess_html_encoding (>= 0.0.4)
|
83
94
|
nokogiri (>= 1.6.0)
|
84
95
|
rubyzip (2.3.0)
|
85
96
|
safe_yaml (1.0.5)
|
86
|
-
sanitize (5.2.
|
97
|
+
sanitize (5.2.1)
|
87
98
|
crass (~> 1.0.2)
|
88
99
|
nokogiri (>= 1.8.0)
|
89
100
|
nokogumbo (~> 2.0)
|
90
101
|
selenium-webdriver (3.142.7)
|
91
102
|
childprocess (>= 0.5, < 4.0)
|
92
103
|
rubyzip (>= 1.2.2)
|
104
|
+
ttfunk (1.6.2.1)
|
93
105
|
unf (0.1.4)
|
94
106
|
unf_ext
|
95
107
|
unf_ext (0.0.7.7)
|
@@ -108,7 +120,7 @@ DEPENDENCIES
|
|
108
120
|
rake (>= 10.0)
|
109
121
|
rspec (>= 3.0)
|
110
122
|
web_stat!
|
111
|
-
webmock (>= 3.
|
123
|
+
webmock (>= 3.8.3)
|
112
124
|
|
113
125
|
BUNDLED WITH
|
114
126
|
2.1.4
|
data/lib/web_stat.rb
CHANGED
@@ -9,7 +9,31 @@ module WebStat
|
|
9
9
|
raise WebStat::INVALID_URL, url
|
10
10
|
end
|
11
11
|
@url = original_url(url)
|
12
|
-
|
12
|
+
if @url.match?(/\.pdf$/)
|
13
|
+
title = nil
|
14
|
+
body = nil
|
15
|
+
URI.open(@url) do |io|
|
16
|
+
reader = PDF::Reader.new(io)
|
17
|
+
if reader.info.key?(:Title)
|
18
|
+
title = reader.info[:Title]
|
19
|
+
else
|
20
|
+
title = File.basename(@url, ".pdf")
|
21
|
+
end
|
22
|
+
body = reader.pages.first.text
|
23
|
+
end
|
24
|
+
@html = <<-"EOS"
|
25
|
+
<html>
|
26
|
+
<head>
|
27
|
+
<title>#{title}</title>
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
#{body}
|
31
|
+
</body>
|
32
|
+
</html>
|
33
|
+
EOS
|
34
|
+
else
|
35
|
+
@html = get_url(@url)
|
36
|
+
end
|
13
37
|
@nokogiri = ::Nokogiri::HTML(@html)
|
14
38
|
end
|
15
39
|
|
data/lib/web_stat/version.rb
CHANGED
Binary file
|
Binary file
|
data/spec/spec_helper.rb
CHANGED
@@ -6,8 +6,9 @@ require "web_stat"
|
|
6
6
|
|
7
7
|
require 'webmock'
|
8
8
|
include WebMock::API
|
9
|
+
ENV['ENV'] = 'test'
|
9
10
|
WebMock.enable!
|
10
|
-
|
11
|
+
|
11
12
|
RSpec.configure do |config|
|
12
13
|
# Enable flags like --only-failures and --next-failure
|
13
14
|
config.example_status_persistence_file_path = ".rspec_status"
|
@@ -52,7 +53,14 @@ module WebStatTestHelper
|
|
52
53
|
# Get htmls of fixture
|
53
54
|
def scheme_and_files
|
54
55
|
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
|
55
|
-
|
56
|
+
"https://newsdict.blog/#{File.basename(file)}"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Get pdfs of fixture
|
61
|
+
def pdfs
|
62
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
|
63
|
+
"https://newsdict.blog/#{File.basename(file)}"
|
56
64
|
end
|
57
65
|
end
|
58
66
|
end
|
@@ -67,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
|
|
67
75
|
body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
|
68
76
|
headers: {content_type: 'application/html; charset=utf-8'})
|
69
77
|
end
|
78
|
+
WebStatTestHelper.pdfs.each do |url|
|
79
|
+
WebMock.stub_request(:get, url)
|
80
|
+
.to_return(
|
81
|
+
status: 200,
|
82
|
+
body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
|
83
|
+
headers: {content_type: 'application/pdf'})
|
84
|
+
end
|
70
85
|
|
71
86
|
WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
|
72
87
|
.to_return(
|
data/spec/web_stat/fetch_spec.rb
CHANGED
@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
+
[{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
|
70
|
+
|
71
|
+
it "Get title by #{fetch[:class].to_s}" do
|
72
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
73
|
+
expect(web_stat.title).to eq "Microsoft Word"
|
74
|
+
end
|
75
|
+
|
76
|
+
it "Get site name by #{fetch[:class].to_s}" do
|
77
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
78
|
+
expect(web_stat.site_name).to eq "RFC2616.doc"
|
79
|
+
end
|
80
|
+
|
81
|
+
it "Get Document's content by #{fetch[:class].to_s}" do
|
82
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
83
|
+
expect(web_stat.content).not_to eq nil
|
84
|
+
end
|
85
|
+
|
86
|
+
it "WebStat content do not include html by #{fetch[:class].to_s}" do
|
87
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
88
|
+
expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
|
89
|
+
end
|
90
|
+
|
91
|
+
it "Get eyecatch image blob by #{fetch[:class].to_s}" do
|
92
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
93
|
+
web_stat.url = "https://newsdict.blog"
|
94
|
+
unless web_stat.stat[:eyecatch_image_path].nil?
|
95
|
+
image = File.read(web_stat.stat[:eyecatch_image_path])
|
96
|
+
expect(image.encoding.to_s).to eq("UTF-8")
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
it "Get eyecatch image path by #{fetch[:class].to_s}" do
|
101
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
102
|
+
web_stat.url = "https://newsdict.blog"
|
103
|
+
expect(web_stat.eyecatch_image_path).to be_string_or_nil
|
104
|
+
end
|
105
|
+
|
106
|
+
it "Get language_iso by #{fetch[:class].to_s}" do
|
107
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
108
|
+
web_stat.url = "https://newsdict.blog"
|
109
|
+
expect(web_stat.stat[:language_code]).to eq("en")
|
110
|
+
end
|
111
|
+
|
112
|
+
it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
|
113
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
114
|
+
web_stat.url = "https://newsdict.blog"
|
115
|
+
expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
[{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
|
120
|
+
|
121
|
+
it "Get title by #{fetch[:class].to_s}" do
|
122
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
123
|
+
expect(web_stat.title).to eq "newsdict.blog"
|
124
|
+
end
|
125
|
+
|
126
|
+
it "Get site name by #{fetch[:class].to_s}" do
|
127
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
128
|
+
expect(web_stat.site_name).to eq "newsdict.blog"
|
129
|
+
end
|
130
|
+
|
131
|
+
it "Get Document's content by #{fetch[:class].to_s}" do
|
132
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
133
|
+
expect(web_stat.content).not_to eq nil
|
134
|
+
end
|
135
|
+
|
136
|
+
it "WebStat content do not include html by #{fetch[:class].to_s}" do
|
137
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
138
|
+
expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
|
139
|
+
end
|
140
|
+
|
141
|
+
it "Get eyecatch image blob by #{fetch[:class].to_s}" do
|
142
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
143
|
+
web_stat.url = "https://newsdict.blog"
|
144
|
+
unless web_stat.stat[:eyecatch_image_path].nil?
|
145
|
+
image = File.read(web_stat.stat[:eyecatch_image_path])
|
146
|
+
expect(image.encoding.to_s).to eq("UTF-8")
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
it "Get eyecatch image path by #{fetch[:class].to_s}" do
|
151
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
152
|
+
web_stat.url = "https://newsdict.blog"
|
153
|
+
expect(web_stat.eyecatch_image_path).to be_string_or_nil
|
154
|
+
end
|
155
|
+
|
156
|
+
it "Get language_iso by #{fetch[:class].to_s}" do
|
157
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
158
|
+
web_stat.url = "https://newsdict.blog"
|
159
|
+
expect(web_stat.stat[:language_code]).to eq("ja")
|
160
|
+
end
|
161
|
+
|
162
|
+
it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
|
163
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
164
|
+
web_stat.url = "https://newsdict.blog"
|
165
|
+
expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
69
169
|
it "WebStat.stat_by_html" do
|
70
170
|
WebStatTestHelper.htmls.each do |fixture|
|
71
171
|
web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
|
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
|
|
104
204
|
expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
|
105
205
|
expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
|
106
206
|
end
|
107
|
-
end
|
207
|
+
end
|
data/web_stat.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_runtime_dependency "sanitize", ">= 5.0.0"
|
29
29
|
spec.add_runtime_dependency "cld", ">= 0.8.0"
|
30
30
|
spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
|
31
|
+
spec.add_runtime_dependency "pdf-reader", "2.4.0"
|
31
32
|
|
32
33
|
spec.add_development_dependency "rake", ">= 10.0"
|
33
34
|
spec.add_development_dependency "rspec", ">= 3.0"
|
34
35
|
spec.add_development_dependency "pry", ">= 0.13.1"
|
35
|
-
spec.add_development_dependency "webmock", ">= 3.
|
36
|
+
spec.add_development_dependency "webmock", ">= 3.8.3"
|
36
37
|
spec.add_development_dependency "pry-byebug", "3.9.0"
|
37
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.14
|
4
|
+
version: 0.3.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - '='
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: 3.142.7
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pdf-reader
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 2.4.0
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: 2.4.0
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: rake
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,14 +184,14 @@ dependencies:
|
|
170
184
|
requirements:
|
171
185
|
- - ">="
|
172
186
|
- !ruby/object:Gem::Version
|
173
|
-
version: 3.
|
187
|
+
version: 3.8.3
|
174
188
|
type: :development
|
175
189
|
prerelease: false
|
176
190
|
version_requirements: !ruby/object:Gem::Requirement
|
177
191
|
requirements:
|
178
192
|
- - ">="
|
179
193
|
- !ruby/object:Gem::Version
|
180
|
-
version: 3.
|
194
|
+
version: 3.8.3
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
196
|
name: pry-byebug
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -233,6 +247,8 @@ files:
|
|
233
247
|
- spec/fixtures/htmls/image.html
|
234
248
|
- spec/fixtures/images/facebook-3.jpg
|
235
249
|
- spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
|
250
|
+
- spec/fixtures/pdfs/newsdict.blog.pdf
|
251
|
+
- spec/fixtures/pdfs/rfc2616.pdf
|
236
252
|
- spec/spec_helper.rb
|
237
253
|
- spec/web_stat/configure_spec.rb
|
238
254
|
- spec/web_stat/fetch_spec.rb
|
@@ -268,6 +284,8 @@ test_files:
|
|
268
284
|
- spec/fixtures/htmls/image.html
|
269
285
|
- spec/fixtures/images/facebook-3.jpg
|
270
286
|
- spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
|
287
|
+
- spec/fixtures/pdfs/newsdict.blog.pdf
|
288
|
+
- spec/fixtures/pdfs/rfc2616.pdf
|
271
289
|
- spec/spec_helper.rb
|
272
290
|
- spec/web_stat/configure_spec.rb
|
273
291
|
- spec/web_stat/fetch_spec.rb
|