web_stat 0.3.14 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +15 -3
- data/lib/web_stat.rb +1 -0
- data/lib/web_stat/fetch/fetch_as_web.rb +25 -1
- data/lib/web_stat/version.rb +1 -1
- data/spec/fixtures/pdfs/newsdict.blog.pdf +0 -0
- data/spec/fixtures/pdfs/rfc2616.pdf +0 -0
- data/spec/spec_helper.rb +17 -2
- data/spec/web_stat/fetch_spec.rb +101 -1
- data/web_stat.gemspec +2 -1
- metadata +22 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82732f779004a5a2ef1f7259ce40126de8f51dfb07e46cd4972aafd3ab386ac3
|
4
|
+
data.tar.gz: 17c2b99bdeb5db8c134d107fc9d4957dab985c7e680b1c92060bf7090315477b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6691a57b0498fcfbb609042cdb7e12b4ac453b5cedaf6dd39671090c97fc19958a4162ba50947392116320ba78c9a21491acab27be2a41287825d3d4d2194d1
|
7
|
+
data.tar.gz: 1e2e3d33f5c232532f442bbd16540c47810c93caef436a351124165b5294ddc7354d766ea66aa24ebc052c69432a639a6d5ef7b61a7cde896f6dc4676f95650d
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
web_stat (0.3.14)
|
4
|
+
web_stat (0.3.15)
|
5
5
|
bundler (>= 2.0.2)
|
6
6
|
cld (>= 0.8.0)
|
7
7
|
mechanize (>= 2.7)
|
8
8
|
natto (>= 1.1.2)
|
9
9
|
nokogiri (>= 1.10.4)
|
10
|
+
pdf-reader (= 2.4.0)
|
10
11
|
ruby-readability (>= 0.7)
|
11
12
|
sanitize (>= 5.0.0)
|
12
13
|
selenium-webdriver (= 3.142.7)
|
@@ -14,8 +15,10 @@ PATH
|
|
14
15
|
GEM
|
15
16
|
remote: https://rubygems.org/
|
16
17
|
specs:
|
18
|
+
Ascii85 (1.0.3)
|
17
19
|
addressable (2.7.0)
|
18
20
|
public_suffix (>= 2.0.2, < 5.0)
|
21
|
+
afm (0.2.2)
|
19
22
|
byebug (11.1.3)
|
20
23
|
childprocess (3.0.0)
|
21
24
|
cld (0.8.0)
|
@@ -31,6 +34,7 @@ GEM
|
|
31
34
|
ffi (1.13.1)
|
32
35
|
guess_html_encoding (0.0.11)
|
33
36
|
hashdiff (1.0.1)
|
37
|
+
hashery (2.1.2)
|
34
38
|
http-cookie (1.0.3)
|
35
39
|
domain_name (~> 0.5)
|
36
40
|
mechanize (2.7.6)
|
@@ -57,6 +61,12 @@ GEM
|
|
57
61
|
nokogumbo (2.0.2)
|
58
62
|
nokogiri (~> 1.8, >= 1.8.4)
|
59
63
|
ntlm-http (0.1.1)
|
64
|
+
pdf-reader (2.4.0)
|
65
|
+
Ascii85 (~> 1.0.0)
|
66
|
+
afm (~> 0.2.1)
|
67
|
+
hashery (~> 2.0)
|
68
|
+
ruby-rc4
|
69
|
+
ttfunk
|
60
70
|
pry (0.13.1)
|
61
71
|
coderay (~> 1.1)
|
62
72
|
method_source (~> 1.0)
|
@@ -78,18 +88,20 @@ GEM
|
|
78
88
|
diff-lcs (>= 1.2.0, < 2.0)
|
79
89
|
rspec-support (~> 3.9.0)
|
80
90
|
rspec-support (3.9.3)
|
91
|
+
ruby-rc4 (0.1.5)
|
81
92
|
ruby-readability (0.7.0)
|
82
93
|
guess_html_encoding (>= 0.0.4)
|
83
94
|
nokogiri (>= 1.6.0)
|
84
95
|
rubyzip (2.3.0)
|
85
96
|
safe_yaml (1.0.5)
|
86
|
-
sanitize (5.2.
|
97
|
+
sanitize (5.2.1)
|
87
98
|
crass (~> 1.0.2)
|
88
99
|
nokogiri (>= 1.8.0)
|
89
100
|
nokogumbo (~> 2.0)
|
90
101
|
selenium-webdriver (3.142.7)
|
91
102
|
childprocess (>= 0.5, < 4.0)
|
92
103
|
rubyzip (>= 1.2.2)
|
104
|
+
ttfunk (1.6.2.1)
|
93
105
|
unf (0.1.4)
|
94
106
|
unf_ext
|
95
107
|
unf_ext (0.0.7.7)
|
@@ -108,7 +120,7 @@ DEPENDENCIES
|
|
108
120
|
rake (>= 10.0)
|
109
121
|
rspec (>= 3.0)
|
110
122
|
web_stat!
|
111
|
-
webmock (>= 3.
|
123
|
+
webmock (>= 3.8.3)
|
112
124
|
|
113
125
|
BUNDLED WITH
|
114
126
|
2.1.4
|
data/lib/web_stat/fetch/fetch_as_web.rb
CHANGED
@@ -9,7 +9,31 @@ module WebStat
|
|
9
9
|
raise WebStat::INVALID_URL, url
|
10
10
|
end
|
11
11
|
@url = original_url(url)
|
12
|
-
|
12
|
+
if @url.match?(/\.pdf$/)
|
13
|
+
title = nil
|
14
|
+
body = nil
|
15
|
+
URI.open(@url) do |io|
|
16
|
+
reader = PDF::Reader.new(io)
|
17
|
+
if reader.info.key?(:Title)
|
18
|
+
title = reader.info[:Title]
|
19
|
+
else
|
20
|
+
title = File.basename(@url, ".pdf")
|
21
|
+
end
|
22
|
+
body = reader.pages.first.text
|
23
|
+
end
|
24
|
+
@html = <<-"EOS"
|
25
|
+
<html>
|
26
|
+
<head>
|
27
|
+
<title>#{title}</title>
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
#{body}
|
31
|
+
</body>
|
32
|
+
</html>
|
33
|
+
EOS
|
34
|
+
else
|
35
|
+
@html = get_url(@url)
|
36
|
+
end
|
13
37
|
@nokogiri = ::Nokogiri::HTML(@html)
|
14
38
|
end
|
15
39
|
|
data/lib/web_stat/version.rb
CHANGED
Binary file
|
Binary file
|
data/spec/spec_helper.rb
CHANGED
@@ -6,8 +6,9 @@ require "web_stat"
|
|
6
6
|
|
7
7
|
require 'webmock'
|
8
8
|
include WebMock::API
|
9
|
+
ENV['ENV'] = 'test'
|
9
10
|
WebMock.enable!
|
10
|
-
|
11
|
+
|
11
12
|
RSpec.configure do |config|
|
12
13
|
# Enable flags like --only-failures and --next-failure
|
13
14
|
config.example_status_persistence_file_path = ".rspec_status"
|
@@ -52,7 +53,14 @@ module WebStatTestHelper
|
|
52
53
|
# Get htmls of fixture
|
53
54
|
def scheme_and_files
|
54
55
|
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
|
55
|
-
|
56
|
+
"https://newsdict.blog/#{File.basename(file)}"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Get pdfs of fixture
|
61
|
+
def pdfs
|
62
|
+
Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
|
63
|
+
"https://newsdict.blog/#{File.basename(file)}"
|
56
64
|
end
|
57
65
|
end
|
58
66
|
end
|
@@ -67,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
|
|
67
75
|
body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
|
68
76
|
headers: {content_type: 'application/html; charset=utf-8'})
|
69
77
|
end
|
78
|
+
WebStatTestHelper.pdfs.each do |url|
|
79
|
+
WebMock.stub_request(:get, url)
|
80
|
+
.to_return(
|
81
|
+
status: 200,
|
82
|
+
body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
|
83
|
+
headers: {content_type: 'application/pdf'})
|
84
|
+
end
|
70
85
|
|
71
86
|
WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
|
72
87
|
.to_return(
|
data/spec/web_stat/fetch_spec.rb
CHANGED
@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
+
[{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
|
70
|
+
|
71
|
+
it "Get title by #{fetch[:class].to_s}" do
|
72
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
73
|
+
expect(web_stat.title).to eq "Microsoft Word"
|
74
|
+
end
|
75
|
+
|
76
|
+
it "Get site name by #{fetch[:class].to_s}" do
|
77
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
78
|
+
expect(web_stat.site_name).to eq "RFC2616.doc"
|
79
|
+
end
|
80
|
+
|
81
|
+
it "Get Document's content by #{fetch[:class].to_s}" do
|
82
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
83
|
+
expect(web_stat.content).not_to eq nil
|
84
|
+
end
|
85
|
+
|
86
|
+
it "WebStat content do not include html by #{fetch[:class].to_s}" do
|
87
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
88
|
+
expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
|
89
|
+
end
|
90
|
+
|
91
|
+
it "Get eyecatch image blob by #{fetch[:class].to_s}" do
|
92
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
93
|
+
web_stat.url = "https://newsdict.blog"
|
94
|
+
unless web_stat.stat[:eyecatch_image_path].nil?
|
95
|
+
image = File.read(web_stat.stat[:eyecatch_image_path])
|
96
|
+
expect(image.encoding.to_s).to eq("UTF-8")
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
it "Get eyecatch image path by #{fetch[:class].to_s}" do
|
101
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
102
|
+
web_stat.url = "https://newsdict.blog"
|
103
|
+
expect(web_stat.eyecatch_image_path).to be_string_or_nil
|
104
|
+
end
|
105
|
+
|
106
|
+
it "Get language_iso by #{fetch[:class].to_s}" do
|
107
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
108
|
+
web_stat.url = "https://newsdict.blog"
|
109
|
+
expect(web_stat.stat[:language_code]).to eq("en")
|
110
|
+
end
|
111
|
+
|
112
|
+
it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
|
113
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
114
|
+
web_stat.url = "https://newsdict.blog"
|
115
|
+
expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
[{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
|
120
|
+
|
121
|
+
it "Get title by #{fetch[:class].to_s}" do
|
122
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
123
|
+
expect(web_stat.title).to eq "newsdict.blog"
|
124
|
+
end
|
125
|
+
|
126
|
+
it "Get site name by #{fetch[:class].to_s}" do
|
127
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
128
|
+
expect(web_stat.site_name).to eq "newsdict.blog"
|
129
|
+
end
|
130
|
+
|
131
|
+
it "Get Document's content by #{fetch[:class].to_s}" do
|
132
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
133
|
+
expect(web_stat.content).not_to eq nil
|
134
|
+
end
|
135
|
+
|
136
|
+
it "WebStat content do not include html by #{fetch[:class].to_s}" do
|
137
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
138
|
+
expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
|
139
|
+
end
|
140
|
+
|
141
|
+
it "Get eyecatch image blob by #{fetch[:class].to_s}" do
|
142
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
143
|
+
web_stat.url = "https://newsdict.blog"
|
144
|
+
unless web_stat.stat[:eyecatch_image_path].nil?
|
145
|
+
image = File.read(web_stat.stat[:eyecatch_image_path])
|
146
|
+
expect(image.encoding.to_s).to eq("UTF-8")
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
it "Get eyecatch image path by #{fetch[:class].to_s}" do
|
151
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
152
|
+
web_stat.url = "https://newsdict.blog"
|
153
|
+
expect(web_stat.eyecatch_image_path).to be_string_or_nil
|
154
|
+
end
|
155
|
+
|
156
|
+
it "Get language_iso by #{fetch[:class].to_s}" do
|
157
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
158
|
+
web_stat.url = "https://newsdict.blog"
|
159
|
+
expect(web_stat.stat[:language_code]).to eq("ja")
|
160
|
+
end
|
161
|
+
|
162
|
+
it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
|
163
|
+
web_stat = fetch[:class].new(fetch[:fixture])
|
164
|
+
web_stat.url = "https://newsdict.blog"
|
165
|
+
expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
69
169
|
it "WebStat.stat_by_html" do
|
70
170
|
WebStatTestHelper.htmls.each do |fixture|
|
71
171
|
web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
|
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
|
|
104
204
|
expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
|
105
205
|
expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
|
106
206
|
end
|
107
|
-
end
|
207
|
+
end
|
data/web_stat.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_runtime_dependency "sanitize", ">= 5.0.0"
|
29
29
|
spec.add_runtime_dependency "cld", ">= 0.8.0"
|
30
30
|
spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
|
31
|
+
spec.add_runtime_dependency "pdf-reader", "2.4.0"
|
31
32
|
|
32
33
|
spec.add_development_dependency "rake", ">= 10.0"
|
33
34
|
spec.add_development_dependency "rspec", ">= 3.0"
|
34
35
|
spec.add_development_dependency "pry", ">= 0.13.1"
|
35
|
-
spec.add_development_dependency "webmock", ">= 3.
|
36
|
+
spec.add_development_dependency "webmock", ">= 3.8.3"
|
36
37
|
spec.add_development_dependency "pry-byebug", "3.9.0"
|
37
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_stat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.14
|
4
|
+
version: 0.3.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yusuke abe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - '='
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: 3.142.7
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pdf-reader
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 2.4.0
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: 2.4.0
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: rake
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,14 +184,14 @@ dependencies:
|
|
170
184
|
requirements:
|
171
185
|
- - ">="
|
172
186
|
- !ruby/object:Gem::Version
|
173
|
-
version: 3.
|
187
|
+
version: 3.8.3
|
174
188
|
type: :development
|
175
189
|
prerelease: false
|
176
190
|
version_requirements: !ruby/object:Gem::Requirement
|
177
191
|
requirements:
|
178
192
|
- - ">="
|
179
193
|
- !ruby/object:Gem::Version
|
180
|
-
version: 3.
|
194
|
+
version: 3.8.3
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
196
|
name: pry-byebug
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -233,6 +247,8 @@ files:
|
|
233
247
|
- spec/fixtures/htmls/image.html
|
234
248
|
- spec/fixtures/images/facebook-3.jpg
|
235
249
|
- spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
|
250
|
+
- spec/fixtures/pdfs/newsdict.blog.pdf
|
251
|
+
- spec/fixtures/pdfs/rfc2616.pdf
|
236
252
|
- spec/spec_helper.rb
|
237
253
|
- spec/web_stat/configure_spec.rb
|
238
254
|
- spec/web_stat/fetch_spec.rb
|
@@ -268,6 +284,8 @@ test_files:
|
|
268
284
|
- spec/fixtures/htmls/image.html
|
269
285
|
- spec/fixtures/images/facebook-3.jpg
|
270
286
|
- spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
|
287
|
+
- spec/fixtures/pdfs/newsdict.blog.pdf
|
288
|
+
- spec/fixtures/pdfs/rfc2616.pdf
|
271
289
|
- spec/spec_helper.rb
|
272
290
|
- spec/web_stat/configure_spec.rb
|
273
291
|
- spec/web_stat/fetch_spec.rb
|