web_stat 0.3.14 → 0.3.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 549db2077648ce028b556a72126335f05336155f86c4b18d47856f219c71fff0
4
- data.tar.gz: be8e0cee272fc20013659346608bf9a50a69dc48e777a46214c29f5c1865232d
3
+ metadata.gz: 82732f779004a5a2ef1f7259ce40126de8f51dfb07e46cd4972aafd3ab386ac3
4
+ data.tar.gz: 17c2b99bdeb5db8c134d107fc9d4957dab985c7e680b1c92060bf7090315477b
5
5
  SHA512:
6
- metadata.gz: c78fa085f475c7cdf0747b4c777e357503ce41929bbe5462ccda7b28a6cf4f20a5394deb7853ee11570a4b3338573e392f8697e61484649fe06beadc54aa38a8
7
- data.tar.gz: 941f0de20548a37899ac7610bd95b381ac18ca71761ccfb7f7788c299dd41ff521ae7eea283df64ee624044f919d2951f396ec38c3c4a15ded378ee0acbd0a20
6
+ metadata.gz: d6691a57b0498fcfbb609042cdb7e12b4ac453b5cedaf6dd39671090c97fc19958a4162ba50947392116320ba78c9a21491acab27be2a41287825d3d4d2194d1
7
+ data.tar.gz: 1e2e3d33f5c232532f442bbd16540c47810c93caef436a351124165b5294ddc7354d766ea66aa24ebc052c69432a639a6d5ef7b61a7cde896f6dc4676f95650d
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.12)
4
+ web_stat (0.3.15)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
+ pdf-reader (= 2.4.0)
10
11
  ruby-readability (>= 0.7)
11
12
  sanitize (>= 5.0.0)
12
13
  selenium-webdriver (= 3.142.7)
@@ -14,8 +15,10 @@ PATH
14
15
  GEM
15
16
  remote: https://rubygems.org/
16
17
  specs:
18
+ Ascii85 (1.0.3)
17
19
  addressable (2.7.0)
18
20
  public_suffix (>= 2.0.2, < 5.0)
21
+ afm (0.2.2)
19
22
  byebug (11.1.3)
20
23
  childprocess (3.0.0)
21
24
  cld (0.8.0)
@@ -31,6 +34,7 @@ GEM
31
34
  ffi (1.13.1)
32
35
  guess_html_encoding (0.0.11)
33
36
  hashdiff (1.0.1)
37
+ hashery (2.1.2)
34
38
  http-cookie (1.0.3)
35
39
  domain_name (~> 0.5)
36
40
  mechanize (2.7.6)
@@ -57,6 +61,12 @@ GEM
57
61
  nokogumbo (2.0.2)
58
62
  nokogiri (~> 1.8, >= 1.8.4)
59
63
  ntlm-http (0.1.1)
64
+ pdf-reader (2.4.0)
65
+ Ascii85 (~> 1.0.0)
66
+ afm (~> 0.2.1)
67
+ hashery (~> 2.0)
68
+ ruby-rc4
69
+ ttfunk
60
70
  pry (0.13.1)
61
71
  coderay (~> 1.1)
62
72
  method_source (~> 1.0)
@@ -78,18 +88,20 @@ GEM
78
88
  diff-lcs (>= 1.2.0, < 2.0)
79
89
  rspec-support (~> 3.9.0)
80
90
  rspec-support (3.9.3)
91
+ ruby-rc4 (0.1.5)
81
92
  ruby-readability (0.7.0)
82
93
  guess_html_encoding (>= 0.0.4)
83
94
  nokogiri (>= 1.6.0)
84
95
  rubyzip (2.3.0)
85
96
  safe_yaml (1.0.5)
86
- sanitize (5.2.0)
97
+ sanitize (5.2.1)
87
98
  crass (~> 1.0.2)
88
99
  nokogiri (>= 1.8.0)
89
100
  nokogumbo (~> 2.0)
90
101
  selenium-webdriver (3.142.7)
91
102
  childprocess (>= 0.5, < 4.0)
92
103
  rubyzip (>= 1.2.2)
104
+ ttfunk (1.6.2.1)
93
105
  unf (0.1.4)
94
106
  unf_ext
95
107
  unf_ext (0.0.7.7)
@@ -108,7 +120,7 @@ DEPENDENCIES
108
120
  rake (>= 10.0)
109
121
  rspec (>= 3.0)
110
122
  web_stat!
111
- webmock (>= 3.6.0)
123
+ webmock (>= 3.8.3)
112
124
 
113
125
  BUNDLED WITH
114
126
  2.1.4
@@ -8,6 +8,7 @@ require 'sanitize'
8
8
  require 'nokogiri'
9
9
  require 'open-uri'
10
10
  require 'net/http'
11
+ require 'pdf/reader'
11
12
  require 'ruby-readability'
12
13
  require 'selenium-webdriver'
13
14
 
@@ -9,7 +9,31 @@ module WebStat
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
12
- @html = get_url(@url)
12
+ if @url.match?(/\.pdf$/)
13
+ title = nil
14
+ body = nil
15
+ URI.open(@url) do |io|
16
+ reader = PDF::Reader.new(io)
17
+ if reader.info.key?(:Title)
18
+ title = reader.info[:Title]
19
+ else
20
+ title = File.basename(@url, ".pdf")
21
+ end
22
+ body = reader.pages.first.text
23
+ end
24
+ @html = <<-"EOS"
25
+ <html>
26
+ <head>
27
+ <title>#{title}</title>
28
+ </head>
29
+ <body>
30
+ #{body}
31
+ </body>
32
+ </html>
33
+ EOS
34
+ else
35
+ @html = get_url(@url)
36
+ end
13
37
  @nokogiri = ::Nokogiri::HTML(@html)
14
38
  end
15
39
 
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.15"
3
3
  end
@@ -6,8 +6,9 @@ require "web_stat"
6
6
 
7
7
  require 'webmock'
8
8
  include WebMock::API
9
+ ENV['ENV'] = 'test'
9
10
  WebMock.enable!
10
-
11
+
11
12
  RSpec.configure do |config|
12
13
  # Enable flags like --only-failures and --next-failure
13
14
  config.example_status_persistence_file_path = ".rspec_status"
@@ -52,7 +53,14 @@ module WebStatTestHelper
52
53
  # Get htmls of fixture
53
54
  def scheme_and_files
54
55
  Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
55
- "https://newsdict.blog/#{File.basename(file)}"
56
+ "https://newsdict.blog/#{File.basename(file)}"
57
+ end
58
+ end
59
+
60
+ # Get pdfs of fixture
61
+ def pdfs
62
+ Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
63
+ "https://newsdict.blog/#{File.basename(file)}"
56
64
  end
57
65
  end
58
66
  end
@@ -67,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
67
75
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
68
76
  headers: {content_type: 'application/html; charset=utf-8'})
69
77
  end
78
+ WebStatTestHelper.pdfs.each do |url|
79
+ WebMock.stub_request(:get, url)
80
+ .to_return(
81
+ status: 200,
82
+ body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
83
+ headers: {content_type: 'application/pdf'})
84
+ end
70
85
 
71
86
  WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
72
87
  .to_return(
@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
66
66
  end
67
67
  end
68
68
 
69
+ [{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
70
+
71
+ it "Get title by #{fetch[:class].to_s}" do
72
+ web_stat = fetch[:class].new(fetch[:fixture])
73
+ expect(web_stat.title).to eq "Microsoft Word"
74
+ end
75
+
76
+ it "Get site name by #{fetch[:class].to_s}" do
77
+ web_stat = fetch[:class].new(fetch[:fixture])
78
+ expect(web_stat.site_name).to eq "RFC2616.doc"
79
+ end
80
+
81
+ it "Get Document's content by #{fetch[:class].to_s}" do
82
+ web_stat = fetch[:class].new(fetch[:fixture])
83
+ expect(web_stat.content).not_to eq nil
84
+ end
85
+
86
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
87
+ web_stat = fetch[:class].new(fetch[:fixture])
88
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
89
+ end
90
+
91
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
92
+ web_stat = fetch[:class].new(fetch[:fixture])
93
+ web_stat.url = "https://newsdict.blog"
94
+ unless web_stat.stat[:eyecatch_image_path].nil?
95
+ image = File.read(web_stat.stat[:eyecatch_image_path])
96
+ expect(image.encoding.to_s).to eq("UTF-8")
97
+ end
98
+ end
99
+
100
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
101
+ web_stat = fetch[:class].new(fetch[:fixture])
102
+ web_stat.url = "https://newsdict.blog"
103
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
104
+ end
105
+
106
+ it "Get language_iso by #{fetch[:class].to_s}" do
107
+ web_stat = fetch[:class].new(fetch[:fixture])
108
+ web_stat.url = "https://newsdict.blog"
109
+ expect(web_stat.stat[:language_code]).to eq("en")
110
+ end
111
+
112
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
113
+ web_stat = fetch[:class].new(fetch[:fixture])
114
+ web_stat.url = "https://newsdict.blog"
115
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
116
+ end
117
+ end
118
+
119
+ [{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
120
+
121
+ it "Get title by #{fetch[:class].to_s}" do
122
+ web_stat = fetch[:class].new(fetch[:fixture])
123
+ expect(web_stat.title).to eq "newsdict.blog"
124
+ end
125
+
126
+ it "Get site name by #{fetch[:class].to_s}" do
127
+ web_stat = fetch[:class].new(fetch[:fixture])
128
+ expect(web_stat.site_name).to eq "newsdict.blog"
129
+ end
130
+
131
+ it "Get Document's content by #{fetch[:class].to_s}" do
132
+ web_stat = fetch[:class].new(fetch[:fixture])
133
+ expect(web_stat.content).not_to eq nil
134
+ end
135
+
136
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
137
+ web_stat = fetch[:class].new(fetch[:fixture])
138
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
139
+ end
140
+
141
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
142
+ web_stat = fetch[:class].new(fetch[:fixture])
143
+ web_stat.url = "https://newsdict.blog"
144
+ unless web_stat.stat[:eyecatch_image_path].nil?
145
+ image = File.read(web_stat.stat[:eyecatch_image_path])
146
+ expect(image.encoding.to_s).to eq("UTF-8")
147
+ end
148
+ end
149
+
150
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
151
+ web_stat = fetch[:class].new(fetch[:fixture])
152
+ web_stat.url = "https://newsdict.blog"
153
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
154
+ end
155
+
156
+ it "Get language_iso by #{fetch[:class].to_s}" do
157
+ web_stat = fetch[:class].new(fetch[:fixture])
158
+ web_stat.url = "https://newsdict.blog"
159
+ expect(web_stat.stat[:language_code]).to eq("ja")
160
+ end
161
+
162
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
163
+ web_stat = fetch[:class].new(fetch[:fixture])
164
+ web_stat.url = "https://newsdict.blog"
165
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
166
+ end
167
+ end
168
+
69
169
  it "WebStat.stat_by_html" do
70
170
  WebStatTestHelper.htmls.each do |fixture|
71
171
  web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
104
204
  expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
105
205
  expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
106
206
  end
107
- end
207
+ end
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
+ spec.add_runtime_dependency "pdf-reader", "2.4.0"
31
32
 
32
33
  spec.add_development_dependency "rake", ">= 10.0"
33
34
  spec.add_development_dependency "rspec", ">= 3.0"
34
35
  spec.add_development_dependency "pry", ">= 0.13.1"
35
- spec.add_development_dependency "webmock", ">= 3.6.0"
36
+ spec.add_development_dependency "webmock", ">= 3.8.3"
36
37
  spec.add_development_dependency "pry-byebug", "3.9.0"
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-21 00:00:00.000000000 Z
11
+ date: 2020-07-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - '='
123
123
  - !ruby/object:Gem::Version
124
124
  version: 3.142.7
125
+ - !ruby/object:Gem::Dependency
126
+ name: pdf-reader
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '='
130
+ - !ruby/object:Gem::Version
131
+ version: 2.4.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '='
137
+ - !ruby/object:Gem::Version
138
+ version: 2.4.0
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: rake
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -170,14 +184,14 @@ dependencies:
170
184
  requirements:
171
185
  - - ">="
172
186
  - !ruby/object:Gem::Version
173
- version: 3.6.0
187
+ version: 3.8.3
174
188
  type: :development
175
189
  prerelease: false
176
190
  version_requirements: !ruby/object:Gem::Requirement
177
191
  requirements:
178
192
  - - ">="
179
193
  - !ruby/object:Gem::Version
180
- version: 3.6.0
194
+ version: 3.8.3
181
195
  - !ruby/object:Gem::Dependency
182
196
  name: pry-byebug
183
197
  requirement: !ruby/object:Gem::Requirement
@@ -233,6 +247,8 @@ files:
233
247
  - spec/fixtures/htmls/image.html
234
248
  - spec/fixtures/images/facebook-3.jpg
235
249
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
250
+ - spec/fixtures/pdfs/newsdict.blog.pdf
251
+ - spec/fixtures/pdfs/rfc2616.pdf
236
252
  - spec/spec_helper.rb
237
253
  - spec/web_stat/configure_spec.rb
238
254
  - spec/web_stat/fetch_spec.rb
@@ -268,6 +284,8 @@ test_files:
268
284
  - spec/fixtures/htmls/image.html
269
285
  - spec/fixtures/images/facebook-3.jpg
270
286
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
287
+ - spec/fixtures/pdfs/newsdict.blog.pdf
288
+ - spec/fixtures/pdfs/rfc2616.pdf
271
289
  - spec/spec_helper.rb
272
290
  - spec/web_stat/configure_spec.rb
273
291
  - spec/web_stat/fetch_spec.rb