web_stat 0.3.11 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a94017f641fb1f84d67ea5d650c094df5a1d738ccbbf880c112f95da4a2792b
4
- data.tar.gz: 7383ba299f9f02bae998a104e84302ea99a11a28c77ad42fa23637eec9e2a922
3
+ metadata.gz: c332bb9cf67262c2b5d8e3c30c861107bd6ac8e3d86136730864e49b10dabcce
4
+ data.tar.gz: 712f39109989e917e8af6b84fe40b64ce84009a9de4b3647c03177ecb69bc58d
5
5
  SHA512:
6
- metadata.gz: 4c7d593118d75755f3db68a9f7995de60c63c5c47ba92c30a4772bc6ade5cefda14c8b44a87475ebc6cb3ab1a0badefcd3df1f6d2038aec714af654a07c89ccd
7
- data.tar.gz: 2aa7dc288acd6de6207ee8d3e2833ba2b480b99a6d3ffeed651cce6982bbb557adfe173dfd3af1d10345bb0ad4710f1e4b434272039bb8a988beabb35811c5b4
6
+ metadata.gz: 3607fbe1e76018e3e523ed26a2777837c539a80f5c68da195996616d3566f6034125f0d49e96db9338ae40aeda5dac000968f6e089a7d96c19a412f14d2acbd2
7
+ data.tar.gz: 027d3b911cbee0dfdd7a8f85b32d24e4b8c9e2f916a47a2d3ea31b34fe0e2f07b452e00372d2028d511d39fba7df8f26347789fd795103422711d0e979b1d8a2
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.10)
4
+ web_stat (0.3.15)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
+ pdf-reader (= 2.4.0)
10
11
  ruby-readability (>= 0.7)
11
12
  sanitize (>= 5.0.0)
12
13
  selenium-webdriver (= 3.142.7)
@@ -14,8 +15,10 @@ PATH
14
15
  GEM
15
16
  remote: https://rubygems.org/
16
17
  specs:
18
+ Ascii85 (1.0.3)
17
19
  addressable (2.7.0)
18
20
  public_suffix (>= 2.0.2, < 5.0)
21
+ afm (0.2.2)
19
22
  byebug (11.1.3)
20
23
  childprocess (3.0.0)
21
24
  cld (0.8.0)
@@ -31,6 +34,7 @@ GEM
31
34
  ffi (1.13.1)
32
35
  guess_html_encoding (0.0.11)
33
36
  hashdiff (1.0.1)
37
+ hashery (2.1.2)
34
38
  http-cookie (1.0.3)
35
39
  domain_name (~> 0.5)
36
40
  mechanize (2.7.6)
@@ -57,6 +61,12 @@ GEM
57
61
  nokogumbo (2.0.2)
58
62
  nokogiri (~> 1.8, >= 1.8.4)
59
63
  ntlm-http (0.1.1)
64
+ pdf-reader (2.4.0)
65
+ Ascii85 (~> 1.0.0)
66
+ afm (~> 0.2.1)
67
+ hashery (~> 2.0)
68
+ ruby-rc4
69
+ ttfunk
60
70
  pry (0.13.1)
61
71
  coderay (~> 1.1)
62
72
  method_source (~> 1.0)
@@ -78,18 +88,20 @@ GEM
78
88
  diff-lcs (>= 1.2.0, < 2.0)
79
89
  rspec-support (~> 3.9.0)
80
90
  rspec-support (3.9.3)
91
+ ruby-rc4 (0.1.5)
81
92
  ruby-readability (0.7.0)
82
93
  guess_html_encoding (>= 0.0.4)
83
94
  nokogiri (>= 1.6.0)
84
95
  rubyzip (2.3.0)
85
96
  safe_yaml (1.0.5)
86
- sanitize (5.2.0)
97
+ sanitize (5.2.1)
87
98
  crass (~> 1.0.2)
88
99
  nokogiri (>= 1.8.0)
89
100
  nokogumbo (~> 2.0)
90
101
  selenium-webdriver (3.142.7)
91
102
  childprocess (>= 0.5, < 4.0)
92
103
  rubyzip (>= 1.2.2)
104
+ ttfunk (1.6.2.1)
93
105
  unf (0.1.4)
94
106
  unf_ext
95
107
  unf_ext (0.0.7.7)
@@ -108,7 +120,7 @@ DEPENDENCIES
108
120
  rake (>= 10.0)
109
121
  rspec (>= 3.0)
110
122
  web_stat!
111
- webmock (>= 3.6.0)
123
+ webmock (>= 3.8.3)
112
124
 
113
125
  BUNDLED WITH
114
126
  2.1.4
data/README.md CHANGED
@@ -55,12 +55,10 @@ And then execute:
55
55
 
56
56
  ### spec
57
57
 
58
- $ bundle exec rake spec
59
-
60
- or
61
-
62
- $ bundle exec rspec
58
+ $ docker/start -d
59
+ $ docker/exec ENV=development bundle exec rspec
63
60
 
64
61
  Test a file
65
62
 
66
- $ bundle exec rspec spec/web_stat/fetch_spec.rb
63
+ $ docker/start -d
64
+ $ docker/exec ENV=development bundle exec rspec spec/web_stat/fetch_spec.rb
@@ -8,6 +8,7 @@ require 'sanitize'
8
8
  require 'nokogiri'
9
9
  require 'open-uri'
10
10
  require 'net/http'
11
+ require 'pdf/reader'
11
12
  require 'ruby-readability'
12
13
  require 'selenium-webdriver'
13
14
 
@@ -9,7 +9,7 @@ module WebStat
9
9
  if defined? Rails
10
10
  YAML.load_file(get_configure_path)[Rails.env]
11
11
  else
12
- YAML.load_file(get_configure_path)["production"]
12
+ YAML.load_file(get_configure_path)[ENV["ENV"] || "production"]
13
13
  end
14
14
  end
15
15
 
@@ -1,7 +1,12 @@
1
1
  module WebStat
2
2
  class Fetch
3
+ THUMBNAIL_REGEXS = {
4
+ :youtube => [
5
+ %r{^https://www.youtube.com/watch\?v=([^&]+)},
6
+ 'http://img.youtube.com/vi/\1/default.jpg'
7
+ ]
8
+ }
3
9
  attr_accessor :url, :html, :nokogiri, :userdic, :status
4
-
5
10
  # Get title
6
11
  # @return [String] title
7
12
  def title
@@ -19,7 +24,8 @@ module WebStat
19
24
  title.strip
20
25
  end
21
26
  end
22
- # Get name of domain
27
+
28
+ # Get name of domain
23
29
  def site_name
24
30
  begin
25
31
  site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -36,7 +42,7 @@ module WebStat
36
42
  def content
37
43
  Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
38
44
  end
39
-
45
+
40
46
  # Get temporary path of image
41
47
  def eyecatch_image_path
42
48
  # Reuse `path` in this method
@@ -47,8 +53,18 @@ module WebStat
47
53
  break
48
54
  end
49
55
  end
50
- if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
51
- path = @nokogiri.at('body').xpath('//img').first.attr('src')
56
+ # If there is a thumbnail rule, apply it.
57
+ THUMBNAIL_REGEXS.each do |provider, v|
58
+ if @url.match(v[0])
59
+ return @url.gsub(v[0], v[1])
60
+ end
61
+ end
62
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
63
+ if (path.nil? || path.empty?) && readability_content.xpath('//img').first
64
+ path = readability_content.xpath('//img').first.attr('src')
65
+ end
66
+ if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
67
+ path = @nokogiri.xpath('//img').first.attr('src')
52
68
  end
53
69
  if ! path.nil? && path.match(/^\//)
54
70
  "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
@@ -56,7 +72,7 @@ module WebStat
56
72
  path
57
73
  end
58
74
  end
59
-
75
+
60
76
  # Get local path to save url
61
77
  # @param [String] url
62
78
  def save_local_path(url)
@@ -73,7 +89,7 @@ module WebStat
73
89
  end
74
90
  tmp_file
75
91
  end
76
-
92
+
77
93
  # Get url
78
94
  # @param [String] url
79
95
  # @param [String] body
@@ -103,7 +119,7 @@ module WebStat
103
119
  end
104
120
  body
105
121
  end
106
-
122
+
107
123
  # Get the informations of @url
108
124
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
109
125
  def stat(userdics: nil)
@@ -9,7 +9,31 @@ module WebStat
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
12
- @html = get_url(@url)
12
+ if @url.match?(/\.pdf$/)
13
+ title = nil
14
+ body = nil
15
+ URI.open(@url) do |io|
16
+ reader = PDF::Reader.new(io)
17
+ if reader.info.key?(:Title)
18
+ title = reader.info[:Title]
19
+ else
20
+ title = File.basename(@url, ".pdf")
21
+ end
22
+ body = reader.pages.first.text
23
+ end
24
+ @html = <<-"EOS"
25
+ <html>
26
+ <head>
27
+ <title>#{title}</title>
28
+ </head>
29
+ <body>
30
+ #{body}
31
+ </body>
32
+ </html>
33
+ EOS
34
+ else
35
+ @html = get_url(@url)
36
+ end
13
37
  @nokogiri = ::Nokogiri::HTML(@html)
14
38
  end
15
39
 
@@ -11,7 +11,7 @@ module WebStat
11
11
  redirect_lookup_depth = options[:depth].to_i > 0 ? options[:depth].to_i : 10
12
12
  response_uri = get_final_redirect_url(url, redirect_lookup_depth)
13
13
  final_url = url_string_from_uri(response_uri)
14
- rescue Exception => ex
14
+ rescue => e
15
15
  # nothing
16
16
  end
17
17
  end
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.11"
2
+ VERSION = "0.3.16"
3
3
  end
@@ -6,13 +6,9 @@ require "web_stat"
6
6
 
7
7
  require 'webmock'
8
8
  include WebMock::API
9
+ ENV['ENV'] = 'test'
9
10
  WebMock.enable!
10
11
 
11
- WebMock.disable_net_connect!({
12
- allow_localhost: true,
13
- allow: 'chromedriver.storage.googleapis.com'
14
- })
15
-
16
12
  RSpec.configure do |config|
17
13
  # Enable flags like --only-failures and --next-failure
18
14
  config.example_status_persistence_file_path = ".rspec_status"
@@ -57,7 +53,14 @@ module WebStatTestHelper
57
53
  # Get htmls of fixture
58
54
  def scheme_and_files
59
55
  Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
60
- "https://newsdict.blog/#{File.basename(file)}"
56
+ "https://newsdict.blog/#{File.basename(file)}"
57
+ end
58
+ end
59
+
60
+ # Get pdfs of fixture
61
+ def pdfs
62
+ Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
63
+ "https://newsdict.blog/#{File.basename(file)}"
61
64
  end
62
65
  end
63
66
  end
@@ -72,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
72
75
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
73
76
  headers: {content_type: 'application/html; charset=utf-8'})
74
77
  end
78
+ WebStatTestHelper.pdfs.each do |url|
79
+ WebMock.stub_request(:get, url)
80
+ .to_return(
81
+ status: 200,
82
+ body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
83
+ headers: {content_type: 'application/pdf'})
84
+ end
75
85
 
76
86
  WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
77
87
  .to_return(
@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
66
66
  end
67
67
  end
68
68
 
69
+ [{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
70
+
71
+ it "Get title by #{fetch[:class].to_s}" do
72
+ web_stat = fetch[:class].new(fetch[:fixture])
73
+ expect(web_stat.title).to eq "Microsoft Word"
74
+ end
75
+
76
+ it "Get site name by #{fetch[:class].to_s}" do
77
+ web_stat = fetch[:class].new(fetch[:fixture])
78
+ expect(web_stat.site_name).to eq "RFC2616.doc"
79
+ end
80
+
81
+ it "Get Document's content by #{fetch[:class].to_s}" do
82
+ web_stat = fetch[:class].new(fetch[:fixture])
83
+ expect(web_stat.content).not_to eq nil
84
+ end
85
+
86
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
87
+ web_stat = fetch[:class].new(fetch[:fixture])
88
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
89
+ end
90
+
91
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
92
+ web_stat = fetch[:class].new(fetch[:fixture])
93
+ web_stat.url = "https://newsdict.blog"
94
+ unless web_stat.stat[:eyecatch_image_path].nil?
95
+ image = File.read(web_stat.stat[:eyecatch_image_path])
96
+ expect(image.encoding.to_s).to eq("UTF-8")
97
+ end
98
+ end
99
+
100
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
101
+ web_stat = fetch[:class].new(fetch[:fixture])
102
+ web_stat.url = "https://newsdict.blog"
103
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
104
+ end
105
+
106
+ it "Get language_iso by #{fetch[:class].to_s}" do
107
+ web_stat = fetch[:class].new(fetch[:fixture])
108
+ web_stat.url = "https://newsdict.blog"
109
+ expect(web_stat.stat[:language_code]).to eq("en")
110
+ end
111
+
112
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
113
+ web_stat = fetch[:class].new(fetch[:fixture])
114
+ web_stat.url = "https://newsdict.blog"
115
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
116
+ end
117
+ end
118
+
119
+ [{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
120
+
121
+ it "Get title by #{fetch[:class].to_s}" do
122
+ web_stat = fetch[:class].new(fetch[:fixture])
123
+ expect(web_stat.title).to eq "newsdict.blog"
124
+ end
125
+
126
+ it "Get site name by #{fetch[:class].to_s}" do
127
+ web_stat = fetch[:class].new(fetch[:fixture])
128
+ expect(web_stat.site_name).to eq "newsdict.blog"
129
+ end
130
+
131
+ it "Get Document's content by #{fetch[:class].to_s}" do
132
+ web_stat = fetch[:class].new(fetch[:fixture])
133
+ expect(web_stat.content).not_to eq nil
134
+ end
135
+
136
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
137
+ web_stat = fetch[:class].new(fetch[:fixture])
138
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
139
+ end
140
+
141
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
142
+ web_stat = fetch[:class].new(fetch[:fixture])
143
+ web_stat.url = "https://newsdict.blog"
144
+ unless web_stat.stat[:eyecatch_image_path].nil?
145
+ image = File.read(web_stat.stat[:eyecatch_image_path])
146
+ expect(image.encoding.to_s).to eq("UTF-8")
147
+ end
148
+ end
149
+
150
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
151
+ web_stat = fetch[:class].new(fetch[:fixture])
152
+ web_stat.url = "https://newsdict.blog"
153
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
154
+ end
155
+
156
+ it "Get language_iso by #{fetch[:class].to_s}" do
157
+ web_stat = fetch[:class].new(fetch[:fixture])
158
+ web_stat.url = "https://newsdict.blog"
159
+ expect(web_stat.stat[:language_code]).to eq("ja")
160
+ end
161
+
162
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
163
+ web_stat = fetch[:class].new(fetch[:fixture])
164
+ web_stat.url = "https://newsdict.blog"
165
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
166
+ end
167
+ end
168
+
69
169
  it "WebStat.stat_by_html" do
70
170
  WebStatTestHelper.htmls.each do |fixture|
71
171
  web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
104
204
  expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
105
205
  expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
106
206
  end
107
- end
207
+ end
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
+ spec.add_runtime_dependency "pdf-reader", "2.4.0"
31
32
 
32
33
  spec.add_development_dependency "rake", ">= 10.0"
33
34
  spec.add_development_dependency "rspec", ">= 3.0"
34
35
  spec.add_development_dependency "pry", ">= 0.13.1"
35
- spec.add_development_dependency "webmock", ">= 3.6.0"
36
+ spec.add_development_dependency "webmock", ">= 3.8.3"
36
37
  spec.add_development_dependency "pry-byebug", "3.9.0"
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-21 00:00:00.000000000 Z
11
+ date: 2020-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - '='
123
123
  - !ruby/object:Gem::Version
124
124
  version: 3.142.7
125
+ - !ruby/object:Gem::Dependency
126
+ name: pdf-reader
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '='
130
+ - !ruby/object:Gem::Version
131
+ version: 2.4.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '='
137
+ - !ruby/object:Gem::Version
138
+ version: 2.4.0
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: rake
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -170,14 +184,14 @@ dependencies:
170
184
  requirements:
171
185
  - - ">="
172
186
  - !ruby/object:Gem::Version
173
- version: 3.6.0
187
+ version: 3.8.3
174
188
  type: :development
175
189
  prerelease: false
176
190
  version_requirements: !ruby/object:Gem::Requirement
177
191
  requirements:
178
192
  - - ">="
179
193
  - !ruby/object:Gem::Version
180
- version: 3.6.0
194
+ version: 3.8.3
181
195
  - !ruby/object:Gem::Dependency
182
196
  name: pry-byebug
183
197
  requirement: !ruby/object:Gem::Requirement
@@ -233,6 +247,8 @@ files:
233
247
  - spec/fixtures/htmls/image.html
234
248
  - spec/fixtures/images/facebook-3.jpg
235
249
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
250
+ - spec/fixtures/pdfs/newsdict.blog.pdf
251
+ - spec/fixtures/pdfs/rfc2616.pdf
236
252
  - spec/spec_helper.rb
237
253
  - spec/web_stat/configure_spec.rb
238
254
  - spec/web_stat/fetch_spec.rb
@@ -268,6 +284,8 @@ test_files:
268
284
  - spec/fixtures/htmls/image.html
269
285
  - spec/fixtures/images/facebook-3.jpg
270
286
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
287
+ - spec/fixtures/pdfs/newsdict.blog.pdf
288
+ - spec/fixtures/pdfs/rfc2616.pdf
271
289
  - spec/spec_helper.rb
272
290
  - spec/web_stat/configure_spec.rb
273
291
  - spec/web_stat/fetch_spec.rb