web_stat 0.3.14 → 0.3.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 549db2077648ce028b556a72126335f05336155f86c4b18d47856f219c71fff0
4
- data.tar.gz: be8e0cee272fc20013659346608bf9a50a69dc48e777a46214c29f5c1865232d
3
+ metadata.gz: 2893819d947835e7cc92b35361c487ede791e0bd87ac7aa94bb0d0c3e28780a0
4
+ data.tar.gz: 35d90e33f07dc24fabeabca7aef669519053781ff71ddeec12ca85ee52521c3b
5
5
  SHA512:
6
- metadata.gz: c78fa085f475c7cdf0747b4c777e357503ce41929bbe5462ccda7b28a6cf4f20a5394deb7853ee11570a4b3338573e392f8697e61484649fe06beadc54aa38a8
7
- data.tar.gz: 941f0de20548a37899ac7610bd95b381ac18ca71761ccfb7f7788c299dd41ff521ae7eea283df64ee624044f919d2951f396ec38c3c4a15ded378ee0acbd0a20
6
+ metadata.gz: 6f576937d619990b2ccb72bf016975521065e052605c7c4835f233f0f58da866836cb961ea8bd8d213bebee84671717d73b35cc82a0a49dcef9ad04f276cacd3
7
+ data.tar.gz: 87711062737c14f00523ea51a91de50f5e9a00364c4df73a61bd49fcfa5559ae7b995718168b2fa48346d0f3e25af16aa98df16e56b4c6f1dea7ebf74a629664
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.12)
4
+ web_stat (0.3.19)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
+ pdf-reader (= 2.4.0)
10
11
  ruby-readability (>= 0.7)
11
12
  sanitize (>= 5.0.0)
12
13
  selenium-webdriver (= 3.142.7)
@@ -14,8 +15,10 @@ PATH
14
15
  GEM
15
16
  remote: https://rubygems.org/
16
17
  specs:
18
+ Ascii85 (1.0.3)
17
19
  addressable (2.7.0)
18
20
  public_suffix (>= 2.0.2, < 5.0)
21
+ afm (0.2.2)
19
22
  byebug (11.1.3)
20
23
  childprocess (3.0.0)
21
24
  cld (0.8.0)
@@ -31,6 +34,7 @@ GEM
31
34
  ffi (1.13.1)
32
35
  guess_html_encoding (0.0.11)
33
36
  hashdiff (1.0.1)
37
+ hashery (2.1.2)
34
38
  http-cookie (1.0.3)
35
39
  domain_name (~> 0.5)
36
40
  mechanize (2.7.6)
@@ -52,11 +56,17 @@ GEM
52
56
  net-http-digest_auth (1.4.1)
53
57
  net-http-persistent (4.0.0)
54
58
  connection_pool (~> 2.2)
55
- nokogiri (1.10.9)
59
+ nokogiri (1.10.10)
56
60
  mini_portile2 (~> 2.4.0)
57
61
  nokogumbo (2.0.2)
58
62
  nokogiri (~> 1.8, >= 1.8.4)
59
63
  ntlm-http (0.1.1)
64
+ pdf-reader (2.4.0)
65
+ Ascii85 (~> 1.0.0)
66
+ afm (~> 0.2.1)
67
+ hashery (~> 2.0)
68
+ ruby-rc4
69
+ ttfunk
60
70
  pry (0.13.1)
61
71
  coderay (~> 1.1)
62
72
  method_source (~> 1.0)
@@ -78,18 +88,20 @@ GEM
78
88
  diff-lcs (>= 1.2.0, < 2.0)
79
89
  rspec-support (~> 3.9.0)
80
90
  rspec-support (3.9.3)
91
+ ruby-rc4 (0.1.5)
81
92
  ruby-readability (0.7.0)
82
93
  guess_html_encoding (>= 0.0.4)
83
94
  nokogiri (>= 1.6.0)
84
95
  rubyzip (2.3.0)
85
96
  safe_yaml (1.0.5)
86
- sanitize (5.2.0)
97
+ sanitize (5.2.1)
87
98
  crass (~> 1.0.2)
88
99
  nokogiri (>= 1.8.0)
89
100
  nokogumbo (~> 2.0)
90
101
  selenium-webdriver (3.142.7)
91
102
  childprocess (>= 0.5, < 4.0)
92
103
  rubyzip (>= 1.2.2)
104
+ ttfunk (1.6.2.1)
93
105
  unf (0.1.4)
94
106
  unf_ext
95
107
  unf_ext (0.0.7.7)
@@ -108,7 +120,7 @@ DEPENDENCIES
108
120
  rake (>= 10.0)
109
121
  rspec (>= 3.0)
110
122
  web_stat!
111
- webmock (>= 3.6.0)
123
+ webmock (>= 3.8.3)
112
124
 
113
125
  BUNDLED WITH
114
126
  2.1.4
@@ -8,6 +8,7 @@ require 'sanitize'
8
8
  require 'nokogiri'
9
9
  require 'open-uri'
10
10
  require 'net/http'
11
+ require 'pdf/reader'
11
12
  require 'ruby-readability'
12
13
  require 'selenium-webdriver'
13
14
 
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,7 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
4
-
5
4
  # Get title
6
5
  # @return [String] title
7
6
  def title
@@ -19,7 +18,8 @@ module WebStat
19
18
  title.strip
20
19
  end
21
20
  end
22
- # Get name of domain
21
+
22
+ # Get name of domain
23
23
  def site_name
24
24
  begin
25
25
  site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -36,7 +36,7 @@ module WebStat
36
36
  def content
37
37
  Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
38
38
  end
39
-
39
+
40
40
  # Get temporary path of image
41
41
  def eyecatch_image_path
42
42
  # Reuse `path` in this method
@@ -47,9 +47,15 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
+ # If there is a thumbnail rule, apply it.
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
52
+ if @url.match(v[0])
53
+ return @url.gsub(v[0], v[1])
54
+ end
55
+ end
50
56
  readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
51
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
52
- path = readability_content.xpath('//img').first.attr('src')
58
+ path = readability_content.xpath('//img').first.attr('src')
53
59
  end
54
60
  if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
55
61
  path = @nokogiri.xpath('//img').first.attr('src')
@@ -60,7 +66,7 @@ module WebStat
60
66
  path
61
67
  end
62
68
  end
63
-
69
+
64
70
  # Get local path to save url
65
71
  # @param [String] url
66
72
  def save_local_path(url)
@@ -71,13 +77,13 @@ module WebStat
71
77
  File.open(tmp_file, "w+b") do |_file|
72
78
  if image.class == Mechanize::File
73
79
  _file.puts(image.body)
74
- else
80
+ elsif image.respond_to?(:body_io)
75
81
  _file.puts(image.body_io.read)
76
82
  end
77
83
  end
78
84
  tmp_file
79
85
  end
80
-
86
+
81
87
  # Get url
82
88
  # @param [String] url
83
89
  # @param [String] body
@@ -107,7 +113,7 @@ module WebStat
107
113
  end
108
114
  body
109
115
  end
110
-
116
+
111
117
  # Get the information for @url
112
118
  # @param [Hash] userdics Specify a dictionary for each language code, e.g. {"ja": /***/**.dic, "other": /***/***.dic}
113
119
  def stat(userdics: nil)
@@ -9,7 +9,31 @@ module WebStat
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
12
- @html = get_url(@url)
12
+ if @url.match?(/\.pdf$/)
13
+ title = nil
14
+ body = nil
15
+ URI.open(@url) do |io|
16
+ reader = PDF::Reader.new(io)
17
+ if reader.info.key?(:Title)
18
+ title = reader.info[:Title]
19
+ else
20
+ title = File.basename(@url, ".pdf")
21
+ end
22
+ body = reader.pages.first.text
23
+ end
24
+ @html = <<-"EOS"
25
+ <html>
26
+ <head>
27
+ <title>#{title}</title>
28
+ </head>
29
+ <body>
30
+ #{body}
31
+ </body>
32
+ </html>
33
+ EOS
34
+ else
35
+ @html = get_url(@url)
36
+ end
13
37
  @nokogiri = ::Nokogiri::HTML(@html)
14
38
  end
15
39
 
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.19"
3
3
  end
@@ -6,8 +6,9 @@ require "web_stat"
6
6
 
7
7
  require 'webmock'
8
8
  include WebMock::API
9
+ ENV['ENV'] = 'test'
9
10
  WebMock.enable!
10
-
11
+
11
12
  RSpec.configure do |config|
12
13
  # Enable flags like --only-failures and --next-failure
13
14
  config.example_status_persistence_file_path = ".rspec_status"
@@ -52,7 +53,14 @@ module WebStatTestHelper
52
53
  # Get htmls of fixture
53
54
  def scheme_and_files
54
55
  Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
55
- "https://newsdict.blog/#{File.basename(file)}"
56
+ "https://newsdict.blog/#{File.basename(file)}"
57
+ end
58
+ end
59
+
60
+ # Get pdfs of fixture
61
+ def pdfs
62
+ Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
63
+ "https://newsdict.blog/#{File.basename(file)}"
56
64
  end
57
65
  end
58
66
  end
@@ -67,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
67
75
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
68
76
  headers: {content_type: 'application/html; charset=utf-8'})
69
77
  end
78
+ WebStatTestHelper.pdfs.each do |url|
79
+ WebMock.stub_request(:get, url)
80
+ .to_return(
81
+ status: 200,
82
+ body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
83
+ headers: {content_type: 'application/pdf'})
84
+ end
70
85
 
71
86
  WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
72
87
  .to_return(
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
66
66
  end
67
67
  end
68
68
 
69
+ [{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
70
+
71
+ it "Get title by #{fetch[:class].to_s}" do
72
+ web_stat = fetch[:class].new(fetch[:fixture])
73
+ expect(web_stat.title).to eq "Microsoft Word"
74
+ end
75
+
76
+ it "Get site name by #{fetch[:class].to_s}" do
77
+ web_stat = fetch[:class].new(fetch[:fixture])
78
+ expect(web_stat.site_name).to eq "RFC2616.doc"
79
+ end
80
+
81
+ it "Get Document's content by #{fetch[:class].to_s}" do
82
+ web_stat = fetch[:class].new(fetch[:fixture])
83
+ expect(web_stat.content).not_to eq nil
84
+ end
85
+
86
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
87
+ web_stat = fetch[:class].new(fetch[:fixture])
88
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
89
+ end
90
+
91
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
92
+ web_stat = fetch[:class].new(fetch[:fixture])
93
+ web_stat.url = "https://newsdict.blog"
94
+ unless web_stat.stat[:eyecatch_image_path].nil?
95
+ image = File.read(web_stat.stat[:eyecatch_image_path])
96
+ expect(image.encoding.to_s).to eq("UTF-8")
97
+ end
98
+ end
99
+
100
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
101
+ web_stat = fetch[:class].new(fetch[:fixture])
102
+ web_stat.url = "https://newsdict.blog"
103
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
104
+ end
105
+
106
+ it "Get language_iso by #{fetch[:class].to_s}" do
107
+ web_stat = fetch[:class].new(fetch[:fixture])
108
+ web_stat.url = "https://newsdict.blog"
109
+ expect(web_stat.stat[:language_code]).to eq("en")
110
+ end
111
+
112
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
113
+ web_stat = fetch[:class].new(fetch[:fixture])
114
+ web_stat.url = "https://newsdict.blog"
115
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
116
+ end
117
+ end
118
+
119
+ [{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
120
+
121
+ it "Get title by #{fetch[:class].to_s}" do
122
+ web_stat = fetch[:class].new(fetch[:fixture])
123
+ expect(web_stat.title).to eq "newsdict.blog"
124
+ end
125
+
126
+ it "Get site name by #{fetch[:class].to_s}" do
127
+ web_stat = fetch[:class].new(fetch[:fixture])
128
+ expect(web_stat.site_name).to eq "newsdict.blog"
129
+ end
130
+
131
+ it "Get Document's content by #{fetch[:class].to_s}" do
132
+ web_stat = fetch[:class].new(fetch[:fixture])
133
+ expect(web_stat.content).not_to eq nil
134
+ end
135
+
136
+ it "WebStat content do not include html by #{fetch[:class].to_s}" do
137
+ web_stat = fetch[:class].new(fetch[:fixture])
138
+ expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
139
+ end
140
+
141
+ it "Get eyecatch image blob by #{fetch[:class].to_s}" do
142
+ web_stat = fetch[:class].new(fetch[:fixture])
143
+ web_stat.url = "https://newsdict.blog"
144
+ unless web_stat.stat[:eyecatch_image_path].nil?
145
+ image = File.read(web_stat.stat[:eyecatch_image_path])
146
+ expect(image.encoding.to_s).to eq("UTF-8")
147
+ end
148
+ end
149
+
150
+ it "Get eyecatch image path by #{fetch[:class].to_s}" do
151
+ web_stat = fetch[:class].new(fetch[:fixture])
152
+ web_stat.url = "https://newsdict.blog"
153
+ expect(web_stat.eyecatch_image_path).to be_string_or_nil
154
+ end
155
+
156
+ it "Get language_iso by #{fetch[:class].to_s}" do
157
+ web_stat = fetch[:class].new(fetch[:fixture])
158
+ web_stat.url = "https://newsdict.blog"
159
+ expect(web_stat.stat[:language_code]).to eq("ja")
160
+ end
161
+
162
+ it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
163
+ web_stat = fetch[:class].new(fetch[:fixture])
164
+ web_stat.url = "https://newsdict.blog"
165
+ expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
166
+ end
167
+ end
168
+
69
169
  it "WebStat.stat_by_html" do
70
170
  WebStatTestHelper.htmls.each do |fixture|
71
171
  web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
104
204
  expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
105
205
  expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
106
206
  end
107
- end
207
+ end
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
+ spec.add_runtime_dependency "pdf-reader", "2.4.0"
31
32
 
32
33
  spec.add_development_dependency "rake", ">= 10.0"
33
34
  spec.add_development_dependency "rspec", ">= 3.0"
34
35
  spec.add_development_dependency "pry", ">= 0.13.1"
35
- spec.add_development_dependency "webmock", ">= 3.6.0"
36
+ spec.add_development_dependency "webmock", ">= 3.8.3"
36
37
  spec.add_development_dependency "pry-byebug", "3.9.0"
37
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-21 00:00:00.000000000 Z
11
+ date: 2020-11-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - '='
123
123
  - !ruby/object:Gem::Version
124
124
  version: 3.142.7
125
+ - !ruby/object:Gem::Dependency
126
+ name: pdf-reader
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '='
130
+ - !ruby/object:Gem::Version
131
+ version: 2.4.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '='
137
+ - !ruby/object:Gem::Version
138
+ version: 2.4.0
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: rake
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -170,14 +184,14 @@ dependencies:
170
184
  requirements:
171
185
  - - ">="
172
186
  - !ruby/object:Gem::Version
173
- version: 3.6.0
187
+ version: 3.8.3
174
188
  type: :development
175
189
  prerelease: false
176
190
  version_requirements: !ruby/object:Gem::Requirement
177
191
  requirements:
178
192
  - - ">="
179
193
  - !ruby/object:Gem::Version
180
- version: 3.6.0
194
+ version: 3.8.3
181
195
  - !ruby/object:Gem::Dependency
182
196
  name: pry-byebug
183
197
  requirement: !ruby/object:Gem::Requirement
@@ -233,6 +247,8 @@ files:
233
247
  - spec/fixtures/htmls/image.html
234
248
  - spec/fixtures/images/facebook-3.jpg
235
249
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
250
+ - spec/fixtures/pdfs/newsdict.blog.pdf
251
+ - spec/fixtures/pdfs/rfc2616.pdf
236
252
  - spec/spec_helper.rb
237
253
  - spec/web_stat/configure_spec.rb
238
254
  - spec/web_stat/fetch_spec.rb
@@ -268,6 +284,8 @@ test_files:
268
284
  - spec/fixtures/htmls/image.html
269
285
  - spec/fixtures/images/facebook-3.jpg
270
286
  - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
287
+ - spec/fixtures/pdfs/newsdict.blog.pdf
288
+ - spec/fixtures/pdfs/rfc2616.pdf
271
289
  - spec/spec_helper.rb
272
290
  - spec/web_stat/configure_spec.rb
273
291
  - spec/web_stat/fetch_spec.rb