web_stat 0.3.15 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 82732f779004a5a2ef1f7259ce40126de8f51dfb07e46cd4972aafd3ab386ac3
4
- data.tar.gz: 17c2b99bdeb5db8c134d107fc9d4957dab985c7e680b1c92060bf7090315477b
3
+ metadata.gz: 29d04d8379978d1e295b829193a330fd9bab8cb92326e83aab39d066059dcb02
4
+ data.tar.gz: c2df9896f21ec9d777dea101a1c4c4be513ad28c11428fc1104ccefe08f98b40
5
5
  SHA512:
6
- metadata.gz: d6691a57b0498fcfbb609042cdb7e12b4ac453b5cedaf6dd39671090c97fc19958a4162ba50947392116320ba78c9a21491acab27be2a41287825d3d4d2194d1
7
- data.tar.gz: 1e2e3d33f5c232532f442bbd16540c47810c93caef436a351124165b5294ddc7354d766ea66aa24ebc052c69432a639a6d5ef7b61a7cde896f6dc4676f95650d
6
+ metadata.gz: 728bc129ddced4cc58081ca4d8c02b761ba77e4adf277b736115ce51b1e7293f7b208adc5742fe50ba4e406f2a7c3f65af19d5c55f0cf0188a4a4b25704719bd
7
+ data.tar.gz: ef8ee476834a6fb75fed33476d070e1b00044f9115b85eaf3148f1901af10e4cd32f79ef3b369e59366a56ab7e8560180da3def8562e2b396f2b8b6bfc26196b
@@ -1 +1 @@
1
- 2.7.1
1
+ 3.0.0
data/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
1
  # Define base image, you can use --build-arg
2
- ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
3
3
  FROM $base_image
4
4
 
5
5
  # Set locale
@@ -1,16 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.15)
4
+ web_stat (0.3.19)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
11
+ rexml (>= 3.2.4)
11
12
  ruby-readability (>= 0.7)
12
13
  sanitize (>= 5.0.0)
13
14
  selenium-webdriver (= 3.142.7)
15
+ webrick (>= 1.7.0)
14
16
 
15
17
  GEM
16
18
  remote: https://rubygems.org/
@@ -24,14 +26,13 @@ GEM
24
26
  cld (0.8.0)
25
27
  ffi
26
28
  coderay (1.1.3)
27
- connection_pool (2.2.3)
28
29
  crack (0.4.3)
29
30
  safe_yaml (~> 1.0.0)
30
31
  crass (1.0.6)
31
32
  diff-lcs (1.3)
32
33
  domain_name (0.5.20190701)
33
34
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.13.1)
35
+ ffi (1.14.2)
35
36
  guess_html_encoding (0.0.11)
36
37
  hashdiff (1.0.1)
37
38
  hashery (2.1.2)
@@ -49,16 +50,14 @@ GEM
49
50
  method_source (1.0.0)
50
51
  mime-types (3.3.1)
51
52
  mime-types-data (~> 3.2015)
52
- mime-types-data (3.2020.0512)
53
- mini_portile2 (2.4.0)
53
+ mime-types-data (3.2020.1104)
54
54
  natto (1.2.0)
55
55
  ffi (>= 1.9.0)
56
56
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (4.0.0)
58
- connection_pool (~> 2.2)
59
- nokogiri (1.10.9)
60
- mini_portile2 (~> 2.4.0)
61
- nokogumbo (2.0.2)
57
+ net-http-persistent (2.9.4)
58
+ nokogiri (1.11.1-x86_64-linux)
59
+ racc (~> 1.4)
60
+ nokogumbo (2.0.4)
62
61
  nokogiri (~> 1.8, >= 1.8.4)
63
62
  ntlm-http (0.1.1)
64
63
  pdf-reader (2.4.0)
@@ -74,7 +73,9 @@ GEM
74
73
  byebug (~> 11.0)
75
74
  pry (~> 0.13.0)
76
75
  public_suffix (4.0.5)
76
+ racc (1.5.2)
77
77
  rake (13.0.1)
78
+ rexml (3.2.4)
78
79
  rspec (3.9.0)
79
80
  rspec-core (~> 3.9.0)
80
81
  rspec-expectations (~> 3.9.0)
@@ -94,14 +95,14 @@ GEM
94
95
  nokogiri (>= 1.6.0)
95
96
  rubyzip (2.3.0)
96
97
  safe_yaml (1.0.5)
97
- sanitize (5.2.1)
98
+ sanitize (5.2.2)
98
99
  crass (~> 1.0.2)
99
100
  nokogiri (>= 1.8.0)
100
101
  nokogumbo (~> 2.0)
101
102
  selenium-webdriver (3.142.7)
102
103
  childprocess (>= 0.5, < 4.0)
103
104
  rubyzip (>= 1.2.2)
104
- ttfunk (1.6.2.1)
105
+ ttfunk (1.7.0)
105
106
  unf (0.1.4)
106
107
  unf_ext
107
108
  unf_ext (0.0.7.7)
@@ -109,6 +110,7 @@ GEM
109
110
  addressable (>= 2.3.6)
110
111
  crack (>= 0.3.2)
111
112
  hashdiff (>= 0.4.0, < 2.0.0)
113
+ webrick (1.7.0)
112
114
  webrobots (0.1.2)
113
115
 
114
116
  PLATFORMS
@@ -123,4 +125,4 @@ DEPENDENCIES
123
125
  webmock (>= 3.8.3)
124
126
 
125
127
  BUNDLED WITH
126
- 2.1.4
128
+ 2.2.4
data/README.md CHANGED
@@ -1,30 +1,7 @@
1
- # !!!!! Precautions when using with Rails !!!!!
2
-
3
- Write this line your Gemfile.
4
- ```
5
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
- ```
7
-
8
1
  # WebStat
9
2
 
10
3
  Fetch the web pages and stat.
11
4
 
12
- ## Requirements
13
-
14
- - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
15
- - add runtime dependency
16
- - "bundler", "~> 2.0"
17
- - "nokogiri", "~> 1.10"
18
- - "mechanize", "~> 2.7"
19
- - "ruby-readability", "~> 0.7"
20
- - "final_redirect_url", "~> 0.1.0"
21
- - "natto", "~> 1.1.2"
22
- - add development dependency
23
- - "rake", "~> 10.0"
24
- - "rspec", "~> 3.0"
25
- - "rake", "~> 10.0"
26
- - "rspec", "~> 3.0"
27
-
28
5
  ### Install mecab
29
6
 
30
7
  $ sudo apt install mecab-ipadic-utf8 libmecab
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,7 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
4
-
5
4
  # Get title
6
5
  # @return [String] title
7
6
  def title
@@ -19,7 +18,8 @@ module WebStat
19
18
  title.strip
20
19
  end
21
20
  end
22
- # Get name of domain
21
+
22
+ # Get name of domain
23
23
  def site_name
24
24
  begin
25
25
  site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -34,9 +34,9 @@ module WebStat
34
34
  end
35
35
  # Get main section
36
36
  def content
37
- Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
37
+ Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
38
38
  end
39
-
39
+
40
40
  # Get temporary path of image
41
41
  def eyecatch_image_path
42
42
  # Reuse `path` in this method
@@ -47,9 +47,15 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
- readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
50
+ # If there is a thumbnail rule, apply it.
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
52
+ if @url.match(v[0])
53
+ return @url.gsub(v[0], v[1])
54
+ end
55
+ end
56
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
51
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
52
- path = readability_content.xpath('//img').first.attr('src')
58
+ path = readability_content.xpath('//img').first.attr('src')
53
59
  end
54
60
  if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
55
61
  path = @nokogiri.xpath('//img').first.attr('src')
@@ -60,7 +66,7 @@ module WebStat
60
66
  path
61
67
  end
62
68
  end
63
-
69
+
64
70
  # Get local path to save url
65
71
  # @param [String] url
66
72
  def save_local_path(url)
@@ -71,13 +77,13 @@ module WebStat
71
77
  File.open(tmp_file, "w+b") do |_file|
72
78
  if image.class == Mechanize::File
73
79
  _file.puts(image.body)
74
- else
80
+ elsif image.respond_to?(:body_io)
75
81
  _file.puts(image.body_io.read)
76
82
  end
77
83
  end
78
84
  tmp_file
79
85
  end
80
-
86
+
81
87
  # Get url
82
88
  # @param [String] url
83
89
  # @param [String] body
@@ -107,7 +113,7 @@ module WebStat
107
113
  end
108
114
  body
109
115
  end
110
-
116
+
111
117
  # Get the informations of @url
112
118
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
113
119
  def stat(userdics: nil)
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.15"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
31
  spec.add_runtime_dependency "pdf-reader", "2.4.0"
32
+ spec.add_runtime_dependency "webrick", ">= 1.7.0"
33
+ spec.add_runtime_dependency "rexml", ">= 3.2.4"
32
34
 
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", ">= 3.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.15
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-03 00:00:00.000000000 Z
11
+ date: 2021-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
138
  version: 2.4.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: webrick
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: 1.7.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.7.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rexml
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: 3.2.4
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 3.2.4
139
167
  - !ruby/object:Gem::Dependency
140
168
  name: rake
141
169
  requirement: !ruby/object:Gem::Requirement
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
302
  - !ruby/object:Gem::Version
275
303
  version: '0'
276
304
  requirements: []
277
- rubygems_version: 3.1.2
305
+ rubygems_version: 3.2.3
278
306
  signing_key:
279
307
  specification_version: 4
280
308
  summary: Get the status of the web pages.