web_stat 0.3.15 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 82732f779004a5a2ef1f7259ce40126de8f51dfb07e46cd4972aafd3ab386ac3
4
- data.tar.gz: 17c2b99bdeb5db8c134d107fc9d4957dab985c7e680b1c92060bf7090315477b
3
+ metadata.gz: 29d04d8379978d1e295b829193a330fd9bab8cb92326e83aab39d066059dcb02
4
+ data.tar.gz: c2df9896f21ec9d777dea101a1c4c4be513ad28c11428fc1104ccefe08f98b40
5
5
  SHA512:
6
- metadata.gz: d6691a57b0498fcfbb609042cdb7e12b4ac453b5cedaf6dd39671090c97fc19958a4162ba50947392116320ba78c9a21491acab27be2a41287825d3d4d2194d1
7
- data.tar.gz: 1e2e3d33f5c232532f442bbd16540c47810c93caef436a351124165b5294ddc7354d766ea66aa24ebc052c69432a639a6d5ef7b61a7cde896f6dc4676f95650d
6
+ metadata.gz: 728bc129ddced4cc58081ca4d8c02b761ba77e4adf277b736115ce51b1e7293f7b208adc5742fe50ba4e406f2a7c3f65af19d5c55f0cf0188a4a4b25704719bd
7
+ data.tar.gz: ef8ee476834a6fb75fed33476d070e1b00044f9115b85eaf3148f1901af10e4cd32f79ef3b369e59366a56ab7e8560180da3def8562e2b396f2b8b6bfc26196b
@@ -1 +1 @@
1
- 2.7.1
1
+ 3.0.0
data/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
1
  # Define the base image; it can be overridden with --build-arg
2
- ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
3
3
  FROM $base_image
4
4
 
5
5
  # Set locale
@@ -1,16 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.15)
4
+ web_stat (0.3.19)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
11
+ rexml (>= 3.2.4)
11
12
  ruby-readability (>= 0.7)
12
13
  sanitize (>= 5.0.0)
13
14
  selenium-webdriver (= 3.142.7)
15
+ webrick (>= 1.7.0)
14
16
 
15
17
  GEM
16
18
  remote: https://rubygems.org/
@@ -24,14 +26,13 @@ GEM
24
26
  cld (0.8.0)
25
27
  ffi
26
28
  coderay (1.1.3)
27
- connection_pool (2.2.3)
28
29
  crack (0.4.3)
29
30
  safe_yaml (~> 1.0.0)
30
31
  crass (1.0.6)
31
32
  diff-lcs (1.3)
32
33
  domain_name (0.5.20190701)
33
34
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.13.1)
35
+ ffi (1.14.2)
35
36
  guess_html_encoding (0.0.11)
36
37
  hashdiff (1.0.1)
37
38
  hashery (2.1.2)
@@ -49,16 +50,14 @@ GEM
49
50
  method_source (1.0.0)
50
51
  mime-types (3.3.1)
51
52
  mime-types-data (~> 3.2015)
52
- mime-types-data (3.2020.0512)
53
- mini_portile2 (2.4.0)
53
+ mime-types-data (3.2020.1104)
54
54
  natto (1.2.0)
55
55
  ffi (>= 1.9.0)
56
56
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (4.0.0)
58
- connection_pool (~> 2.2)
59
- nokogiri (1.10.9)
60
- mini_portile2 (~> 2.4.0)
61
- nokogumbo (2.0.2)
57
+ net-http-persistent (2.9.4)
58
+ nokogiri (1.11.1-x86_64-linux)
59
+ racc (~> 1.4)
60
+ nokogumbo (2.0.4)
62
61
  nokogiri (~> 1.8, >= 1.8.4)
63
62
  ntlm-http (0.1.1)
64
63
  pdf-reader (2.4.0)
@@ -74,7 +73,9 @@ GEM
74
73
  byebug (~> 11.0)
75
74
  pry (~> 0.13.0)
76
75
  public_suffix (4.0.5)
76
+ racc (1.5.2)
77
77
  rake (13.0.1)
78
+ rexml (3.2.4)
78
79
  rspec (3.9.0)
79
80
  rspec-core (~> 3.9.0)
80
81
  rspec-expectations (~> 3.9.0)
@@ -94,14 +95,14 @@ GEM
94
95
  nokogiri (>= 1.6.0)
95
96
  rubyzip (2.3.0)
96
97
  safe_yaml (1.0.5)
97
- sanitize (5.2.1)
98
+ sanitize (5.2.2)
98
99
  crass (~> 1.0.2)
99
100
  nokogiri (>= 1.8.0)
100
101
  nokogumbo (~> 2.0)
101
102
  selenium-webdriver (3.142.7)
102
103
  childprocess (>= 0.5, < 4.0)
103
104
  rubyzip (>= 1.2.2)
104
- ttfunk (1.6.2.1)
105
+ ttfunk (1.7.0)
105
106
  unf (0.1.4)
106
107
  unf_ext
107
108
  unf_ext (0.0.7.7)
@@ -109,6 +110,7 @@ GEM
109
110
  addressable (>= 2.3.6)
110
111
  crack (>= 0.3.2)
111
112
  hashdiff (>= 0.4.0, < 2.0.0)
113
+ webrick (1.7.0)
112
114
  webrobots (0.1.2)
113
115
 
114
116
  PLATFORMS
@@ -123,4 +125,4 @@ DEPENDENCIES
123
125
  webmock (>= 3.8.3)
124
126
 
125
127
  BUNDLED WITH
126
- 2.1.4
128
+ 2.2.4
data/README.md CHANGED
@@ -1,30 +1,7 @@
1
- # !!!!! Precautions when using with Rails !!!!!
2
-
3
- Write this line your Gemfile.
4
- ```
5
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
- ```
7
-
8
1
  # WebStat
9
2
 
10
3
  Fetch the web pages and stat.
11
4
 
12
- ## Requirements
13
-
14
- - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
15
- - add runtime dependency
16
- - "bundler", "~> 2.0"
17
- - "nokogiri", "~> 1.10"
18
- - "mechanize", "~> 2.7"
19
- - "ruby-readability", "~> 0.7"
20
- - "final_redirect_url", "~> 0.1.0"
21
- - "natto", "~> 1.1.2"
22
- - add development dependency
23
- - "rake", "~> 10.0"
24
- - "rspec", "~> 3.0"
25
- - "rake", "~> 10.0"
26
- - "rspec", "~> 3.0"
27
-
28
5
  ### Install mecab
29
6
 
30
7
  $ sudo apt install mecab-ipadic-utf8 libmecab
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,7 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
4
-
5
4
  # Get title
6
5
  # @return [String] title
7
6
  def title
@@ -19,7 +18,8 @@ module WebStat
19
18
  title.strip
20
19
  end
21
20
  end
22
- # Get name of domain
21
+
22
+ # Get name of domain
23
23
  def site_name
24
24
  begin
25
25
  site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -34,9 +34,9 @@ module WebStat
34
34
  end
35
35
  # Get main section
36
36
  def content
37
- Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
37
+ Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
38
38
  end
39
-
39
+
40
40
  # Get temporary path of image
41
41
  def eyecatch_image_path
42
42
  # Reuse `path` in this method
@@ -47,9 +47,15 @@ module WebStat
47
47
  break
48
48
  end
49
49
  end
50
- readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
50
+ # If there is a thumbnail rule, apply it.
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
52
+ if @url.match(v[0])
53
+ return @url.gsub(v[0], v[1])
54
+ end
55
+ end
56
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
51
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
52
- path = readability_content.xpath('//img').first.attr('src')
58
+ path = readability_content.xpath('//img').first.attr('src')
53
59
  end
54
60
  if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
55
61
  path = @nokogiri.xpath('//img').first.attr('src')
@@ -60,7 +66,7 @@ module WebStat
60
66
  path
61
67
  end
62
68
  end
63
-
69
+
64
70
  # Get local path to save url
65
71
  # @param [String] url
66
72
  def save_local_path(url)
@@ -71,13 +77,13 @@ module WebStat
71
77
  File.open(tmp_file, "w+b") do |_file|
72
78
  if image.class == Mechanize::File
73
79
  _file.puts(image.body)
74
- else
80
+ elsif image.respond_to?(:body_io)
75
81
  _file.puts(image.body_io.read)
76
82
  end
77
83
  end
78
84
  tmp_file
79
85
  end
80
-
86
+
81
87
  # Get url
82
88
  # @param [String] url
83
89
  # @param [String] body
@@ -107,7 +113,7 @@ module WebStat
107
113
  end
108
114
  body
109
115
  end
110
-
116
+
111
117
  # Get the information about @url
112
118
  # @param [Hash] userdics Specify a dictionary for each language code, e.g. {"ja": /***/**.dic, "other": /***/***.dic}
113
119
  def stat(userdics: nil)
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.15"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
31
  spec.add_runtime_dependency "pdf-reader", "2.4.0"
32
+ spec.add_runtime_dependency "webrick", ">= 1.7.0"
33
+ spec.add_runtime_dependency "rexml", ">= 3.2.4"
32
34
 
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", ">= 3.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.15
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-03 00:00:00.000000000 Z
11
+ date: 2021-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
138
  version: 2.4.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: webrick
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: 1.7.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.7.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rexml
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: 3.2.4
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 3.2.4
139
167
  - !ruby/object:Gem::Dependency
140
168
  name: rake
141
169
  requirement: !ruby/object:Gem::Requirement
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
302
  - !ruby/object:Gem::Version
275
303
  version: '0'
276
304
  requirements: []
277
- rubygems_version: 3.1.2
305
+ rubygems_version: 3.2.3
278
306
  signing_key:
279
307
  specification_version: 4
280
308
  summary: Get the status of the web pages.