web_stat 0.3.17 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5fae0f2888c29c8641d2d214e75f1aaa229683c3ee0e859423275acf860681be
4
- data.tar.gz: 321e71c91a8563475b12044bcfe2fd73c47367ca7ed772987f6d8b36769029d5
3
+ metadata.gz: 871e9eb97dc238635bd6a46571e86f6a9548c0ca32b6bf3576017ad71c81394f
4
+ data.tar.gz: 7ed7e9750fc2030f486c14f4de9441ba2c2c98a8357565bf708e6d43f7ae620e
5
5
  SHA512:
6
- metadata.gz: ad1e0a59a4aa01d08f9f99ce519878c2451f73d248ff1d3280c558e57561a1ec057fdd2e37d5931d600b963db0bc4c0a28f947f0669da5595d06302ef54666a2
7
- data.tar.gz: f2341445001985492ab0e074710db753c8c701c7cb614addcb22304672789fe6425db0cefc4a57385b1add7f6263569f142b1be88d0fc16c5888852e1e220576
6
+ metadata.gz: 963dca0c991935d084ba40d2ff35de87c55d1584f9d39f3c54c7df0d89aa701ddef21014c1ea6b4a0a83d118b9abd6daf98cd084663992d0a3ae09c9b92e4267
7
+ data.tar.gz: df4c99e70693d004d312219dd2550cd7a3b39e4eb64ac101cd0654a886ee0d6a4d667a82dbdb15574f825aa4cc71fb3d34d9e10f080a977fe026c6f7b7d6006e
@@ -1 +1 @@
1
- 2.7.1
1
+ 3.0.0
data/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
1
  # Define base image, you can use --build-arg
2
- ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
3
3
  FROM $base_image
4
4
 
5
5
  # Set locale
@@ -1,16 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.17)
4
+ web_stat (0.4.2)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
11
+ rexml (>= 3.2.4)
11
12
  ruby-readability (>= 0.7)
12
13
  sanitize (>= 5.0.0)
13
14
  selenium-webdriver (= 3.142.7)
15
+ webrick (>= 1.7.0)
14
16
 
15
17
  GEM
16
18
  remote: https://rubygems.org/
@@ -24,14 +26,13 @@ GEM
24
26
  cld (0.8.0)
25
27
  ffi
26
28
  coderay (1.1.3)
27
- connection_pool (2.2.3)
28
29
  crack (0.4.3)
29
30
  safe_yaml (~> 1.0.0)
30
31
  crass (1.0.6)
31
32
  diff-lcs (1.3)
32
33
  domain_name (0.5.20190701)
33
34
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.13.1)
35
+ ffi (1.14.2)
35
36
  guess_html_encoding (0.0.11)
36
37
  hashdiff (1.0.1)
37
38
  hashery (2.1.2)
@@ -49,16 +50,14 @@ GEM
49
50
  method_source (1.0.0)
50
51
  mime-types (3.3.1)
51
52
  mime-types-data (~> 3.2015)
52
- mime-types-data (3.2020.0512)
53
- mini_portile2 (2.4.0)
53
+ mime-types-data (3.2020.1104)
54
54
  natto (1.2.0)
55
55
  ffi (>= 1.9.0)
56
56
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (4.0.0)
58
- connection_pool (~> 2.2)
59
- nokogiri (1.10.10)
60
- mini_portile2 (~> 2.4.0)
61
- nokogumbo (2.0.2)
57
+ net-http-persistent (2.9.4)
58
+ nokogiri (1.11.1-x86_64-linux)
59
+ racc (~> 1.4)
60
+ nokogumbo (2.0.4)
62
61
  nokogiri (~> 1.8, >= 1.8.4)
63
62
  ntlm-http (0.1.1)
64
63
  pdf-reader (2.4.0)
@@ -74,7 +73,9 @@ GEM
74
73
  byebug (~> 11.0)
75
74
  pry (~> 0.13.0)
76
75
  public_suffix (4.0.5)
76
+ racc (1.5.2)
77
77
  rake (13.0.1)
78
+ rexml (3.2.4)
78
79
  rspec (3.9.0)
79
80
  rspec-core (~> 3.9.0)
80
81
  rspec-expectations (~> 3.9.0)
@@ -94,14 +95,14 @@ GEM
94
95
  nokogiri (>= 1.6.0)
95
96
  rubyzip (2.3.0)
96
97
  safe_yaml (1.0.5)
97
- sanitize (5.2.1)
98
+ sanitize (5.2.2)
98
99
  crass (~> 1.0.2)
99
100
  nokogiri (>= 1.8.0)
100
101
  nokogumbo (~> 2.0)
101
102
  selenium-webdriver (3.142.7)
102
103
  childprocess (>= 0.5, < 4.0)
103
104
  rubyzip (>= 1.2.2)
104
- ttfunk (1.6.2.1)
105
+ ttfunk (1.7.0)
105
106
  unf (0.1.4)
106
107
  unf_ext
107
108
  unf_ext (0.0.7.7)
@@ -109,6 +110,7 @@ GEM
109
110
  addressable (>= 2.3.6)
110
111
  crack (>= 0.3.2)
111
112
  hashdiff (>= 0.4.0, < 2.0.0)
113
+ webrick (1.7.0)
112
114
  webrobots (0.1.2)
113
115
 
114
116
  PLATFORMS
@@ -123,4 +125,4 @@ DEPENDENCIES
123
125
  webmock (>= 3.8.3)
124
126
 
125
127
  BUNDLED WITH
126
- 2.1.4
128
+ 2.2.4
data/README.md CHANGED
@@ -1,30 +1,7 @@
1
- # !!!!! Precautions when using with Rails !!!!!
2
-
3
- Write this line your Gemfile.
4
- ```
5
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
- ```
7
-
8
1
  # WebStat
9
2
 
10
3
  Fetch the web pages and stat.
11
4
 
12
- ## Requirements
13
-
14
- - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
15
- - add runtime dependency
16
- - "bundler", "~> 2.0"
17
- - "nokogiri", "~> 1.10"
18
- - "mechanize", "~> 2.7"
19
- - "ruby-readability", "~> 0.7"
20
- - "final_redirect_url", "~> 0.1.0"
21
- - "natto", "~> 1.1.2"
22
- - add development dependency
23
- - "rake", "~> 10.0"
24
- - "rspec", "~> 3.0"
25
- - "rake", "~> 10.0"
26
- - "rspec", "~> 3.0"
27
-
28
5
  ### Install mecab
29
6
 
30
7
  $ sudo apt install mecab-ipadic-utf8 libmecab
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,11 +1,5 @@
1
1
  module WebStat
2
2
  class Fetch
3
- THUMBNAIL_REGEXS = {
4
- :youtube => [
5
- %r{^https://www.youtube.com/watch\?v=([^&]+)},
6
- 'http://img.youtube.com/vi/\1/default.jpg'
7
- ]
8
- }
9
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
10
4
  # Get title
11
5
  # @return [String] title
@@ -40,7 +34,7 @@ module WebStat
40
34
  end
41
35
  # Get main section
42
36
  def content
43
- Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
37
+ Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
44
38
  end
45
39
 
46
40
  # Get temporary path of image
@@ -54,12 +48,12 @@ module WebStat
54
48
  end
55
49
  end
56
50
  # If there is a thumbnail rule, apply it.
57
- THUMBNAIL_REGEXS.each do |provider, v|
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
58
52
  if @url.match(v[0])
59
53
  return @url.gsub(v[0], v[1])
60
54
  end
61
55
  end
62
- readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
56
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
63
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
64
58
  path = readability_content.xpath('//img').first.attr('src')
65
59
  end
@@ -83,7 +77,7 @@ module WebStat
83
77
  File.open(tmp_file, "w+b") do |_file|
84
78
  if image.class == Mechanize::File
85
79
  _file.puts(image.body)
86
- elsif image.respond_to?(:body)
80
+ elsif image.respond_to?(:body_io)
87
81
  _file.puts(image.body_io.read)
88
82
  end
89
83
  end
@@ -102,7 +96,16 @@ module WebStat
102
96
  raise Mechanize::RobotsDisallowedError.new(url)
103
97
  end
104
98
  if WebStat::Configure.get["use_chromedirver"]
105
- body = WebStat::WebDriverHelper.get_source(url)
99
+ begin
100
+ body = WebStat::WebDriverHelper.get_source(url)
101
+ rescue Selenium::WebDriver::Error::UnknownError => e
102
+ document = mech.agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
103
+ if document.class == Mechanize::File
104
+ body = document.body
105
+ else
106
+ body = document.body.encode('UTF-8', document.encoding)
107
+ end
108
+ end
106
109
  @status = 200
107
110
  else
108
111
  document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.17"
2
+ VERSION = "0.4.2"
3
3
  end
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
31
  spec.add_runtime_dependency "pdf-reader", "2.4.0"
32
+ spec.add_runtime_dependency "webrick", ">= 1.7.0"
33
+ spec.add_runtime_dependency "rexml", ">= 3.2.4"
32
34
 
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", ">= 3.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.17
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-11 00:00:00.000000000 Z
11
+ date: 2021-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
138
  version: 2.4.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: webrick
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: 1.7.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.7.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rexml
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: 3.2.4
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 3.2.4
139
167
  - !ruby/object:Gem::Dependency
140
168
  name: rake
141
169
  requirement: !ruby/object:Gem::Requirement
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
302
  - !ruby/object:Gem::Version
275
303
  version: '0'
276
304
  requirements: []
277
- rubygems_version: 3.1.2
305
+ rubygems_version: 3.2.3
278
306
  signing_key:
279
307
  specification_version: 4
280
308
  summary: Get the status of the web pages.