web_stat 0.3.18 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1577e388af881db0fb90000e62a9768711d6098fb256235a7976ac317a2f2f2
4
- data.tar.gz: 06d1a79f46b436e20fc0730232ff4997eec11eccd939a1d7d28311780f496273
3
+ metadata.gz: beb43ff474210a7d97855aa4f0df4c0083b8560302b20debd27ecce87434e6af
4
+ data.tar.gz: ef3a1f8077801a2e0f55dda383f03bbea94b7fadaf1d83b3bc120582180794ad
5
5
  SHA512:
6
- metadata.gz: 15f4eca65d1fdc2d3c4bc5d1e06817e78896774abfe384419b8416a0ff73ff6b423c15b029632e28708e58e868f0c126e86f56295b618cc4ea4d93ac6cc7d26d
7
- data.tar.gz: e15ee2f11d221b1d6997a6f7aad45571c6bf9456b813e95465f5c029617b701e4f18e3b4b7ec35246d0fddefca1f7648d07bff5a585eaf18cc9859362ceb5ca9
6
+ metadata.gz: 3408516abee8fb1e422dcbfc29af4c2ad31d0a1a80dc15fdd893073f39408a3e797ce080bc2a28557031d64acd72431b89e82586ded7e1d64f90656c36793e61
7
+ data.tar.gz: 1003e6bb4d6824ae72e6ada48040b84758dc7002ce7258828f5764ae13224fc0818dc52fda533a2cd58c11c1194f31f807a37ae4e1237adc7d67038a6108dbba
@@ -1 +1 @@
1
- 2.7.1
1
+ 3.0.0
data/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
1
  # Define base image, you can use --build-arg
2
- ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
3
3
  FROM $base_image
4
4
 
5
5
  # Set locale
@@ -1,16 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.18)
4
+ web_stat (0.4.3)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
11
+ rexml (>= 3.2.4)
11
12
  ruby-readability (>= 0.7)
12
13
  sanitize (>= 5.0.0)
13
14
  selenium-webdriver (= 3.142.7)
15
+ webrick (>= 1.7.0)
14
16
 
15
17
  GEM
16
18
  remote: https://rubygems.org/
@@ -24,14 +26,13 @@ GEM
24
26
  cld (0.8.0)
25
27
  ffi
26
28
  coderay (1.1.3)
27
- connection_pool (2.2.3)
28
29
  crack (0.4.3)
29
30
  safe_yaml (~> 1.0.0)
30
31
  crass (1.0.6)
31
32
  diff-lcs (1.3)
32
33
  domain_name (0.5.20190701)
33
34
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.13.1)
35
+ ffi (1.14.2)
35
36
  guess_html_encoding (0.0.11)
36
37
  hashdiff (1.0.1)
37
38
  hashery (2.1.2)
@@ -49,16 +50,14 @@ GEM
49
50
  method_source (1.0.0)
50
51
  mime-types (3.3.1)
51
52
  mime-types-data (~> 3.2015)
52
- mime-types-data (3.2020.0512)
53
- mini_portile2 (2.4.0)
53
+ mime-types-data (3.2020.1104)
54
54
  natto (1.2.0)
55
55
  ffi (>= 1.9.0)
56
56
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (4.0.0)
58
- connection_pool (~> 2.2)
59
- nokogiri (1.10.10)
60
- mini_portile2 (~> 2.4.0)
61
- nokogumbo (2.0.2)
57
+ net-http-persistent (2.9.4)
58
+ nokogiri (1.11.1-x86_64-linux)
59
+ racc (~> 1.4)
60
+ nokogumbo (2.0.4)
62
61
  nokogiri (~> 1.8, >= 1.8.4)
63
62
  ntlm-http (0.1.1)
64
63
  pdf-reader (2.4.0)
@@ -74,7 +73,9 @@ GEM
74
73
  byebug (~> 11.0)
75
74
  pry (~> 0.13.0)
76
75
  public_suffix (4.0.5)
76
+ racc (1.5.2)
77
77
  rake (13.0.1)
78
+ rexml (3.2.4)
78
79
  rspec (3.9.0)
79
80
  rspec-core (~> 3.9.0)
80
81
  rspec-expectations (~> 3.9.0)
@@ -94,14 +95,14 @@ GEM
94
95
  nokogiri (>= 1.6.0)
95
96
  rubyzip (2.3.0)
96
97
  safe_yaml (1.0.5)
97
- sanitize (5.2.1)
98
+ sanitize (5.2.2)
98
99
  crass (~> 1.0.2)
99
100
  nokogiri (>= 1.8.0)
100
101
  nokogumbo (~> 2.0)
101
102
  selenium-webdriver (3.142.7)
102
103
  childprocess (>= 0.5, < 4.0)
103
104
  rubyzip (>= 1.2.2)
104
- ttfunk (1.6.2.1)
105
+ ttfunk (1.7.0)
105
106
  unf (0.1.4)
106
107
  unf_ext
107
108
  unf_ext (0.0.7.7)
@@ -109,6 +110,7 @@ GEM
109
110
  addressable (>= 2.3.6)
110
111
  crack (>= 0.3.2)
111
112
  hashdiff (>= 0.4.0, < 2.0.0)
113
+ webrick (1.7.0)
112
114
  webrobots (0.1.2)
113
115
 
114
116
  PLATFORMS
@@ -123,4 +125,4 @@ DEPENDENCIES
123
125
  webmock (>= 3.8.3)
124
126
 
125
127
  BUNDLED WITH
126
- 2.1.4
128
+ 2.2.4
data/README.md CHANGED
@@ -1,30 +1,7 @@
1
- # !!!!! Precautions when using with Rails !!!!!
2
-
3
- Write this line your Gemfile.
4
- ```
5
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
- ```
7
-
8
1
  # WebStat
9
2
 
10
3
  Fetch the web pages and stat.
11
4
 
12
- ## Requirements
13
-
14
- - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
15
- - add runtime dependency
16
- - "bundler", "~> 2.0"
17
- - "nokogiri", "~> 1.10"
18
- - "mechanize", "~> 2.7"
19
- - "ruby-readability", "~> 0.7"
20
- - "final_redirect_url", "~> 0.1.0"
21
- - "natto", "~> 1.1.2"
22
- - add development dependency
23
- - "rake", "~> 10.0"
24
- - "rspec", "~> 3.0"
25
- - "rake", "~> 10.0"
26
- - "rspec", "~> 3.0"
27
-
28
5
  ### Install mecab
29
6
 
30
7
  $ sudo apt install mecab-ipadic-utf8 libmecab
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,11 +1,5 @@
1
1
  module WebStat
2
2
  class Fetch
3
- THUMBNAIL_REGEXS = {
4
- :youtube => [
5
- %r{^https://www.youtube.com/watch\?v=([^&]+)},
6
- 'http://img.youtube.com/vi/\1/default.jpg'
7
- ]
8
- }
9
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
10
4
  # Get title
11
5
  # @return [String] title
@@ -40,7 +34,7 @@ module WebStat
40
34
  end
41
35
  # Get main section
42
36
  def content
43
- Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
37
+ Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
44
38
  end
45
39
 
46
40
  # Get temporary path of image
@@ -54,12 +48,12 @@ module WebStat
54
48
  end
55
49
  end
56
50
  # If there is a thumbnail rule, apply it.
57
- THUMBNAIL_REGEXS.each do |provider, v|
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
58
52
  if @url.match(v[0])
59
53
  return @url.gsub(v[0], v[1])
60
54
  end
61
55
  end
62
- readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
56
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
63
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
64
58
  path = readability_content.xpath('//img').first.attr('src')
65
59
  end
@@ -102,7 +96,16 @@ module WebStat
102
96
  raise Mechanize::RobotsDisallowedError.new(url)
103
97
  end
104
98
  if WebStat::Configure.get["use_chromedirver"]
105
- body = WebStat::WebDriverHelper.get_source(url)
99
+ begin
100
+ body = WebStat::WebDriverHelper.get_source(url)
101
+ rescue Selenium::WebDriver::Error::UnknownError => e
102
+ document = mech.agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
103
+ if document.class == Mechanize::File
104
+ body = document.body
105
+ else
106
+ body = document.body.encode('UTF-8', document.encoding)
107
+ end
108
+ end
106
109
  @status = 200
107
110
  else
108
111
  document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
@@ -5,7 +5,7 @@ module WebStat
5
5
  # initialize class
6
6
  # @param [String] url
7
7
  def initialize(url)
8
- unless url_valid?(url)
8
+ unless FetchAsWeb.url_valid?(url)
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
@@ -36,11 +36,12 @@ module WebStat
36
36
  end
37
37
  @nokogiri = ::Nokogiri::HTML(@html)
38
38
  end
39
-
40
- # Validation url
41
- def url_valid?(url)
42
- regexp = Regexp.new("^https?://([a-zA-Z0-9][a-zA-Z0-9\\\-\.]{1,61}[a-zA-Z0-9])\\\.([a-zA-Z]{2,})(.*)?$", Regexp::IGNORECASE)
43
- regexp.match?(url)
44
- end
39
+ class << self
40
+ # Validation url
41
+ def url_valid?(url)
42
+ regexp = Regexp.new("^https?://([a-z0-9][a-z0-9\\\-\.]{0,61})\\\.([a-z]{2,})(.*)?$", Regexp::IGNORECASE)
43
+ regexp.match?(url)
44
+ end
45
+ end
45
46
  end
46
47
  end
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.18"
3
- end
2
+ VERSION = "0.4.3"
3
+ end
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -197,11 +197,11 @@ RSpec.describe WebStat::Fetch do
197
197
  end
198
198
 
199
199
  it "valid url" do
200
- web_stat_fetch_web_class = WebStat::FetchAsWeb.new("https://newsdict.blog/content/images/size/w100/2019/03/facebook-3.jpg")
201
- expect(web_stat_fetch_web_class.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
202
- expect(web_stat_fetch_web_class.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
203
- expect(web_stat_fetch_web_class.url_valid?("https://www.meetup.com/pro/docker")).to be true
204
- expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
205
- expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
200
+ expect(WebStat::FetchAsWeb.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
201
+ expect(WebStat::FetchAsWeb.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
202
+ expect(WebStat::FetchAsWeb.url_valid?("https://www.meetup.com/pro/docker")).to be true
203
+ expect(WebStat::FetchAsWeb.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
204
+ expect(WebStat::FetchAsWeb.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
205
+ expect(WebStat::FetchAsWeb.url_valid?("http://g.co/arts/SK1jZHJpT8N1BGaM7")).to be true
206
206
  end
207
207
  end
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
31
  spec.add_runtime_dependency "pdf-reader", "2.4.0"
32
+ spec.add_runtime_dependency "webrick", ">= 1.7.0"
33
+ spec.add_runtime_dependency "rexml", ">= 3.2.4"
32
34
 
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", ">= 3.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.18
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-11 00:00:00.000000000 Z
11
+ date: 2021-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
138
  version: 2.4.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: webrick
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: 1.7.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.7.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rexml
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: 3.2.4
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 3.2.4
139
167
  - !ruby/object:Gem::Dependency
140
168
  name: rake
141
169
  requirement: !ruby/object:Gem::Requirement
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
302
  - !ruby/object:Gem::Version
275
303
  version: '0'
276
304
  requirements: []
277
- rubygems_version: 3.1.2
305
+ rubygems_version: 3.2.3
278
306
  signing_key:
279
307
  specification_version: 4
280
308
  summary: Get the status of the web pages.