web_stat 0.3.16 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c332bb9cf67262c2b5d8e3c30c861107bd6ac8e3d86136730864e49b10dabcce
4
- data.tar.gz: 712f39109989e917e8af6b84fe40b64ce84009a9de4b3647c03177ecb69bc58d
3
+ metadata.gz: 9fc357cc98a214af7a391fe834ec927b43649ca8d8c5b1105e7fb1947dd7127a
4
+ data.tar.gz: f7c5f6d28101ff677aaa4db5b9b486d067798b0a8461f91352db252f68374513
5
5
  SHA512:
6
- metadata.gz: 3607fbe1e76018e3e523ed26a2777837c539a80f5c68da195996616d3566f6034125f0d49e96db9338ae40aeda5dac000968f6e089a7d96c19a412f14d2acbd2
7
- data.tar.gz: 027d3b911cbee0dfdd7a8f85b32d24e4b8c9e2f916a47a2d3ea31b34fe0e2f07b452e00372d2028d511d39fba7df8f26347789fd795103422711d0e979b1d8a2
6
+ metadata.gz: 1616c4f1a64ebc693ee4e822b06150028783736448850d8aca80c90909ca879580a82f871f95375904b38813b10155105fd5d1121ed0321ae9cc898a4b95549e
7
+ data.tar.gz: 1637fb0b6b6f8ee83b133d95e3ab31a6c4ad2a64399f4ac19173f215ec7a79eac851c1348d9efa1efbfdc88a7ac8e5d6ecd9f9c067c4295b0a82f2662ef4c448
@@ -1 +1 @@
1
- 2.7.1
1
+ 3.0.0
data/Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
1
  # Define base image, you can use --build-arg
2
- ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.37.0_nodev15.2.1_rubyv3.0.0_sasscv2.4.0_ffiv1.13.1_chromedriver"
3
3
  FROM $base_image
4
4
 
5
5
  # Set locale
@@ -1,16 +1,18 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.15)
4
+ web_stat (0.4.1)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
7
  mechanize (>= 2.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
11
+ rexml (>= 3.2.4)
11
12
  ruby-readability (>= 0.7)
12
13
  sanitize (>= 5.0.0)
13
14
  selenium-webdriver (= 3.142.7)
15
+ webrick (>= 1.7.0)
14
16
 
15
17
  GEM
16
18
  remote: https://rubygems.org/
@@ -24,14 +26,13 @@ GEM
24
26
  cld (0.8.0)
25
27
  ffi
26
28
  coderay (1.1.3)
27
- connection_pool (2.2.3)
28
29
  crack (0.4.3)
29
30
  safe_yaml (~> 1.0.0)
30
31
  crass (1.0.6)
31
32
  diff-lcs (1.3)
32
33
  domain_name (0.5.20190701)
33
34
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.13.1)
35
+ ffi (1.14.2)
35
36
  guess_html_encoding (0.0.11)
36
37
  hashdiff (1.0.1)
37
38
  hashery (2.1.2)
@@ -49,16 +50,14 @@ GEM
49
50
  method_source (1.0.0)
50
51
  mime-types (3.3.1)
51
52
  mime-types-data (~> 3.2015)
52
- mime-types-data (3.2020.0512)
53
- mini_portile2 (2.4.0)
53
+ mime-types-data (3.2020.1104)
54
54
  natto (1.2.0)
55
55
  ffi (>= 1.9.0)
56
56
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (4.0.0)
58
- connection_pool (~> 2.2)
59
- nokogiri (1.10.9)
60
- mini_portile2 (~> 2.4.0)
61
- nokogumbo (2.0.2)
57
+ net-http-persistent (2.9.4)
58
+ nokogiri (1.11.1-x86_64-linux)
59
+ racc (~> 1.4)
60
+ nokogumbo (2.0.4)
62
61
  nokogiri (~> 1.8, >= 1.8.4)
63
62
  ntlm-http (0.1.1)
64
63
  pdf-reader (2.4.0)
@@ -74,7 +73,9 @@ GEM
74
73
  byebug (~> 11.0)
75
74
  pry (~> 0.13.0)
76
75
  public_suffix (4.0.5)
76
+ racc (1.5.2)
77
77
  rake (13.0.1)
78
+ rexml (3.2.4)
78
79
  rspec (3.9.0)
79
80
  rspec-core (~> 3.9.0)
80
81
  rspec-expectations (~> 3.9.0)
@@ -94,14 +95,14 @@ GEM
94
95
  nokogiri (>= 1.6.0)
95
96
  rubyzip (2.3.0)
96
97
  safe_yaml (1.0.5)
97
- sanitize (5.2.1)
98
+ sanitize (5.2.2)
98
99
  crass (~> 1.0.2)
99
100
  nokogiri (>= 1.8.0)
100
101
  nokogumbo (~> 2.0)
101
102
  selenium-webdriver (3.142.7)
102
103
  childprocess (>= 0.5, < 4.0)
103
104
  rubyzip (>= 1.2.2)
104
- ttfunk (1.6.2.1)
105
+ ttfunk (1.7.0)
105
106
  unf (0.1.4)
106
107
  unf_ext
107
108
  unf_ext (0.0.7.7)
@@ -109,6 +110,7 @@ GEM
109
110
  addressable (>= 2.3.6)
110
111
  crack (>= 0.3.2)
111
112
  hashdiff (>= 0.4.0, < 2.0.0)
113
+ webrick (1.7.0)
112
114
  webrobots (0.1.2)
113
115
 
114
116
  PLATFORMS
@@ -123,4 +125,4 @@ DEPENDENCIES
123
125
  webmock (>= 3.8.3)
124
126
 
125
127
  BUNDLED WITH
126
- 2.1.4
128
+ 2.2.4
data/README.md CHANGED
@@ -1,30 +1,7 @@
1
- # !!!!! Precautions when using with Rails !!!!!
2
-
3
- Write this line your Gemfile.
4
- ```
5
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
- ```
7
-
8
1
  # WebStat
9
2
 
10
3
  Fetch the web pages and stat.
11
4
 
12
- ## Requirements
13
-
14
- - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
15
- - add runtime dependency
16
- - "bundler", "~> 2.0"
17
- - "nokogiri", "~> 1.10"
18
- - "mechanize", "~> 2.7"
19
- - "ruby-readability", "~> 0.7"
20
- - "final_redirect_url", "~> 0.1.0"
21
- - "natto", "~> 1.1.2"
22
- - add development dependency
23
- - "rake", "~> 10.0"
24
- - "rspec", "~> 3.0"
25
- - "rake", "~> 10.0"
26
- - "rspec", "~> 3.0"
27
-
28
5
  ### Install mecab
29
6
 
30
7
  $ sudo apt install mecab-ipadic-utf8 libmecab
@@ -1,7 +1,7 @@
1
1
  development: &development
2
2
  # Minimum number of characters to detect meta title
3
3
  min_length_of_meta_title: 10
4
- # Split regular expression for titles
4
+ # Split regular expression for titles
5
5
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
6
  # User Agent
7
7
  user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
14
14
  - '//img/@src'
15
15
  userdic: ""
16
16
  use_chromedirver: false
17
+ thumbnail_regex:
18
+ youtube:
19
+ - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
20
+ - 'http://img.youtube.com/vi/\1/default.jpg'
17
21
  test:
18
22
  <<: *development
19
23
  production:
@@ -1,11 +1,5 @@
1
1
  module WebStat
2
2
  class Fetch
3
- THUMBNAIL_REGEXS = {
4
- :youtube => [
5
- %r{^https://www.youtube.com/watch\?v=([^&]+)},
6
- 'http://img.youtube.com/vi/\1/default.jpg'
7
- ]
8
- }
9
3
  attr_accessor :url, :html, :nokogiri, :userdic, :status
10
4
  # Get title
11
5
  # @return [String] title
@@ -40,7 +34,7 @@ module WebStat
40
34
  end
41
35
  # Get main section
42
36
  def content
43
- Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
37
+ Sanitize.clean(Readability::Document.new(@nokogiri.at('body').to_s).content)
44
38
  end
45
39
 
46
40
  # Get temporary path of image
@@ -54,12 +48,12 @@ module WebStat
54
48
  end
55
49
  end
56
50
  # If there is a thumbnail rule, apply it.
57
- THUMBNAIL_REGEXS.each do |provider, v|
51
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
58
52
  if @url.match(v[0])
59
53
  return @url.gsub(v[0], v[1])
60
54
  end
61
55
  end
62
- readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
56
+ readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
63
57
  if (path.nil? || path.empty?) && readability_content.xpath('//img').first
64
58
  path = readability_content.xpath('//img').first.attr('src')
65
59
  end
@@ -83,7 +77,7 @@ module WebStat
83
77
  File.open(tmp_file, "w+b") do |_file|
84
78
  if image.class == Mechanize::File
85
79
  _file.puts(image.body)
86
- else
80
+ elsif image.respond_to?(:body_io)
87
81
  _file.puts(image.body_io.read)
88
82
  end
89
83
  end
@@ -102,7 +96,16 @@ module WebStat
102
96
  raise Mechanize::RobotsDisallowedError.new(url)
103
97
  end
104
98
  if WebStat::Configure.get["use_chromedirver"]
105
- body = WebStat::WebDriverHelper.get_source(url)
99
+ begin
100
+ body = WebStat::WebDriverHelper.get_source(url)
101
+ rescue Selenium::WebDriver::Error::UnknownError => e
102
+ document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
103
+ if document.class == Mechanize::File
104
+ body = document.body
105
+ else
106
+ body = document.body.encode('UTF-8', document.encoding)
107
+ end
108
+ end
106
109
  @status = 200
107
110
  else
108
111
  document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.3.16"
2
+ VERSION = "0.4.1"
3
3
  end
@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
3
3
  configure = WebStat::Configure.get
4
4
  expect(configure).not_to eq nil
5
5
  end
6
-
6
+
7
7
  it "Readable Config" do
8
8
  config = WebStat::Configure.get
9
-
9
+
10
10
  expect(config["min_length_of_meta_title"]).to eq 10
11
11
  expect(config["regex_to_sprit_title"]).to eq '\||-|:|||:|〜|\~| – '
12
12
  end
13
+
14
+ it "Get thumbnail_regex.youtube." do
15
+ config = WebStat::Configure.get
16
+ expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
17
+ expect(config["thumbnail_regex"]["youtube"].count).to eq 2
18
+ end
19
+
20
+ it "Match youtube url." do
21
+ sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
22
+ WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
23
+ if sample_url.match(v[0])
24
+ expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
25
+ end
26
+ end
27
+ end
13
28
  end
@@ -29,6 +29,8 @@ Gem::Specification.new do |spec|
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
30
  spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
31
31
  spec.add_runtime_dependency "pdf-reader", "2.4.0"
32
+ spec.add_runtime_dependency "webrick", ">= 1.7.0"
33
+ spec.add_runtime_dependency "rexml", ">= 3.2.4"
32
34
 
33
35
  spec.add_development_dependency "rake", ">= 10.0"
34
36
  spec.add_development_dependency "rspec", ">= 3.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.16
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-07 00:00:00.000000000 Z
11
+ date: 2021-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,34 @@ dependencies:
136
136
  - - '='
137
137
  - !ruby/object:Gem::Version
138
138
  version: 2.4.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: webrick
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: 1.7.0
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.7.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rexml
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: 3.2.4
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 3.2.4
139
167
  - !ruby/object:Gem::Dependency
140
168
  name: rake
141
169
  requirement: !ruby/object:Gem::Requirement
@@ -274,7 +302,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
274
302
  - !ruby/object:Gem::Version
275
303
  version: '0'
276
304
  requirements: []
277
- rubygems_version: 3.1.2
305
+ rubygems_version: 3.2.3
278
306
  signing_key:
279
307
  specification_version: 4
280
308
  summary: Get the status of the web pages.