web_stat 0.4.1 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fc357cc98a214af7a391fe834ec927b43649ca8d8c5b1105e7fb1947dd7127a
4
- data.tar.gz: f7c5f6d28101ff677aaa4db5b9b486d067798b0a8461f91352db252f68374513
3
+ metadata.gz: 234586f7faedd6315e64b118b3f63532d2ea64f5a56fa046c78d56ae3be52935
4
+ data.tar.gz: 71b73587d78fa05da29ffe15c2c381e3fccd31b76025060bbeedcff1f5ea5cf5
5
5
  SHA512:
6
- metadata.gz: 1616c4f1a64ebc693ee4e822b06150028783736448850d8aca80c90909ca879580a82f871f95375904b38813b10155105fd5d1121ed0321ae9cc898a4b95549e
7
- data.tar.gz: 1637fb0b6b6f8ee83b133d95e3ab31a6c4ad2a64399f4ac19173f215ec7a79eac851c1348d9efa1efbfdc88a7ac8e5d6ecd9f9c067c4295b0a82f2662ef4c448
6
+ metadata.gz: 34b7f3a68413b53865c2c5e57dc78ab2627689e656ed6ebc9d69938804cbbbf490e4148f2e0e44afbd8da4c9052423427bd053f90bf24fe65195c4ded0ede57d
7
+ data.tar.gz: ae11480e79de83e5c082e3663fa6570a64e85da5739b3c009a500ad4bd88bbd95f9fa7045362f211aff014f3e3aed9d363158f91e33ab87d60fc09e9dc26b3d3
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.4.1)
4
+ web_stat (0.4.7)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
- mechanize (>= 2.7)
7
+ mechanize (>= 2.7.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
@@ -26,10 +26,11 @@ GEM
26
26
  cld (0.8.0)
27
27
  ffi
28
28
  coderay (1.1.3)
29
- crack (0.4.3)
30
- safe_yaml (~> 1.0.0)
29
+ connection_pool (2.2.3)
30
+ crack (0.4.5)
31
+ rexml
31
32
  crass (1.0.6)
32
- diff-lcs (1.3)
33
+ diff-lcs (1.4.4)
33
34
  domain_name (0.5.20190701)
34
35
  unf (>= 0.0.5, < 1.0.0)
35
36
  ffi (1.14.2)
@@ -38,7 +39,7 @@ GEM
38
39
  hashery (2.1.2)
39
40
  http-cookie (1.0.3)
40
41
  domain_name (~> 0.5)
41
- mechanize (2.7.6)
42
+ mechanize (2.7.7)
42
43
  domain_name (~> 0.5, >= 0.5.1)
43
44
  http-cookie (~> 1.0)
44
45
  mime-types (>= 1.17.2)
@@ -46,15 +47,17 @@ GEM
46
47
  net-http-persistent (>= 2.5.2)
47
48
  nokogiri (~> 1.6)
48
49
  ntlm-http (~> 0.1, >= 0.1.1)
50
+ webrick (~> 1.7)
49
51
  webrobots (>= 0.0.9, < 0.2)
50
52
  method_source (1.0.0)
51
53
  mime-types (3.3.1)
52
54
  mime-types-data (~> 3.2015)
53
- mime-types-data (3.2020.1104)
55
+ mime-types-data (3.2021.0225)
54
56
  natto (1.2.0)
55
57
  ffi (>= 1.9.0)
56
58
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (2.9.4)
59
+ net-http-persistent (4.0.1)
60
+ connection_pool (~> 2.2)
58
61
  nokogiri (1.11.1-x86_64-linux)
59
62
  racc (~> 1.4)
60
63
  nokogumbo (2.0.4)
@@ -72,30 +75,29 @@ GEM
72
75
  pry-byebug (3.9.0)
73
76
  byebug (~> 11.0)
74
77
  pry (~> 0.13.0)
75
- public_suffix (4.0.5)
78
+ public_suffix (4.0.6)
76
79
  racc (1.5.2)
77
- rake (13.0.1)
80
+ rake (13.0.3)
78
81
  rexml (3.2.4)
79
- rspec (3.9.0)
80
- rspec-core (~> 3.9.0)
81
- rspec-expectations (~> 3.9.0)
82
- rspec-mocks (~> 3.9.0)
83
- rspec-core (3.9.2)
84
- rspec-support (~> 3.9.3)
85
- rspec-expectations (3.9.2)
82
+ rspec (3.10.0)
83
+ rspec-core (~> 3.10.0)
84
+ rspec-expectations (~> 3.10.0)
85
+ rspec-mocks (~> 3.10.0)
86
+ rspec-core (3.10.1)
87
+ rspec-support (~> 3.10.0)
88
+ rspec-expectations (3.10.1)
86
89
  diff-lcs (>= 1.2.0, < 2.0)
87
- rspec-support (~> 3.9.0)
88
- rspec-mocks (3.9.1)
90
+ rspec-support (~> 3.10.0)
91
+ rspec-mocks (3.10.2)
89
92
  diff-lcs (>= 1.2.0, < 2.0)
90
- rspec-support (~> 3.9.0)
91
- rspec-support (3.9.3)
93
+ rspec-support (~> 3.10.0)
94
+ rspec-support (3.10.2)
92
95
  ruby-rc4 (0.1.5)
93
96
  ruby-readability (0.7.0)
94
97
  guess_html_encoding (>= 0.0.4)
95
98
  nokogiri (>= 1.6.0)
96
99
  rubyzip (2.3.0)
97
- safe_yaml (1.0.5)
98
- sanitize (5.2.2)
100
+ sanitize (5.2.3)
99
101
  crass (~> 1.0.2)
100
102
  nokogiri (>= 1.8.0)
101
103
  nokogumbo (~> 2.0)
@@ -106,7 +108,7 @@ GEM
106
108
  unf (0.1.4)
107
109
  unf_ext
108
110
  unf_ext (0.0.7.7)
109
- webmock (3.8.3)
111
+ webmock (3.11.2)
110
112
  addressable (>= 2.3.6)
111
113
  crack (>= 0.3.2)
112
114
  hashdiff (>= 0.4.0, < 2.0.0)
@@ -1,6 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
- attr_accessor :url, :html, :nokogiri, :userdic, :status
3
+ attr_accessor :url, :html, :nokogiri, :userdic, :status, :header
4
4
  # Get title
5
5
  # @return [String] title
6
6
  def title
@@ -82,6 +82,8 @@ module WebStat
82
82
  end
83
83
  end
84
84
  tmp_file
85
+ rescue
86
+ false
85
87
  end
86
88
 
87
89
  # Get url
@@ -95,20 +97,13 @@ module WebStat
95
97
  if mech.agent.robots_disallowed?(url)
96
98
  raise Mechanize::RobotsDisallowedError.new(url)
97
99
  end
98
- if WebStat::Configure.get["use_chromedirver"]
99
- begin
100
- body = WebStat::WebDriverHelper.get_source(url)
101
- rescue Selenium::WebDriver::Error::UnknownError => e
102
- document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
103
- if document.class == Mechanize::File
104
- body = document.body
105
- else
106
- body = document.body.encode('UTF-8', document.encoding)
107
- end
108
- end
100
+ document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
101
+ @header = document.header
102
+ begin
103
+ raise 'not_use_chromedirver' unless WebStat::Configure.get["use_chromedirver"]
104
+ body = WebStat::WebDriverHelper.get_source(url)
109
105
  @status = 200
110
- else
111
- document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
106
+ rescue
112
107
  if document.class == Mechanize::File
113
108
  body = document.body
114
109
  else
@@ -122,6 +117,24 @@ module WebStat
122
117
  end
123
118
  body
124
119
  end
120
+
121
+ # Return Date or last modified header.
122
+ # @param [String] url
123
+ # @return DataTime
124
+ def get_last_modified
125
+ @header = @header || {}
126
+ if @header.has_key?("date") && @header.has_key?("last-modified")
127
+ if DateTime.parse(@header["date"]) >= DateTime.parse(@header["last-modified"])
128
+ DateTime.parse(@header["date"])
129
+ else
130
+ DateTime.parse(@header["last-modified"])
131
+ end
132
+ elsif @header.has_key?("date")
133
+ DateTime.parse(@header["date"])
134
+ elsif @header.has_key?("last-modified")
135
+ DateTime.parse(@header["last-modified"])
136
+ end
137
+ end
125
138
 
126
139
  # Get the informations of @url
127
140
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
@@ -142,6 +155,7 @@ module WebStat
142
155
  language_code: language_code,
143
156
  status: @status,
144
157
  url: @url,
158
+ last_modified_at: get_last_modified,
145
159
  eyecatch_image_path: save_local_path(eyecatch_image_path),
146
160
  tags: tag.nouns
147
161
  }
@@ -5,7 +5,7 @@ module WebStat
5
5
  # initialize class
6
6
  # @param [String] url
7
7
  def initialize(url)
8
- unless url_valid?(url)
8
+ unless FetchAsWeb.url_valid?(url)
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
@@ -36,11 +36,12 @@ module WebStat
36
36
  end
37
37
  @nokogiri = ::Nokogiri::HTML(@html)
38
38
  end
39
-
40
- # Validation url
41
- def url_valid?(url)
42
- regexp = Regexp.new("^https?://([a-zA-Z0-9][a-zA-Z0-9\\\-\.]{1,61}[a-zA-Z0-9])\\\.([a-zA-Z]{2,})(.*)?$", Regexp::IGNORECASE)
43
- regexp.match?(url)
44
- end
39
+ class << self
40
+ # Validation url
41
+ def url_valid?(url)
42
+ regexp = Regexp.new("^https?://([a-z0-9][a-z0-9\\\-\.]{0,61})\\\.([a-z]{2,})(.*)?$", Regexp::IGNORECASE)
43
+ regexp.match?(url)
44
+ end
45
+ end
45
46
  end
46
47
  end
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.7"
3
3
  end
data/spec/spec_helper.rb CHANGED
@@ -102,4 +102,10 @@ WebMock.stub_request(:get, "https://cdn.newsdict.jp/assets/newsdict-5d8601394c3f
102
102
  .to_return(
103
103
  status: 200,
104
104
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "images", "newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png")),
105
- headers: {content_type: 'application/html; charset=utf-8'})
105
+ headers: {content_type: 'application/html; charset=utf-8'})
106
+
107
+ WebMock.stub_request(:get, "https://newsdict.blog/last_modified_at")
108
+ .to_return(
109
+ status: 200,
110
+ body: "ok",
111
+ headers: {content_type: 'application/html; charset=utf-8', date: "Tue, 05 Apr 2016 07:43:08 GMT", "Last-Modified": "Tue, 05 Apr 2020 07:43:08 JST"})
@@ -197,11 +197,16 @@ RSpec.describe WebStat::Fetch do
197
197
  end
198
198
 
199
199
  it "valid url" do
200
- web_stat_fetch_web_class = WebStat::FetchAsWeb.new("https://newsdict.blog/content/images/size/w100/2019/03/facebook-3.jpg")
201
- expect(web_stat_fetch_web_class.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
202
- expect(web_stat_fetch_web_class.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
203
- expect(web_stat_fetch_web_class.url_valid?("https://www.meetup.com/pro/docker")).to be true
204
- expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
205
- expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
200
+ expect(WebStat::FetchAsWeb.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
201
+ expect(WebStat::FetchAsWeb.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
202
+ expect(WebStat::FetchAsWeb.url_valid?("https://www.meetup.com/pro/docker")).to be true
203
+ expect(WebStat::FetchAsWeb.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
204
+ expect(WebStat::FetchAsWeb.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
205
+ expect(WebStat::FetchAsWeb.url_valid?("http://g.co/arts/SK1jZHJpT8N1BGaM7")).to be true
206
+ end
207
+
208
+ it "get_last_modified" do
209
+ web_stat = WebStat::FetchAsWeb.new("https://newsdict.blog/last_modified_at")
210
+ web_stat.stat[:last_modified_at] === DateTime.parse("Tue, 05 Apr 2020 07:43:08 JST")
206
211
  end
207
212
  end
data/web_stat.gemspec CHANGED
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
22
22
 
23
23
  spec.add_runtime_dependency "bundler", ">= 2.0.2"
24
24
  spec.add_runtime_dependency "nokogiri", ">= 1.10.4"
25
- spec.add_runtime_dependency "mechanize", ">= 2.7"
25
+ spec.add_runtime_dependency "mechanize", ">= 2.7.7"
26
26
  spec.add_runtime_dependency "ruby-readability", ">= 0.7"
27
27
  spec.add_runtime_dependency "natto", ">= 1.1.2"
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-31 00:00:00.000000000 Z
11
+ date: 2021-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '2.7'
47
+ version: 2.7.7
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '2.7'
54
+ version: 2.7.7
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: ruby-readability
57
57
  requirement: !ruby/object:Gem::Requirement