web_stat 0.4.0 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 29d04d8379978d1e295b829193a330fd9bab8cb92326e83aab39d066059dcb02
4
- data.tar.gz: c2df9896f21ec9d777dea101a1c4c4be513ad28c11428fc1104ccefe08f98b40
3
+ metadata.gz: 3ce60bcf4a31f90024abc35cf1ecc57e32626dfb5ecf4bc4f2280bd72931ff34
4
+ data.tar.gz: 575f805a63a995b2d0e3bc909978dcd4e3b4f15462f717cf83d6aac00c96078d
5
5
  SHA512:
6
- metadata.gz: 728bc129ddced4cc58081ca4d8c02b761ba77e4adf277b736115ce51b1e7293f7b208adc5742fe50ba4e406f2a7c3f65af19d5c55f0cf0188a4a4b25704719bd
7
- data.tar.gz: ef8ee476834a6fb75fed33476d070e1b00044f9115b85eaf3148f1901af10e4cd32f79ef3b369e59366a56ab7e8560180da3def8562e2b396f2b8b6bfc26196b
6
+ metadata.gz: 7af1262b25163205eabdfa26e1671f95d53963387eceec6c2c99da0a3a17359b77aaa097ec9556010b8000ecc0772d1dc19a67d128310c8d6bdd8379d008a913
7
+ data.tar.gz: 855706ad8525609e8a2a50ca64091081794940d69187f951a79c02c66acfa5be9f53cdeacf0f99f6c0431ad41a71ce616691590d40bce0a56576275fba96c453
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- web_stat (0.3.19)
4
+ web_stat (0.4.6)
5
5
  bundler (>= 2.0.2)
6
6
  cld (>= 0.8.0)
7
- mechanize (>= 2.7)
7
+ mechanize (>= 2.7.7)
8
8
  natto (>= 1.1.2)
9
9
  nokogiri (>= 1.10.4)
10
10
  pdf-reader (= 2.4.0)
@@ -26,10 +26,11 @@ GEM
26
26
  cld (0.8.0)
27
27
  ffi
28
28
  coderay (1.1.3)
29
- crack (0.4.3)
30
- safe_yaml (~> 1.0.0)
29
+ connection_pool (2.2.3)
30
+ crack (0.4.5)
31
+ rexml
31
32
  crass (1.0.6)
32
- diff-lcs (1.3)
33
+ diff-lcs (1.4.4)
33
34
  domain_name (0.5.20190701)
34
35
  unf (>= 0.0.5, < 1.0.0)
35
36
  ffi (1.14.2)
@@ -38,7 +39,7 @@ GEM
38
39
  hashery (2.1.2)
39
40
  http-cookie (1.0.3)
40
41
  domain_name (~> 0.5)
41
- mechanize (2.7.6)
42
+ mechanize (2.7.7)
42
43
  domain_name (~> 0.5, >= 0.5.1)
43
44
  http-cookie (~> 1.0)
44
45
  mime-types (>= 1.17.2)
@@ -46,16 +47,20 @@ GEM
46
47
  net-http-persistent (>= 2.5.2)
47
48
  nokogiri (~> 1.6)
48
49
  ntlm-http (~> 0.1, >= 0.1.1)
50
+ webrick (~> 1.7)
49
51
  webrobots (>= 0.0.9, < 0.2)
50
52
  method_source (1.0.0)
51
53
  mime-types (3.3.1)
52
54
  mime-types-data (~> 3.2015)
53
- mime-types-data (3.2020.1104)
55
+ mime-types-data (3.2021.0212)
56
+ mini_portile2 (2.5.0)
54
57
  natto (1.2.0)
55
58
  ffi (>= 1.9.0)
56
59
  net-http-digest_auth (1.4.1)
57
- net-http-persistent (2.9.4)
58
- nokogiri (1.11.1-x86_64-linux)
60
+ net-http-persistent (4.0.1)
61
+ connection_pool (~> 2.2)
62
+ nokogiri (1.11.1)
63
+ mini_portile2 (~> 2.5.0)
59
64
  racc (~> 1.4)
60
65
  nokogumbo (2.0.4)
61
66
  nokogiri (~> 1.8, >= 1.8.4)
@@ -72,30 +77,29 @@ GEM
72
77
  pry-byebug (3.9.0)
73
78
  byebug (~> 11.0)
74
79
  pry (~> 0.13.0)
75
- public_suffix (4.0.5)
80
+ public_suffix (4.0.6)
76
81
  racc (1.5.2)
77
- rake (13.0.1)
82
+ rake (13.0.3)
78
83
  rexml (3.2.4)
79
- rspec (3.9.0)
80
- rspec-core (~> 3.9.0)
81
- rspec-expectations (~> 3.9.0)
82
- rspec-mocks (~> 3.9.0)
83
- rspec-core (3.9.2)
84
- rspec-support (~> 3.9.3)
85
- rspec-expectations (3.9.2)
84
+ rspec (3.10.0)
85
+ rspec-core (~> 3.10.0)
86
+ rspec-expectations (~> 3.10.0)
87
+ rspec-mocks (~> 3.10.0)
88
+ rspec-core (3.10.1)
89
+ rspec-support (~> 3.10.0)
90
+ rspec-expectations (3.10.1)
86
91
  diff-lcs (>= 1.2.0, < 2.0)
87
- rspec-support (~> 3.9.0)
88
- rspec-mocks (3.9.1)
92
+ rspec-support (~> 3.10.0)
93
+ rspec-mocks (3.10.2)
89
94
  diff-lcs (>= 1.2.0, < 2.0)
90
- rspec-support (~> 3.9.0)
91
- rspec-support (3.9.3)
95
+ rspec-support (~> 3.10.0)
96
+ rspec-support (3.10.2)
92
97
  ruby-rc4 (0.1.5)
93
98
  ruby-readability (0.7.0)
94
99
  guess_html_encoding (>= 0.0.4)
95
100
  nokogiri (>= 1.6.0)
96
101
  rubyzip (2.3.0)
97
- safe_yaml (1.0.5)
98
- sanitize (5.2.2)
102
+ sanitize (5.2.3)
99
103
  crass (~> 1.0.2)
100
104
  nokogiri (>= 1.8.0)
101
105
  nokogumbo (~> 2.0)
@@ -106,7 +110,7 @@ GEM
106
110
  unf (0.1.4)
107
111
  unf_ext
108
112
  unf_ext (0.0.7.7)
109
- webmock (3.8.3)
113
+ webmock (3.11.2)
110
114
  addressable (>= 2.3.6)
111
115
  crack (>= 0.3.2)
112
116
  hashdiff (>= 0.4.0, < 2.0.0)
@@ -1,6 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
- attr_accessor :url, :html, :nokogiri, :userdic, :status
3
+ attr_accessor :url, :html, :nokogiri, :userdic, :status, :header
4
4
  # Get title
5
5
  # @return [String] title
6
6
  def title
@@ -95,11 +95,13 @@ module WebStat
95
95
  if mech.agent.robots_disallowed?(url)
96
96
  raise Mechanize::RobotsDisallowedError.new(url)
97
97
  end
98
- if WebStat::Configure.get["use_chromedirver"]
98
+ document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
99
+ @header = document.header
100
+ begin
101
+ raise 'not_use_chromedirver' unless WebStat::Configure.get["use_chromedirver"]
99
102
  body = WebStat::WebDriverHelper.get_source(url)
100
103
  @status = 200
101
- else
102
- document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
104
+ rescue
103
105
  if document.class == Mechanize::File
104
106
  body = document.body
105
107
  else
@@ -113,6 +115,24 @@ module WebStat
113
115
  end
114
116
  body
115
117
  end
118
+
119
+ # Return the later of the Date and Last-Modified response headers (or whichever one is present).
120
+ # @param [String] url
121
+ # @return [DateTime]
122
+ def get_last_modified
123
+ @header = @header || {}
124
+ if @header.has_key?("date") && @header.has_key?("last-modified")
125
+ if DateTime.parse(@header["date"]) >= DateTime.parse(@header["last-modified"])
126
+ DateTime.parse(@header["date"])
127
+ else
128
+ DateTime.parse(@header["last-modified"])
129
+ end
130
+ elsif @header.has_key?("date")
131
+ DateTime.parse(@header["date"])
132
+ elsif @header.has_key?("last-modified")
133
+ DateTime.parse(@header["last-modified"])
134
+ end
135
+ end
116
136
 
117
137
  # Get the information for @url
118
138
  # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
@@ -133,6 +153,7 @@ module WebStat
133
153
  language_code: language_code,
134
154
  status: @status,
135
155
  url: @url,
156
+ last_modified_at: get_last_modified,
136
157
  eyecatch_image_path: save_local_path(eyecatch_image_path),
137
158
  tags: tag.nouns
138
159
  }
@@ -5,7 +5,7 @@ module WebStat
5
5
  # initialize class
6
6
  # @param [String] url
7
7
  def initialize(url)
8
- unless url_valid?(url)
8
+ unless FetchAsWeb.url_valid?(url)
9
9
  raise WebStat::INVALID_URL, url
10
10
  end
11
11
  @url = original_url(url)
@@ -36,11 +36,12 @@ module WebStat
36
36
  end
37
37
  @nokogiri = ::Nokogiri::HTML(@html)
38
38
  end
39
-
40
- # Validation url
41
- def url_valid?(url)
42
- regexp = Regexp.new("^https?://([a-zA-Z0-9][a-zA-Z0-9\\\-\.]{1,61}[a-zA-Z0-9])\\\.([a-zA-Z]{2,})(.*)?$", Regexp::IGNORECASE)
43
- regexp.match?(url)
44
- end
39
+ class << self
40
+ # Validation url
41
+ def url_valid?(url)
42
+ regexp = Regexp.new("^https?://([a-z0-9][a-z0-9\\\-\.]{0,61})\\\.([a-z]{2,})(.*)?$", Regexp::IGNORECASE)
43
+ regexp.match?(url)
44
+ end
45
+ end
45
46
  end
46
47
  end
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.4.0"
3
- end
2
+ VERSION = "0.4.6"
3
+ end
data/spec/spec_helper.rb CHANGED
@@ -102,4 +102,10 @@ WebMock.stub_request(:get, "https://cdn.newsdict.jp/assets/newsdict-5d8601394c3f
102
102
  .to_return(
103
103
  status: 200,
104
104
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "images", "newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png")),
105
- headers: {content_type: 'application/html; charset=utf-8'})
105
+ headers: {content_type: 'application/html; charset=utf-8'})
106
+
107
+ WebMock.stub_request(:get, "https://newsdict.blog/last_modified_at")
108
+ .to_return(
109
+ status: 200,
110
+ body: "ok",
111
+ headers: {content_type: 'application/html; charset=utf-8', date: "Tue, 05 Apr 2016 07:43:08 GMT", "Last-Modified": "Tue, 05 Apr 2020 07:43:08 JST"})
@@ -197,11 +197,16 @@ RSpec.describe WebStat::Fetch do
197
197
  end
198
198
 
199
199
  it "valid url" do
200
- web_stat_fetch_web_class = WebStat::FetchAsWeb.new("https://newsdict.blog/content/images/size/w100/2019/03/facebook-3.jpg")
201
- expect(web_stat_fetch_web_class.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
202
- expect(web_stat_fetch_web_class.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
203
- expect(web_stat_fetch_web_class.url_valid?("https://www.meetup.com/pro/docker")).to be true
204
- expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
205
- expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
200
+ expect(WebStat::FetchAsWeb.url_valid?("http://status.aws.amazon.com/#cloudfront_12345")).to be true
201
+ expect(WebStat::FetchAsWeb.url_valid?("https://findy-code.io?h=NWsZey5UgJ51u&t=omikuji-22")).to be true
202
+ expect(WebStat::FetchAsWeb.url_valid?("https://www.meetup.com/pro/docker")).to be true
203
+ expect(WebStat::FetchAsWeb.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
204
+ expect(WebStat::FetchAsWeb.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
205
+ expect(WebStat::FetchAsWeb.url_valid?("http://g.co/arts/SK1jZHJpT8N1BGaM7")).to be true
206
+ end
207
+
208
+ it "get_last_modified" do
209
+ web_stat = WebStat::FetchAsWeb.new("https://newsdict.blog/last_modified_at")
210
+ web_stat.stat[:last_modified_at] === DateTime.parse("Tue, 05 Apr 2020 07:43:08 JST")
206
211
  end
207
212
  end
data/web_stat.gemspec CHANGED
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
22
22
 
23
23
  spec.add_runtime_dependency "bundler", ">= 2.0.2"
24
24
  spec.add_runtime_dependency "nokogiri", ">= 1.10.4"
25
- spec.add_runtime_dependency "mechanize", ">= 2.7"
25
+ spec.add_runtime_dependency "mechanize", ">= 2.7.7"
26
26
  spec.add_runtime_dependency "ruby-readability", ">= 0.7"
27
27
  spec.add_runtime_dependency "natto", ">= 1.1.2"
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-09 00:00:00.000000000 Z
11
+ date: 2021-02-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '2.7'
47
+ version: 2.7.7
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '2.7'
54
+ version: 2.7.7
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: ruby-readability
57
57
  requirement: !ruby/object:Gem::Requirement