webget 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Rakefile +1 -1
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +105 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6912004e235fc4a3aad299490d04d9ab9d44ea5f98005a348aa7a456349cfbcb
|
4
|
+
data.tar.gz: f2b8bcda71c738557c76957eb4e6e78ee4af52ec9311e5c18a47f27baf2b45df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 40d7bf5f844e00b67554455f1c191a7f189169ad075bffd0b8f8888f856128ed1c1ece29bf1a5932c7d22dc841b9f3b2c2f8ce9640fbed70ba4952c0cd7aa2f7
|
7
|
+
data.tar.gz: 5e4d940c50db3f43bb10160a2019eaa7dfc8d56743bf8d5aa1ad8130db089a971d8eb18ab7913398a5606a9347288734d34b5e5ad19bce63b903de899bcd3f3a
|
data/CHANGELOG.md
CHANGED
data/Rakefile
CHANGED
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -2,6 +2,69 @@
|
|
2
2
|
|
3
3
|
module Webcache
|
4
4
|
|
5
|
+
|
6
|
+
class Headers   # nested class for convenience access to (meta) headers
  ## note - Time.httpdate is provided by the stdlib "time" extension
  ##   (and NOT by core Time); make sure it is loaded
  require 'time'

  ## expiry windows in seconds (60secs*60min*hours)
  SECONDS_12H = 60*60*12
  SECONDS_24H = 60*60*24


  ## Parse header text - one "key: value" pair per line - into a Headers object.
  ##
  ##  - blank lines and lines starting with # get skipped
  ##  - lines without any colon get skipped (no value to record)
  ##  - keys always get downcased; keys and values get stripped
  ##    from leading and trailing spaces
  ##  - duplicate keys do NOT replace the earlier value;
  ##    values get combined comma-separated (", ") in order of appearance
  ##
  ##  todo/check: multi-line (folded) headers are NOT handled for now
  ##
  ## @param txt [String] raw header text
  ## @return [Headers]
  def self.parse( txt )
    data = {}
    txt.each_line do |line|
      line = line.strip
      next if line.empty? || line.start_with?( '#' )

      key, value = line.split( ':', 2 )   ## split on first colon
      ## note - without a colon split returns a single item (value is nil);
      ##   skip the line instead of crashing on nil.strip
      next if value.nil?

      key   = key.strip.downcase
      value = value.strip

      data[ key ] = data.key?( key ) ? "#{data[ key ]}, #{value}" : value
    end
    new( data )
  end


  ## @param data [Hash] header key/value pairs (stored as is, not copied)
  def initialize( data )
    @data = data
  end

  ## @return [Hash] the underlying hash (same object, not a copy)
  def to_h() @data; end

  ## Look up a header value.
  ##  The key gets tried as passed in first; if there is no such key
  ##  the downcased key gets tried (parsed keys are always downcased),
  ##  that is, headers['Content-Type'] works same as headers['content-type'].
  ##
  ## @return [String, nil] nil if header not present
  def [](key)
    @data.fetch( key ) { @data[ key.to_s.downcase ] }
  end

  ## Call the block with (key, value) for every header;
  ##  without a block return an Enumerator (instead of crashing on nil.call).
  def each( &blk )
    return to_enum( :each )  unless blk

    @data.each do |key, value|
      blk.call( key, value )
    end
  end


  ## Return the date header as a Time (memoized once parsed).
  ##
  ##  parses the time as RFC 1123 date of HTTP-date defined by RFC 2616:
  ##     day-of-week, DD month-name CCYY hh:mm:ss GMT
  ##  !!! Note that the result is always UTC (GMT). !!!
  ##   e.g.  Sun, 19 May 2024 15:15:34 GMT
  ##         Mon, 10 Jun 2024 15:58:16 GMT
  ##
  ## @return [Time, nil] nil if there is no date header
  ## @raise [ArgumentError] if the date header is not a valid HTTP-date
  def date
    return @date  if @date

    value = @data['date']
    @date = Time.httpdate( value )  if value
    @date
  end

  ## Check if the date header is older than the passed-in cut-off time.
  ##   The cut-off defaults to 12h ago.
  ##   Without a date header the age is unknown; report as expired.
  ##
  ## @param expires_in_date [Time] cut-off time
  ## @return [Boolean]
  def expired?( expires_in_date=Time.now.utc-SECONDS_12H )
    header_date = date
    return true  if header_date.nil?

    expires_in_date > header_date
  end

  ## add convenience helpers - why? why not?
  def expired_in_12h?() expired?( Time.now.utc-SECONDS_12H ); end
  def expired_in_24h?() expired?( Time.now.utc-SECONDS_24H ); end
  alias_method :expired_in_1d?, :expired_in_24h?
end  # class Headers
|
65
|
+
|
66
|
+
|
67
|
+
|
5
68
|
#####
|
6
69
|
# copied from props gem, see Env.home
|
7
70
|
# - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
|
@@ -73,6 +136,26 @@ module Webcache
|
|
73
136
|
def self.read_json( url ) cache.read_json( url ); end
|
74
137
|
def self.read_csv( url ) cache.read_csv( url ); end
|
75
138
|
|
139
|
+
#### new - read (cached) meta data
|
140
|
+
## todo/check - find a better/different name - why? why not?
|
141
|
+
## e.g. read_headers or simply meta or headers or such
|
142
|
+
## Read the (cached) meta data for the url; delegates to cache.read_meta.
def self.read_meta( url )
  cache.read_meta( url )
end
|
143
|
+
|
144
|
+
## add convenience expire (shortcut) helpers
|
145
|
+
## Check if the cached copy for the url is expired.
##   expires_in is the cut-off time (defaults to 12h ago)
##   that gets passed along to the expired? check of the meta data.
def self.expired?( url, expires_in: Time.now.utc-60*60*12 )
  ## note - not in cache; expired by default
  return true  unless cached?( url )

  read_meta( url ).expired?( expires_in )
end
|
153
|
+
## shortcut - expired? with a cut-off of 12h ago (60secs*60min*12h)
def self.expired_in_12h?( url )
  cutoff = Time.now.utc - 60*60*12
  expired?( url, expires_in: cutoff )
end
|
154
|
+
## shortcut - expired? with a cut-off of 24h ago (60secs*60min*24h)
def self.expired_in_24h?( url )
  cutoff = Time.now.utc - 60*60*24
  expired?( url, expires_in: cutoff )
end
|
155
|
+
## expired_in_1d? is another name for expired_in_24h? (on the singleton / class-level)
singleton_class.alias_method :expired_in_1d?, :expired_in_24h?
|
158
|
+
|
76
159
|
|
77
160
|
|
78
161
|
class DiskCache
|
@@ -103,6 +186,15 @@ class DiskCache
|
|
103
186
|
end
|
104
187
|
|
105
188
|
|
189
|
+
## Read and parse the meta file recorded for the url.
##   The meta file path is the body path (cache root + url_to_path)
##   with .meta.txt added; the text gets read as utf-8
##   and returned parsed via Headers.parse.
def read_meta( url )
  meta_path = "#{Webcache.root}/#{url_to_path( url )}.meta.txt"
  Headers.parse( File.read( meta_path, mode: 'r:utf-8' ) )
end
|
196
|
+
|
197
|
+
|
106
198
|
## add more save / put / etc. aliases - why? why not?
|
107
199
|
## rename to record_html - why? why not?
|
108
200
|
def record( url, response,
|
@@ -177,7 +269,19 @@ class DiskCache
|
|
177
269
|
|
178
270
|
### special "prettify" rule for weltfussball
|
179
271
|
## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
|
180
|
-
|
272
|
+
|
273
|
+
### todo/fix - move rules downstream to user - why? why not?
|
274
|
+
|
275
|
+
if host_dir.index( 'uefa.com' ) ||
|
276
|
+
host_dir.index( 'kicker.de' ) ||
|
277
|
+
host_dir.index( 'kicekr.at' )
|
278
|
+
if req_path.end_with?( '/' )
|
279
|
+
req_path = "#{req_path[0..-2]}.html"
|
280
|
+
else
|
281
|
+
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
282
|
+
exit 1
|
283
|
+
end
|
284
|
+
elsif host_dir.index( 'weltfussball.de' ) ||
|
181
285
|
host_dir.index( 'worldfootball.net' )
|
182
286
|
if req_path.end_with?( '/' )
|
183
287
|
req_path = "#{req_path[0..-2]}.html"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -117,7 +117,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
117
117
|
requirements:
|
118
118
|
- - ">="
|
119
119
|
- !ruby/object:Gem::Version
|
120
|
-
version:
|
120
|
+
version: 3.1.0
|
121
121
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
122
|
requirements:
|
123
123
|
- - ">="
|