webget 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +2 -1
- data/lib/webget.rb +4 -0
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +25 -6
- data/lib/webget/webget.rb +30 -4
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 199c22178984860a75e48ecc68db3be4f3e1f210
|
4
|
+
data.tar.gz: ca9268c0e0650acee9125f333d567c8c3d65e3e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02e990893588bfef74a704a1de9b0c361e184759ba81f5579aa6724254dd8321bdb900ae4c50556c9b4f6f7cd02ea0f7c3e13ce16d7eb4da36f679d4285bc134
|
7
|
+
data.tar.gz: 0ddefec72ef2900b43dc4160ddb97b1a94640c08fefcb1d4c0f5656e93d46e106ac6768ed030bbe905cbf03db0d4532ee89fcb87d956b894de2e2649e37db1cc
|
data/Rakefile
CHANGED
data/lib/webget.rb
CHANGED
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,16 +54,18 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response, format: 'html' )
|
58
|
-
cache.record( url, response, format: format );
|
57
|
+
def self.record( url, response, encoding: 'UTF-8', format: 'html' )
|
58
|
+
cache.record( url, response, encoding: encoding, format: format );
|
59
59
|
end
|
60
60
|
def self.cached?( url ) cache.cached?( url ); end
|
61
61
|
class << self
|
62
62
|
alias_method :exist?, :cached?
|
63
63
|
end
|
64
|
-
def self.url_to_id( url ) cache.url_to_id( url ); end
|
64
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
65
65
|
def self.read( url ) cache.read( url ); end
|
66
66
|
def self.read_json( url ) cache.read_json( url ); end
|
67
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
68
|
+
|
67
69
|
|
68
70
|
|
69
71
|
class DiskCache
|
@@ -86,11 +88,17 @@ class DiskCache
|
|
86
88
|
data
|
87
89
|
end
|
88
90
|
|
91
|
+
def read_csv( url )
|
92
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
93
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
94
|
+
data = CsvHash.parse( txt )
|
95
|
+
data
|
96
|
+
end
|
89
97
|
|
90
98
|
|
91
99
|
## add more save / put / etc. aliases - why? why not?
|
92
100
|
## rename to record_html - why? why not?
|
93
|
-
def record( url, response, format: 'html' )
|
101
|
+
def record( url, response, encoding: 'UTF-8', format: 'html' )
|
94
102
|
|
95
103
|
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
96
104
|
meta_path = "#{body_path}.meta.txt"
|
@@ -102,14 +110,22 @@ class DiskCache
|
|
102
110
|
puts "[cache] saving #{body_path}..."
|
103
111
|
|
104
112
|
## todo/check: verify content-type - why? why not?
|
113
|
+
## note - for now respone.text always assume (converted) to utf8!!!!!!!!!
|
105
114
|
if format == 'json'
|
106
115
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
116
|
+
elsif format == 'csv'
|
117
|
+
## fix: newlines - always use "unix" style" - why? why not?
|
118
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
119
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
107
120
|
else
|
108
|
-
|
109
|
-
File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
|
121
|
+
text = response.text( encoding: encoding )
|
122
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
110
123
|
end
|
111
124
|
|
125
|
+
|
112
126
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
127
|
+
## todo/check:
|
128
|
+
## do headers also need to converted (like text) if encoding is NOT utf-8 ???
|
113
129
|
response.headers.each do |key, value| # iterate all response headers
|
114
130
|
f.write( "#{key}: #{value}" )
|
115
131
|
f.write( "\n" )
|
@@ -164,6 +180,9 @@ class DiskCache
|
|
164
180
|
.gsub( '=', '~')
|
165
181
|
|
166
182
|
req_path = "#{req_path}.html"
|
183
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
184
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
185
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
167
186
|
elsif host_dir.index( 'football-data.org' )
|
168
187
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
169
188
|
|
data/lib/webget/webget.rb
CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
19
19
|
|
20
20
|
|
21
21
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
25
25
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
40
40
|
response
|
41
41
|
end # method self.call
|
42
42
|
|
43
|
-
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
47
47
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
49
49
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response )
|
52
|
+
Webcache.record( url, response,
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
53
54
|
else
|
54
55
|
## todo/check - log error
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
@@ -60,5 +61,30 @@ class Webget # a web (go get) crawler
|
|
60
61
|
response
|
61
62
|
end # method self.page
|
62
63
|
|
64
|
+
|
65
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
66
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
67
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
68
|
+
puts " sleep #{config.sleep} sec(s)..."
|
69
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
70
|
+
|
71
|
+
response = Webclient.get( url, headers: headers )
|
72
|
+
|
73
|
+
if response.status.ok? ## must be HTTP 200
|
74
|
+
puts "#{response.status.code} #{response.status.message}"
|
75
|
+
Webcache.record( url, response,
|
76
|
+
encoding: encoding,
|
77
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
78
|
+
else
|
79
|
+
## todo/check - log error
|
80
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
81
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
82
|
+
end
|
83
|
+
|
84
|
+
## to be done / continued
|
85
|
+
response
|
86
|
+
end # method self.dataset
|
87
|
+
|
88
|
+
|
63
89
|
end # class Webget
|
64
90
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-11-
|
11
|
+
date: 2020-11-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -16,14 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csvreader
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.4
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rdoc
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|