webget 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +2 -1
- data/lib/webget.rb +4 -0
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +25 -6
- data/lib/webget/webget.rb +30 -4
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 199c22178984860a75e48ecc68db3be4f3e1f210
|
4
|
+
data.tar.gz: ca9268c0e0650acee9125f333d567c8c3d65e3e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02e990893588bfef74a704a1de9b0c361e184759ba81f5579aa6724254dd8321bdb900ae4c50556c9b4f6f7cd02ea0f7c3e13ce16d7eb4da36f679d4285bc134
|
7
|
+
data.tar.gz: 0ddefec72ef2900b43dc4160ddb97b1a94640c08fefcb1d4c0f5656e93d46e106ac6768ed030bbe905cbf03db0d4532ee89fcb87d956b894de2e2649e37db1cc
|
data/Rakefile
CHANGED
data/lib/webget.rb
CHANGED
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,16 +54,18 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response, format: 'html' )
|
58
|
-
cache.record( url, response, format: format );
|
57
|
+
def self.record( url, response, encoding: 'UTF-8', format: 'html' )
|
58
|
+
cache.record( url, response, encoding: encoding, format: format );
|
59
59
|
end
|
60
60
|
def self.cached?( url ) cache.cached?( url ); end
|
61
61
|
class << self
|
62
62
|
alias_method :exist?, :cached?
|
63
63
|
end
|
64
|
-
def self.url_to_id( url )
|
64
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
65
65
|
def self.read( url ) cache.read( url ); end
|
66
66
|
def self.read_json( url ) cache.read_json( url ); end
|
67
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
68
|
+
|
67
69
|
|
68
70
|
|
69
71
|
class DiskCache
|
@@ -86,11 +88,17 @@ class DiskCache
|
|
86
88
|
data
|
87
89
|
end
|
88
90
|
|
91
|
+
def read_csv( url )
|
92
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
93
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
94
|
+
data = CsvHash.parse( txt )
|
95
|
+
data
|
96
|
+
end
|
89
97
|
|
90
98
|
|
91
99
|
## add more save / put / etc. aliases - why? why not?
|
92
100
|
## rename to record_html - why? why not?
|
93
|
-
def record( url, response, format: 'html' )
|
101
|
+
def record( url, response, encoding: 'UTF-8', format: 'html' )
|
94
102
|
|
95
103
|
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
96
104
|
meta_path = "#{body_path}.meta.txt"
|
@@ -102,14 +110,22 @@ class DiskCache
|
|
102
110
|
puts "[cache] saving #{body_path}..."
|
103
111
|
|
104
112
|
## todo/check: verify content-type - why? why not?
|
113
|
+
## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
|
105
114
|
if format == 'json'
|
106
115
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
116
|
+
elsif format == 'csv'
|
117
|
+
## fix: newlines - always use "unix" style - why? why not?
|
118
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
119
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
107
120
|
else
|
108
|
-
|
109
|
-
File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
|
121
|
+
text = response.text( encoding: encoding )
|
122
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
110
123
|
end
|
111
124
|
|
125
|
+
|
112
126
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
127
|
+
## todo/check:
|
128
|
+
## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
|
113
129
|
response.headers.each do |key, value| # iterate all response headers
|
114
130
|
f.write( "#{key}: #{value}" )
|
115
131
|
f.write( "\n" )
|
@@ -164,6 +180,9 @@ class DiskCache
|
|
164
180
|
.gsub( '=', '~')
|
165
181
|
|
166
182
|
req_path = "#{req_path}.html"
|
183
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
184
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
185
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
167
186
|
elsif host_dir.index( 'football-data.org' )
|
168
187
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
169
188
|
|
data/lib/webget/webget.rb
CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
19
19
|
|
20
20
|
|
21
21
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
25
25
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
40
40
|
response
|
41
41
|
end # method self.call
|
42
42
|
|
43
|
-
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
47
47
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
49
49
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response )
|
52
|
+
Webcache.record( url, response,
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
53
54
|
else
|
54
55
|
## todo/check - log error
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
@@ -60,5 +61,30 @@ class Webget # a web (go get) crawler
|
|
60
61
|
response
|
61
62
|
end # method self.page
|
62
63
|
|
64
|
+
|
65
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
66
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
67
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
68
|
+
puts " sleep #{config.sleep} sec(s)..."
|
69
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
70
|
+
|
71
|
+
response = Webclient.get( url, headers: headers )
|
72
|
+
|
73
|
+
if response.status.ok? ## must be HTTP 200
|
74
|
+
puts "#{response.status.code} #{response.status.message}"
|
75
|
+
Webcache.record( url, response,
|
76
|
+
encoding: encoding,
|
77
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
78
|
+
else
|
79
|
+
## todo/check - log error
|
80
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
81
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
82
|
+
end
|
83
|
+
|
84
|
+
## to be done / continued
|
85
|
+
response
|
86
|
+
end # method self.dataset
|
87
|
+
|
88
|
+
|
63
89
|
end # class Webget
|
64
90
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-11-
|
11
|
+
date: 2020-11-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -16,14 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csvreader
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.4
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rdoc
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|