webget 0.2.0 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Rakefile +2 -1
- data/lib/webget.rb +4 -0
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +67 -13
- data/lib/webget/webget.rb +54 -4
- metadata +22 -9
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
|
|
4
|
+
data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
|
|
7
|
+
data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
|
data/Rakefile
CHANGED
data/lib/webget.rb
CHANGED
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
|
@@ -54,16 +54,24 @@ module Webcache
|
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
|
56
56
|
|
|
57
|
-
def self.record( url, response,
|
|
58
|
-
|
|
57
|
+
def self.record( url, response,
|
|
58
|
+
path: nil,
|
|
59
|
+
encoding: 'UTF-8',
|
|
60
|
+
format: 'html' )
|
|
61
|
+
cache.record( url, response,
|
|
62
|
+
path: path,
|
|
63
|
+
encoding: encoding,
|
|
64
|
+
format: format );
|
|
59
65
|
end
|
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
|
61
67
|
class << self
|
|
62
68
|
alias_method :exist?, :cached?
|
|
63
69
|
end
|
|
64
|
-
def self.url_to_id( url )
|
|
70
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
|
65
71
|
def self.read( url ) cache.read( url ); end
|
|
66
72
|
def self.read_json( url ) cache.read_json( url ); end
|
|
73
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
|
74
|
+
|
|
67
75
|
|
|
68
76
|
|
|
69
77
|
class DiskCache
|
|
@@ -86,13 +94,22 @@ class DiskCache
|
|
|
86
94
|
data
|
|
87
95
|
end
|
|
88
96
|
|
|
97
|
+
def read_csv( url )
|
|
98
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
99
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
|
100
|
+
data = CsvHash.parse( txt )
|
|
101
|
+
data
|
|
102
|
+
end
|
|
89
103
|
|
|
90
104
|
|
|
91
105
|
## add more save / put / etc. aliases - why? why not?
|
|
92
106
|
## rename to record_html - why? why not?
|
|
93
|
-
def record( url, response,
|
|
107
|
+
def record( url, response,
|
|
108
|
+
path: nil,
|
|
109
|
+
encoding: 'UTF-8',
|
|
110
|
+
format: 'html' )
|
|
94
111
|
|
|
95
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
|
96
113
|
meta_path = "#{body_path}.meta.txt"
|
|
97
114
|
|
|
98
115
|
## make sure path exists
|
|
@@ -102,14 +119,23 @@ class DiskCache
|
|
|
102
119
|
puts "[cache] saving #{body_path}..."
|
|
103
120
|
|
|
104
121
|
## todo/check: verify content-type - why? why not?
|
|
122
|
+
## note - for now response.text is always assumed to be (converted to) utf-8!
|
|
105
123
|
if format == 'json'
|
|
106
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
|
107
|
-
|
|
108
|
-
##
|
|
109
|
-
|
|
125
|
+
elsif format == 'csv'
|
|
126
|
+
## fix: newlines - always use "unix" style - why? why not?
|
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
|
128
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
|
129
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
|
130
|
+
else ## html or txt
|
|
131
|
+
text = response.text( encoding: encoding )
|
|
132
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
|
110
133
|
end
|
|
111
134
|
|
|
135
|
+
|
|
112
136
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
|
137
|
+
## todo/check:
|
|
138
|
+
## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
|
|
113
139
|
response.headers.each do |key, value| # iterate all response headers
|
|
114
140
|
f.write( "#{key}: #{value}" )
|
|
115
141
|
f.write( "\n" )
|
|
@@ -125,7 +151,7 @@ class DiskCache
|
|
|
125
151
|
|
|
126
152
|
|
|
127
153
|
### helpers
|
|
128
|
-
def url_to_path( str )
|
|
154
|
+
def url_to_path( str, path: nil )
|
|
129
155
|
## map url to file path
|
|
130
156
|
uri = URI.parse( str )
|
|
131
157
|
|
|
@@ -134,10 +160,14 @@ class DiskCache
|
|
|
134
160
|
## always downcase for now (internet domain is case insensitive)
|
|
135
161
|
host_dir = uri.host.downcase
|
|
136
162
|
|
|
137
|
-
## "
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
|
164
|
+
path
|
|
165
|
+
else
|
|
166
|
+
## "/this/is/everything?query=params"
|
|
167
|
+
## cut-off leading slash and
|
|
168
|
+
## convert query ? =
|
|
169
|
+
uri.request_uri[1..-1]
|
|
170
|
+
end
|
|
141
171
|
|
|
142
172
|
|
|
143
173
|
|
|
@@ -151,6 +181,25 @@ class DiskCache
|
|
|
151
181
|
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
|
152
182
|
exit 1
|
|
153
183
|
end
|
|
184
|
+
elsif host_dir.index( 'tipp3.at' )
|
|
185
|
+
req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
|
|
186
|
+
|
|
187
|
+
## change ? to -I-
|
|
188
|
+
## change = to ~
|
|
189
|
+
## Example:
|
|
190
|
+
## sportwetten/classicresults.jsp?oddsetProgramID=888
|
|
191
|
+
## =>
|
|
192
|
+
## sportwetten/classicresults-I-oddsetProgramID~888
|
|
193
|
+
req_path = req_path.gsub( '?', '-I-' )
|
|
194
|
+
.gsub( '=', '~')
|
|
195
|
+
|
|
196
|
+
req_path = "#{req_path}.html"
|
|
197
|
+
elsif host_dir.index( 'fbref.com' )
|
|
198
|
+
req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
|
|
199
|
+
req_path = "#{req_path}.html" # auto-add html extension
|
|
200
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
|
201
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
|
202
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
|
154
203
|
elsif host_dir.index( 'football-data.org' )
|
|
155
204
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
|
156
205
|
|
|
@@ -162,6 +211,11 @@ class DiskCache
|
|
|
162
211
|
.gsub( '/', '~~' )
|
|
163
212
|
.gsub( '=', '~')
|
|
164
213
|
|
|
214
|
+
req_path = "#{req_path}.json"
|
|
215
|
+
elsif host_dir.index( 'api.cryptokitties.co' )
|
|
216
|
+
## for now always auto-add .json extensions e.g.
|
|
217
|
+
## kitties/1 => kitties/1.json
|
|
218
|
+
## cattributes => cattributes.json
|
|
165
219
|
req_path = "#{req_path}.json"
|
|
166
220
|
else
|
|
167
221
|
## no special rule
|
data/lib/webget/webget.rb
CHANGED
|
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
|
25
25
|
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
|
40
40
|
response
|
|
41
41
|
end # method self.call
|
|
42
42
|
|
|
43
|
-
|
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
|
47
47
|
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
|
49
49
|
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
|
52
|
-
Webcache.record( url, response
|
|
52
|
+
Webcache.record( url, response,
|
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
|
53
54
|
else
|
|
54
55
|
## todo/check - log error
|
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
|
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
|
|
|
60
61
|
response
|
|
61
62
|
end # method self.page
|
|
62
63
|
|
|
64
|
+
|
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
|
68
|
+
|
|
69
|
+
response = Webclient.get( url, headers: headers )
|
|
70
|
+
|
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
|
74
|
+
Webcache.record( url, response,
|
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
|
76
|
+
format: 'txt' )
|
|
77
|
+
else
|
|
78
|
+
## todo/check - log error
|
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
## to be done / continued
|
|
84
|
+
response
|
|
85
|
+
end # method self.text
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
|
90
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
|
91
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
|
92
|
+
puts " sleep #{config.sleep} sec(s)..."
|
|
93
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
|
94
|
+
|
|
95
|
+
response = Webclient.get( url, headers: headers )
|
|
96
|
+
|
|
97
|
+
if response.status.ok? ## must be HTTP 200
|
|
98
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
99
|
+
Webcache.record( url, response,
|
|
100
|
+
encoding: encoding,
|
|
101
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
|
102
|
+
else
|
|
103
|
+
## todo/check - log error
|
|
104
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
|
105
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
## to be done / continued
|
|
109
|
+
response
|
|
110
|
+
end # method self.dataset
|
|
111
|
+
|
|
112
|
+
|
|
63
113
|
end # class Webget
|
|
64
114
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: webget
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.2.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gerald Bauer
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-02-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: webclient
|
|
@@ -16,14 +16,28 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.
|
|
19
|
+
version: 0.2.0
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.
|
|
26
|
+
version: 0.2.0
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: csvreader
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 1.2.4
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: 1.2.4
|
|
27
41
|
- !ruby/object:Gem::Dependency
|
|
28
42
|
name: rdoc
|
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -79,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
|
|
|
79
93
|
licenses:
|
|
80
94
|
- Public Domain
|
|
81
95
|
metadata: {}
|
|
82
|
-
post_install_message:
|
|
96
|
+
post_install_message:
|
|
83
97
|
rdoc_options:
|
|
84
98
|
- "--main"
|
|
85
99
|
- README.md
|
|
@@ -96,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
96
110
|
- !ruby/object:Gem::Version
|
|
97
111
|
version: '0'
|
|
98
112
|
requirements: []
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
signing_key:
|
|
113
|
+
rubygems_version: 3.1.4
|
|
114
|
+
signing_key:
|
|
102
115
|
specification_version: 4
|
|
103
116
|
summary: webget gem - a web (go get) crawler incl. web cache
|
|
104
117
|
test_files: []
|