webget 0.2.0 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Rakefile +2 -1
- data/lib/webget.rb +4 -0
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +67 -13
- data/lib/webget/webget.rb +54 -4
- metadata +22 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
|
4
|
+
data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
|
7
|
+
data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
|
data/Rakefile
CHANGED
data/lib/webget.rb
CHANGED
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,16 +54,24 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response,
|
58
|
-
|
57
|
+
def self.record( url, response,
|
58
|
+
path: nil,
|
59
|
+
encoding: 'UTF-8',
|
60
|
+
format: 'html' )
|
61
|
+
cache.record( url, response,
|
62
|
+
path: path,
|
63
|
+
encoding: encoding,
|
64
|
+
format: format );
|
59
65
|
end
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
61
67
|
class << self
|
62
68
|
alias_method :exist?, :cached?
|
63
69
|
end
|
64
|
-
def self.url_to_id( url )
|
70
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
65
71
|
def self.read( url ) cache.read( url ); end
|
66
72
|
def self.read_json( url ) cache.read_json( url ); end
|
73
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
74
|
+
|
67
75
|
|
68
76
|
|
69
77
|
class DiskCache
|
@@ -86,13 +94,22 @@ class DiskCache
|
|
86
94
|
data
|
87
95
|
end
|
88
96
|
|
97
|
+
def read_csv( url )
|
98
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
99
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
100
|
+
data = CsvHash.parse( txt )
|
101
|
+
data
|
102
|
+
end
|
89
103
|
|
90
104
|
|
91
105
|
## add more save / put / etc. aliases - why? why not?
|
92
106
|
## rename to record_html - why? why not?
|
93
|
-
def record( url, response,
|
107
|
+
def record( url, response,
|
108
|
+
path: nil,
|
109
|
+
encoding: 'UTF-8',
|
110
|
+
format: 'html' )
|
94
111
|
|
95
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
96
113
|
meta_path = "#{body_path}.meta.txt"
|
97
114
|
|
98
115
|
## make sure path exists
|
@@ -102,14 +119,23 @@ class DiskCache
|
|
102
119
|
puts "[cache] saving #{body_path}..."
|
103
120
|
|
104
121
|
## todo/check: verify content-type - why? why not?
|
122
|
+
## note - for now response.text is always assumed to be (converted to) utf8!!!
|
105
123
|
if format == 'json'
|
106
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
107
|
-
|
108
|
-
##
|
109
|
-
|
125
|
+
elsif format == 'csv'
|
126
|
+
## fix: newlines - always use "unix" style - why? why not?
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
128
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
129
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
130
|
+
else ## html or txt
|
131
|
+
text = response.text( encoding: encoding )
|
132
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
110
133
|
end
|
111
134
|
|
135
|
+
|
112
136
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
137
|
+
## todo/check:
|
138
|
+
## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
|
113
139
|
response.headers.each do |key, value| # iterate all response headers
|
114
140
|
f.write( "#{key}: #{value}" )
|
115
141
|
f.write( "\n" )
|
@@ -125,7 +151,7 @@ class DiskCache
|
|
125
151
|
|
126
152
|
|
127
153
|
### helpers
|
128
|
-
def url_to_path( str )
|
154
|
+
def url_to_path( str, path: nil )
|
129
155
|
## map url to file path
|
130
156
|
uri = URI.parse( str )
|
131
157
|
|
@@ -134,10 +160,14 @@ class DiskCache
|
|
134
160
|
## always downcase for now (internet domain is case insensitive)
|
135
161
|
host_dir = uri.host.downcase
|
136
162
|
|
137
|
-
## "
|
138
|
-
|
139
|
-
|
140
|
-
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
164
|
+
path
|
165
|
+
else
|
166
|
+
## "/this/is/everything?query=params"
|
167
|
+
## cut-off leading slash and
|
168
|
+
## convert query ? =
|
169
|
+
uri.request_uri[1..-1]
|
170
|
+
end
|
141
171
|
|
142
172
|
|
143
173
|
|
@@ -151,6 +181,25 @@ class DiskCache
|
|
151
181
|
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
152
182
|
exit 1
|
153
183
|
end
|
184
|
+
elsif host_dir.index( 'tipp3.at' )
|
185
|
+
req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
|
186
|
+
|
187
|
+
## change ? to -I-
|
188
|
+
## change = to ~
|
189
|
+
## Example:
|
190
|
+
## sportwetten/classicresults.jsp?oddsetProgramID=888
|
191
|
+
## =>
|
192
|
+
## sportwetten/classicresults-I-oddsetProgramID~888
|
193
|
+
req_path = req_path.gsub( '?', '-I-' )
|
194
|
+
.gsub( '=', '~')
|
195
|
+
|
196
|
+
req_path = "#{req_path}.html"
|
197
|
+
elsif host_dir.index( 'fbref.com' )
|
198
|
+
req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
|
199
|
+
req_path = "#{req_path}.html" # auto-add html extension
|
200
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
201
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
202
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
154
203
|
elsif host_dir.index( 'football-data.org' )
|
155
204
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
156
205
|
|
@@ -162,6 +211,11 @@ class DiskCache
|
|
162
211
|
.gsub( '/', '~~' )
|
163
212
|
.gsub( '=', '~')
|
164
213
|
|
214
|
+
req_path = "#{req_path}.json"
|
215
|
+
elsif host_dir.index( 'api.cryptokitties.co' )
|
216
|
+
## for now always auto-add .json extensions e.g.
|
217
|
+
## kitties/1 => kitties/1.json
|
218
|
+
## cattributes => cattributes.json
|
165
219
|
req_path = "#{req_path}.json"
|
166
220
|
else
|
167
221
|
## no special rule
|
data/lib/webget/webget.rb
CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
19
19
|
|
20
20
|
|
21
21
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
25
25
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
40
40
|
response
|
41
41
|
end # method self.call
|
42
42
|
|
43
|
-
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
47
47
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
49
49
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response
|
52
|
+
Webcache.record( url, response,
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
53
54
|
else
|
54
55
|
## todo/check - log error
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
|
|
60
61
|
response
|
61
62
|
end # method self.page
|
62
63
|
|
64
|
+
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
68
|
+
|
69
|
+
response = Webclient.get( url, headers: headers )
|
70
|
+
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
74
|
+
Webcache.record( url, response,
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
76
|
+
format: 'txt' )
|
77
|
+
else
|
78
|
+
## todo/check - log error
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
81
|
+
end
|
82
|
+
|
83
|
+
## to be done / continued
|
84
|
+
response
|
85
|
+
end # method self.text
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
90
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
91
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
92
|
+
puts " sleep #{config.sleep} sec(s)..."
|
93
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
94
|
+
|
95
|
+
response = Webclient.get( url, headers: headers )
|
96
|
+
|
97
|
+
if response.status.ok? ## must be HTTP 200
|
98
|
+
puts "#{response.status.code} #{response.status.message}"
|
99
|
+
Webcache.record( url, response,
|
100
|
+
encoding: encoding,
|
101
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
102
|
+
else
|
103
|
+
## todo/check - log error
|
104
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
105
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
106
|
+
end
|
107
|
+
|
108
|
+
## to be done / continued
|
109
|
+
response
|
110
|
+
end # method self.dataset
|
111
|
+
|
112
|
+
|
63
113
|
end # class Webget
|
64
114
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -16,14 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csvreader
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.4
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rdoc
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -79,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
|
|
79
93
|
licenses:
|
80
94
|
- Public Domain
|
81
95
|
metadata: {}
|
82
|
-
post_install_message:
|
96
|
+
post_install_message:
|
83
97
|
rdoc_options:
|
84
98
|
- "--main"
|
85
99
|
- README.md
|
@@ -96,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
110
|
- !ruby/object:Gem::Version
|
97
111
|
version: '0'
|
98
112
|
requirements: []
|
99
|
-
|
100
|
-
|
101
|
-
signing_key:
|
113
|
+
rubygems_version: 3.1.4
|
114
|
+
signing_key:
|
102
115
|
specification_version: 4
|
103
116
|
summary: webget gem - a web (go get) crawler incl. web cache
|
104
117
|
test_files: []
|