webget 0.2.0 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: fbf6e29e3e80b0a0fe7b80ad8a4870c00601006c
4
- data.tar.gz: c03efecb7b2362efef7392f26293c921f8422eca
2
+ SHA256:
3
+ metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
4
+ data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
5
5
  SHA512:
6
- metadata.gz: a3d026c4b3b6932eca4b22377ce1da9b08bd7836713c4e69c6a5fd28ba8de52449b28bca2be877e759eb73a30fb17b62966b779bed25a26fd072913abd284120
7
- data.tar.gz: 5f7822300304672b181a3001fb5d5e4cb3ad2adc55853179895ba100e03b21a8ede90a800ad2cced34e3b13603162d22876acaf34abfabe61c665f1d821f0455
6
+ metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
7
+ data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'webget' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['webclient', '>= 0.1.0']
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
22
23
  ]
23
24
 
24
25
  self.licenses = ['Public Domain']
data/lib/webget.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  require 'webclient'
2
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
3
7
  ## our own code
4
8
  require 'webget/version' # let version go first
5
9
  require 'webget/webcache'
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 0
6
+ PATCH = 5
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,24 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
62
68
  alias_method :exist?, :cached?
63
69
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
70
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
71
  def self.read( url ) cache.read( url ); end
66
72
  def self.read_json( url ) cache.read_json( url ); end
73
+ def self.read_csv( url ) cache.read_csv( url ); end
74
+
67
75
 
68
76
 
69
77
  class DiskCache
@@ -86,13 +94,22 @@ class DiskCache
86
94
  data
87
95
  end
88
96
 
97
+ def read_csv( url )
98
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
+ data = CsvHash.parse( txt )
101
+ data
102
+ end
89
103
 
90
104
 
91
105
  ## add more save / put / etc. aliases - why? why not?
92
106
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
94
111
 
95
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
96
113
  meta_path = "#{body_path}.meta.txt"
97
114
 
98
115
  ## make sure path exists
@@ -102,14 +119,23 @@ class DiskCache
102
119
  puts "[cache] saving #{body_path}..."
103
120
 
104
121
  ## todo/check: verify content-type - why? why not?
122
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
105
123
  if format == 'json'
106
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
107
- else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
125
+ elsif format == 'csv'
126
+ ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
128
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
+ else ## html or txt
131
+ text = response.text( encoding: encoding )
132
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
133
  end
111
134
 
135
+
112
136
  File.open( meta_path, 'w:utf-8' ) do |f|
137
+ ## todo/check:
138
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
139
  response.headers.each do |key, value| # iterate all response headers
114
140
  f.write( "#{key}: #{value}" )
115
141
  f.write( "\n" )
@@ -125,7 +151,7 @@ class DiskCache
125
151
 
126
152
 
127
153
  ### helpers
128
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
129
155
  ## map url to file path
130
156
  uri = URI.parse( str )
131
157
 
@@ -134,10 +160,14 @@ class DiskCache
134
160
  ## always downcase for now (internet domain is case insensitive)
135
161
  host_dir = uri.host.downcase
136
162
 
137
- ## "/this/is/everything?query=params"
138
- ## cut-off leading slash and
139
- ## convert query ? =
140
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
141
171
 
142
172
 
143
173
 
@@ -151,6 +181,25 @@ class DiskCache
151
181
  puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
152
182
  exit 1
153
183
  end
184
+ elsif host_dir.index( 'tipp3.at' )
185
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
+
187
+ ## change ? to -I-
188
+ ## change = to ~
189
+ ## Example:
190
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
+ ## =>
192
+ ## sportwetten/classicresults-I-oddsetProgramID~888
193
+ req_path = req_path.gsub( '?', '-I-' )
194
+ .gsub( '=', '~')
195
+
196
+ req_path = "#{req_path}.html"
197
+ elsif host_dir.index( 'fbref.com' )
198
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
+ req_path = "#{req_path}.html" # auto-add html extension
200
+ elsif host_dir.index( 'football-data.co.uk' )
201
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
154
203
  elsif host_dir.index( 'football-data.org' )
155
204
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
156
205
 
@@ -162,6 +211,11 @@ class DiskCache
162
211
  .gsub( '/', '~~' )
163
212
  .gsub( '=', '~')
164
213
 
214
+ req_path = "#{req_path}.json"
215
+ elsif host_dir.index( 'api.cryptokitties.co' )
216
+ ## for now always auto-add .json extensions e.g.
217
+ ## kitties/1 => kitties/1.json
218
+ ## cattributes => cattributes.json
165
219
  req_path = "#{req_path}.json"
166
220
  else
167
221
  ## no special rule
data/lib/webget/webget.rb CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
89
+ ## todo/check: rename to csv or file or records or - why? why not?
90
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
91
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
+ puts " sleep #{config.sleep} sec(s)..."
93
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
+
95
+ response = Webclient.get( url, headers: headers )
96
+
97
+ if response.status.ok? ## must be HTTP 200
98
+ puts "#{response.status.code} #{response.status.message}"
99
+ Webcache.record( url, response,
100
+ encoding: encoding,
101
+ format: 'csv' ) ## pass along csv format - why? why not?
102
+ else
103
+ ## todo/check - log error
104
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
+ end
107
+
108
+ ## to be done / continued
109
+ response
110
+ end # method self.dataset
111
+
112
+
63
113
  end # class Webget
64
114
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2021-02-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.1.0
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.1.0
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rdoc
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -79,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
79
93
  licenses:
80
94
  - Public Domain
81
95
  metadata: {}
82
- post_install_message:
96
+ post_install_message:
83
97
  rdoc_options:
84
98
  - "--main"
85
99
  - README.md
@@ -96,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
96
110
  - !ruby/object:Gem::Version
97
111
  version: '0'
98
112
  requirements: []
99
- rubyforge_project:
100
- rubygems_version: 2.5.2
101
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
102
115
  specification_version: 4
103
116
  summary: webget gem - a web (go get) crawler incl. web cache
104
117
  test_files: []