webget 0.2.0 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: fbf6e29e3e80b0a0fe7b80ad8a4870c00601006c
4
- data.tar.gz: c03efecb7b2362efef7392f26293c921f8422eca
2
+ SHA256:
3
+ metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
4
+ data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
5
5
  SHA512:
6
- metadata.gz: a3d026c4b3b6932eca4b22377ce1da9b08bd7836713c4e69c6a5fd28ba8de52449b28bca2be877e759eb73a30fb17b62966b779bed25a26fd072913abd284120
7
- data.tar.gz: 5f7822300304672b181a3001fb5d5e4cb3ad2adc55853179895ba100e03b21a8ede90a800ad2cced34e3b13603162d22876acaf34abfabe61c665f1d821f0455
6
+ metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
7
+ data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'webget' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['webclient', '>= 0.1.0']
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
22
23
  ]
23
24
 
24
25
  self.licenses = ['Public Domain']
data/lib/webget.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  require 'webclient'
2
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
3
7
  ## our own code
4
8
  require 'webget/version' # let version go first
5
9
  require 'webget/webcache'
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 0
6
+ PATCH = 5
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,24 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
62
68
  alias_method :exist?, :cached?
63
69
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
70
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
71
  def self.read( url ) cache.read( url ); end
66
72
  def self.read_json( url ) cache.read_json( url ); end
73
+ def self.read_csv( url ) cache.read_csv( url ); end
74
+
67
75
 
68
76
 
69
77
  class DiskCache
@@ -86,13 +94,22 @@ class DiskCache
86
94
  data
87
95
  end
88
96
 
97
+ def read_csv( url )
98
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
+ data = CsvHash.parse( txt )
101
+ data
102
+ end
89
103
 
90
104
 
91
105
  ## add more save / put / etc. aliases - why? why not?
92
106
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
94
111
 
95
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
96
113
  meta_path = "#{body_path}.meta.txt"
97
114
 
98
115
  ## make sure path exists
@@ -102,14 +119,23 @@ class DiskCache
102
119
  puts "[cache] saving #{body_path}..."
103
120
 
104
121
  ## todo/check: verify content-type - why? why not?
122
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
105
123
  if format == 'json'
106
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
107
- else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
125
+ elsif format == 'csv'
126
+ ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
128
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
+ else ## html or txt
131
+ text = response.text( encoding: encoding )
132
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
133
  end
111
134
 
135
+
112
136
  File.open( meta_path, 'w:utf-8' ) do |f|
137
+ ## todo/check:
138
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
139
  response.headers.each do |key, value| # iterate all response headers
114
140
  f.write( "#{key}: #{value}" )
115
141
  f.write( "\n" )
@@ -125,7 +151,7 @@ class DiskCache
125
151
 
126
152
 
127
153
  ### helpers
128
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
129
155
  ## map url to file path
130
156
  uri = URI.parse( str )
131
157
 
@@ -134,10 +160,14 @@ class DiskCache
134
160
  ## always downcase for now (internet domain is case insensitive)
135
161
  host_dir = uri.host.downcase
136
162
 
137
- ## "/this/is/everything?query=params"
138
- ## cut-off leading slash and
139
- ## convert query ? =
140
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
141
171
 
142
172
 
143
173
 
@@ -151,6 +181,25 @@ class DiskCache
151
181
  puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
152
182
  exit 1
153
183
  end
184
+ elsif host_dir.index( 'tipp3.at' )
185
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
+
187
+ ## change ? to -I-
188
+ ## change = to ~
189
+ ## Example:
190
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
+ ## =>
192
+ ## sportwetten/classicresults-I-oddsetProgramID~888
193
+ req_path = req_path.gsub( '?', '-I-' )
194
+ .gsub( '=', '~')
195
+
196
+ req_path = "#{req_path}.html"
197
+ elsif host_dir.index( 'fbref.com' )
198
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
+ req_path = "#{req_path}.html" # auto-add html extension
200
+ elsif host_dir.index( 'football-data.co.uk' )
201
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
154
203
  elsif host_dir.index( 'football-data.org' )
155
204
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
156
205
 
@@ -162,6 +211,11 @@ class DiskCache
162
211
  .gsub( '/', '~~' )
163
212
  .gsub( '=', '~')
164
213
 
214
+ req_path = "#{req_path}.json"
215
+ elsif host_dir.index( 'api.cryptokitties.co' )
216
+ ## for now always auto-add .json extensions e.g.
217
+ ## kitties/1 => kitties/1.json
218
+ ## cattributes => cattributes.json
165
219
  req_path = "#{req_path}.json"
166
220
  else
167
221
  ## no special rule
data/lib/webget/webget.rb CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
89
+ ## todo/check: rename to csv or file or records or - why? why not?
90
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
91
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
+ puts " sleep #{config.sleep} sec(s)..."
93
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
+
95
+ response = Webclient.get( url, headers: headers )
96
+
97
+ if response.status.ok? ## must be HTTP 200
98
+ puts "#{response.status.code} #{response.status.message}"
99
+ Webcache.record( url, response,
100
+ encoding: encoding,
101
+ format: 'csv' ) ## pass along csv format - why? why not?
102
+ else
103
+ ## todo/check - log error
104
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
+ end
107
+
108
+ ## to be done / continued
109
+ response
110
+ end # method self.dataset
111
+
112
+
63
113
  end # class Webget
64
114
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2021-02-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.1.0
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.1.0
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rdoc
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -79,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
79
93
  licenses:
80
94
  - Public Domain
81
95
  metadata: {}
82
- post_install_message:
96
+ post_install_message:
83
97
  rdoc_options:
84
98
  - "--main"
85
99
  - README.md
@@ -96,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
96
110
  - !ruby/object:Gem::Version
97
111
  version: '0'
98
112
  requirements: []
99
- rubyforge_project:
100
- rubygems_version: 2.5.2
101
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
102
115
  specification_version: 4
103
116
  summary: webget gem - a web (go get) crawler incl. web cache
104
117
  test_files: []