webget 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1e55270df44760e784f72743f62bed6e23f7c4f4
4
- data.tar.gz: f2e529e98e53b294813fc00f8a14ba7c4d6bc58c
3
+ metadata.gz: 199c22178984860a75e48ecc68db3be4f3e1f210
4
+ data.tar.gz: ca9268c0e0650acee9125f333d567c8c3d65e3e2
5
5
  SHA512:
6
- metadata.gz: a074c8c43fae86befca8c16b687a7701ebc7ae9e7e4a6ad3cee527178bd01a447440f1cbfaad6ff60f708fad43a5d70be90de9f20e576f8532658d3a47d08a5d
7
- data.tar.gz: 968ae57d7a26b40f31ae17df384eb72f69fc83f2192c398a9982ea0549800c137bccc7bca0e4af669ee92aef3af7bccd43b48a6ee7d337d845f726d05dc68c46
6
+ metadata.gz: 02e990893588bfef74a704a1de9b0c361e184759ba81f5579aa6724254dd8321bdb900ae4c50556c9b4f6f7cd02ea0f7c3e13ce16d7eb4da36f679d4285bc134
7
+ data.tar.gz: 0ddefec72ef2900b43dc4160ddb97b1a94640c08fefcb1d4c0f5656e93d46e106ac6768ed030bbe905cbf03db0d4532ee89fcb87d956b894de2e2649e37db1cc
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'webget' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['webclient', '>= 0.1.0']
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
22
23
  ]
23
24
 
24
25
  self.licenses = ['Public Domain']
@@ -1,5 +1,9 @@
1
1
  require 'webclient'
2
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
3
7
  ## our own code
4
8
  require 'webget/version' # let version go first
5
9
  require 'webget/webcache'
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 1
6
+ PATCH = 2
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,18 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response, encoding: 'UTF-8', format: 'html' )
58
+ cache.record( url, response, encoding: encoding, format: format );
59
59
  end
60
60
  def self.cached?( url ) cache.cached?( url ); end
61
61
  class << self
62
62
  alias_method :exist?, :cached?
63
63
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
64
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
65
  def self.read( url ) cache.read( url ); end
66
66
  def self.read_json( url ) cache.read_json( url ); end
67
+ def self.read_csv( url ) cache.read_csv( url ); end
68
+
67
69
 
68
70
 
69
71
  class DiskCache
@@ -86,11 +88,17 @@ class DiskCache
86
88
  data
87
89
  end
88
90
 
91
+ def read_csv( url )
92
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
93
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
94
+ data = CsvHash.parse( txt )
95
+ data
96
+ end
89
97
 
90
98
 
91
99
  ## add more save / put / etc. aliases - why? why not?
92
100
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
101
+ def record( url, response, encoding: 'UTF-8', format: 'html' )
94
102
 
95
103
  body_path = "#{Webcache.root}/#{url_to_path( url )}"
96
104
  meta_path = "#{body_path}.meta.txt"
@@ -102,14 +110,22 @@ class DiskCache
102
110
  puts "[cache] saving #{body_path}..."
103
111
 
104
112
  ## todo/check: verify content-type - why? why not?
113
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
105
114
  if format == 'json'
106
115
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
116
+ elsif format == 'csv'
117
+ ## fix: newlines - always use "unix" style - why? why not?
118
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
119
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
107
120
  else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
121
+ text = response.text( encoding: encoding )
122
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
123
  end
111
124
 
125
+
112
126
  File.open( meta_path, 'w:utf-8' ) do |f|
127
+ ## todo/check:
128
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
129
  response.headers.each do |key, value| # iterate all response headers
114
130
  f.write( "#{key}: #{value}" )
115
131
  f.write( "\n" )
@@ -164,6 +180,9 @@ class DiskCache
164
180
  .gsub( '=', '~')
165
181
 
166
182
  req_path = "#{req_path}.html"
183
+ elsif host_dir.index( 'football-data.co.uk' )
184
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
185
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
167
186
  elsif host_dir.index( 'football-data.org' )
168
187
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
169
188
 
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,30 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ ## todo/check: rename to csv or file or records or - why? why not?
66
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
67
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
68
+ puts " sleep #{config.sleep} sec(s)..."
69
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
70
+
71
+ response = Webclient.get( url, headers: headers )
72
+
73
+ if response.status.ok? ## must be HTTP 200
74
+ puts "#{response.status.code} #{response.status.message}"
75
+ Webcache.record( url, response,
76
+ encoding: encoding,
77
+ format: 'csv' ) ## pass along csv format - why? why not?
78
+ else
79
+ ## todo/check - log error
80
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
81
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
82
+ end
83
+
84
+ ## to be done / continued
85
+ response
86
+ end # method self.dataset
87
+
88
+
63
89
  end # class Webget
64
90
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-07 00:00:00.000000000 Z
11
+ date: 2020-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.1.0
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.1.0
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rdoc
29
43
  requirement: !ruby/object:Gem::Requirement