webget 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1e55270df44760e784f72743f62bed6e23f7c4f4
4
- data.tar.gz: f2e529e98e53b294813fc00f8a14ba7c4d6bc58c
3
+ metadata.gz: 199c22178984860a75e48ecc68db3be4f3e1f210
4
+ data.tar.gz: ca9268c0e0650acee9125f333d567c8c3d65e3e2
5
5
  SHA512:
6
- metadata.gz: a074c8c43fae86befca8c16b687a7701ebc7ae9e7e4a6ad3cee527178bd01a447440f1cbfaad6ff60f708fad43a5d70be90de9f20e576f8532658d3a47d08a5d
7
- data.tar.gz: 968ae57d7a26b40f31ae17df384eb72f69fc83f2192c398a9982ea0549800c137bccc7bca0e4af669ee92aef3af7bccd43b48a6ee7d337d845f726d05dc68c46
6
+ metadata.gz: 02e990893588bfef74a704a1de9b0c361e184759ba81f5579aa6724254dd8321bdb900ae4c50556c9b4f6f7cd02ea0f7c3e13ce16d7eb4da36f679d4285bc134
7
+ data.tar.gz: 0ddefec72ef2900b43dc4160ddb97b1a94640c08fefcb1d4c0f5656e93d46e106ac6768ed030bbe905cbf03db0d4532ee89fcb87d956b894de2e2649e37db1cc
data/Rakefile CHANGED
@@ -18,7 +18,8 @@ Hoe.spec 'webget' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['webclient', '>= 0.1.0']
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
22
23
  ]
23
24
 
24
25
  self.licenses = ['Public Domain']
@@ -1,5 +1,9 @@
1
1
  require 'webclient'
2
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
3
7
  ## our own code
4
8
  require 'webget/version' # let version go first
5
9
  require 'webget/webcache'
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 1
6
+ PATCH = 2
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,18 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response, encoding: 'UTF-8', format: 'html' )
58
+ cache.record( url, response, encoding: encoding, format: format );
59
59
  end
60
60
  def self.cached?( url ) cache.cached?( url ); end
61
61
  class << self
62
62
  alias_method :exist?, :cached?
63
63
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
64
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
65
  def self.read( url ) cache.read( url ); end
66
66
  def self.read_json( url ) cache.read_json( url ); end
67
+ def self.read_csv( url ) cache.read_csv( url ); end
68
+
67
69
 
68
70
 
69
71
  class DiskCache
@@ -86,11 +88,17 @@ class DiskCache
86
88
  data
87
89
  end
88
90
 
91
+ def read_csv( url )
92
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
93
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
94
+ data = CsvHash.parse( txt )
95
+ data
96
+ end
89
97
 
90
98
 
91
99
  ## add more save / put / etc. aliases - why? why not?
92
100
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
101
+ def record( url, response, encoding: 'UTF-8', format: 'html' )
94
102
 
95
103
  body_path = "#{Webcache.root}/#{url_to_path( url )}"
96
104
  meta_path = "#{body_path}.meta.txt"
@@ -102,14 +110,22 @@ class DiskCache
102
110
  puts "[cache] saving #{body_path}..."
103
111
 
104
112
  ## todo/check: verify content-type - why? why not?
113
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
105
114
  if format == 'json'
106
115
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
116
+ elsif format == 'csv'
117
+ ## fix: newlines - always use "unix" style - why? why not?
118
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
119
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
107
120
  else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
121
+ text = response.text( encoding: encoding )
122
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
123
  end
111
124
 
125
+
112
126
  File.open( meta_path, 'w:utf-8' ) do |f|
127
+ ## todo/check:
128
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
129
  response.headers.each do |key, value| # iterate all response headers
114
130
  f.write( "#{key}: #{value}" )
115
131
  f.write( "\n" )
@@ -164,6 +180,9 @@ class DiskCache
164
180
  .gsub( '=', '~')
165
181
 
166
182
  req_path = "#{req_path}.html"
183
+ elsif host_dir.index( 'football-data.co.uk' )
184
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
185
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
167
186
  elsif host_dir.index( 'football-data.org' )
168
187
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
169
188
 
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,30 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ ## todo/check: rename to csv or file or records or - why? why not?
66
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
67
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
68
+ puts " sleep #{config.sleep} sec(s)..."
69
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
70
+
71
+ response = Webclient.get( url, headers: headers )
72
+
73
+ if response.status.ok? ## must be HTTP 200
74
+ puts "#{response.status.code} #{response.status.message}"
75
+ Webcache.record( url, response,
76
+ encoding: encoding,
77
+ format: 'csv' ) ## pass along csv format - why? why not?
78
+ else
79
+ ## todo/check - log error
80
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
81
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
82
+ end
83
+
84
+ ## to be done / continued
85
+ response
86
+ end # method self.dataset
87
+
88
+
63
89
  end # class Webget
64
90
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-07 00:00:00.000000000 Z
11
+ date: 2020-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -16,14 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 0.1.0
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.1.0
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rdoc
29
43
  requirement: !ruby/object:Gem::Requirement