webget 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
4
- data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
3
+ metadata.gz: de0060acabe176d1a11a4f2e36e8bee5090083c4907b75d9b22e9cbb9c7b22e2
4
+ data.tar.gz: fb64c2b1294932b00fa401af006cd27b263b255789bad75e4ef62bec5598fd74
5
5
  SHA512:
6
- metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
7
- data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
6
+ metadata.gz: d1f1653c68729e7d609c3c848e0146de9668cd2932ddab2e794ab361d3f786603a10c7586bcf0029de4955255803987000b2fb35b0d1e14cfa35d0582f919be7
7
+ data.tar.gz: c6cff08b2f683bb5b8e39607735250f954115244be3e19dfcdeb29fd2d0a0f0a27be1fed5a2add9a1b75a9cbe81dbafea1e80f06b0a1bdd93d4bd3bd12d5a9a1
data/CHANGELOG.md CHANGED
@@ -1,4 +1,6 @@
1
- ### 0.0.1 / 2020-10-04
2
-
3
- * Everything is new. First release.
4
-
1
+ ### 0.3.1
2
+
3
+ ### 0.0.1 / 2020-10-04
4
+
5
+ * Everything is new. First release.
6
+
data/README.md CHANGED
@@ -1,20 +1,19 @@
1
- # webget
2
-
3
- webget gem - a web (go get) crawler incl. web cache
4
-
5
- * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
- * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
- * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
- * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
- * forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
10
-
11
-
12
- ## Usage
13
-
14
- TBD
15
-
16
-
17
- ## License
18
-
19
- The `webget` scripts are dedicated to the public domain.
20
- Use it as you please with no restrictions whatsoever.
1
+ # webget
2
+
3
+ webget gem - a web (go get) crawler incl. web cache
4
+
5
+ * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
+ * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
+ * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
+ * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
+
10
+
11
+ ## Usage
12
+
13
+ TBD
14
+
15
+
16
+ ## License
17
+
18
+ The `webget` scripts are dedicated to the public domain.
19
+ Use it as you please with no restrictions whatsoever.
data/Rakefile CHANGED
@@ -1,31 +1,32 @@
1
- require 'hoe'
2
- require './lib/webget/version.rb'
3
-
4
- Hoe.spec 'webget' do
5
-
6
- self.version = Webget::VERSION
7
-
8
- self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
- self.description = summary
10
-
11
- self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
-
13
- self.author = 'Gerald Bauer'
14
- self.email = 'ruby-talk@ruby-lang.org'
15
-
16
- # switch extension to .markdown for gihub formatting
17
- self.readme_file = 'README.md'
18
- self.history_file = 'CHANGELOG.md'
19
-
20
- self.extra_deps = [
21
- ['webclient', '>= 0.2.0'],
22
- ['csvreader', '>= 1.2.4'],
23
- ]
24
-
25
- self.licenses = ['Public Domain']
26
-
27
- self.spec_extras = {
28
- required_ruby_version: '>= 2.2.2'
29
- }
30
-
31
- end
1
+ require 'hoe'
2
+ require './lib/webget/version.rb'
3
+
4
+ Hoe.spec 'webget' do
5
+
6
+ self.version = Webget::VERSION
7
+
8
+ self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
+ self.description = summary
10
+
11
+ self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'gerald.bauer@gmail.com'
15
+
16
+ # switch extension to .markdown for gihub formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'CHANGELOG.md'
19
+
20
+ self.extra_deps = [
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
23
+ ['cocos'],
24
+ ]
25
+
26
+ self.licenses = ['Public Domain']
27
+
28
+ self.spec_extras = {
29
+ required_ruby_version: '>= 2.2.2'
30
+ }
31
+
32
+ end
data/lib/webget/version.rb CHANGED
@@ -1,23 +1,21 @@
1
-
2
- class Webget
3
-
4
- MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
- MINOR = 2
6
- PATCH = 5
7
- VERSION = [MAJOR,MINOR,PATCH].join('.')
8
-
9
- def self.version
10
- VERSION
11
- end
12
-
13
- # version string for generator meta tag (includes ruby version)
14
- def self.banner
15
- "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end # module Webget
23
-
1
+
2
+ class Webget
3
+ MAJOR = 0 ## todo: namespace inside version or something - why? why not??
4
+ MINOR = 3
5
+ PATCH = 1
6
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
7
+
8
+ def self.version
9
+ VERSION
10
+ end
11
+
12
+ # version string for generator meta tag (includes ruby version)
13
+ def self.banner
14
+ "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
15
+ end
16
+
17
+ def self.root
18
+ File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
19
+ end
20
+ end # module Webget
21
+
data/lib/webget/webcache.rb CHANGED
@@ -1,230 +1,234 @@
1
-
2
-
3
- module Webcache
4
-
5
- #####
6
- # copied from props gem, see Env.home
7
- # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
- # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
- def self.home
10
- path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
- ENV['HOME'] || ENV['USERPROFILE']
12
- elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
- "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
- else
15
- begin
16
- File.expand_path('~')
17
- rescue
18
- if File::ALT_SEPARATOR
19
- 'C:/'
20
- else
21
- '/'
22
- end
23
- end
24
- end
25
-
26
- ## note: use File.expand_path to "unify" path e.g
27
- ## C:\Users\roman becomes
28
- ## C:/Users/roman
29
-
30
- File.expand_path( path )
31
- end
32
-
33
-
34
- class Configuration
35
- ## root directory - todo/check: find/use a better name - why? why not?
36
- def root() @root || "#{Webcache.home}/.cache"; end
37
- def root=(value) @root = value; end
38
- end # class Configuration
39
-
40
-
41
- ## lets you use
42
- ## Webcache.configure do |config|
43
- ## config.root = './cache'
44
- ## end
45
- def self.configure() yield( config ); end
46
- def self.config() @config ||= Configuration.new; end
47
-
48
-
49
- ## add "high level" root convenience helpers
50
- def self.root() config.root; end
51
- def self.root=(value) config.root = value; end
52
-
53
-
54
- ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
- def self.cache() @cache ||= DiskCache.new; end
56
-
57
- def self.record( url, response,
58
- path: nil,
59
- encoding: 'UTF-8',
60
- format: 'html' )
61
- cache.record( url, response,
62
- path: path,
63
- encoding: encoding,
64
- format: format );
65
- end
66
- def self.cached?( url ) cache.cached?( url ); end
67
- class << self
68
- alias_method :exist?, :cached?
69
- end
70
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
71
- def self.read( url ) cache.read( url ); end
72
- def self.read_json( url ) cache.read_json( url ); end
73
- def self.read_csv( url ) cache.read_csv( url ); end
74
-
75
-
76
-
77
- class DiskCache
78
- def cached?( url )
79
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
80
- File.exist?( body_path )
81
- end
82
- alias_method :exist?, :cached?
83
-
84
-
85
- def read( url )
86
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
87
- File.open( body_path, 'r:utf-8' ) {|f| f.read }
88
- end
89
-
90
- def read_json( url )
91
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
92
- txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
93
- data = JSON.parse( txt )
94
- data
95
- end
96
-
97
- def read_csv( url )
98
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
- txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
- data = CsvHash.parse( txt )
101
- data
102
- end
103
-
104
-
105
- ## add more save / put / etc. aliases - why? why not?
106
- ## rename to record_html - why? why not?
107
- def record( url, response,
108
- path: nil,
109
- encoding: 'UTF-8',
110
- format: 'html' )
111
-
112
- body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
113
- meta_path = "#{body_path}.meta.txt"
114
-
115
- ## make sure path exits
116
- FileUtils.mkdir_p( File.dirname( body_path ) )
117
-
118
-
119
- puts "[cache] saving #{body_path}..."
120
-
121
- ## todo/check: verify content-type - why? why not?
122
- ## note - for now respone.text always assume (converted) to utf8!!!!!!!!!
123
- if format == 'json'
124
- File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
125
- elsif format == 'csv'
126
- ## fix: newlines - always use "unix" style" - why? why not?
127
- ## fix: use :newline => :universal option? translates to univeral "\n"
128
- text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
- File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
- else ## html or txt
131
- text = response.text( encoding: encoding )
132
- File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
133
- end
134
-
135
-
136
- File.open( meta_path, 'w:utf-8' ) do |f|
137
- ## todo/check:
138
- ## do headers also need to converted (like text) if encoding is NOT utf-8 ???
139
- response.headers.each do |key, value| # iterate all response headers
140
- f.write( "#{key}: #{value}" )
141
- f.write( "\n" )
142
- end
143
- end
144
- end
145
-
146
-
147
-
148
- ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
149
- ## use file:// instead of disk:// - why? why not?
150
- def url_to_id( str ) "disk://#{url_to_path( str )}"; end
151
-
152
-
153
- ### helpers
154
- def url_to_path( str, path: nil )
155
- ## map url to file path
156
- uri = URI.parse( str )
157
-
158
- ## note: ignore scheme (e.g. http/https)
159
- ## and post (e.g. 80, 8080, etc.) for now
160
- ## always downcase for now (internet domain is case insensitive)
161
- host_dir = uri.host.downcase
162
-
163
- req_path = if path ## use "custom" (file)path for cache storage if passed in
164
- path
165
- else
166
- ## "/this/is/everything?query=params"
167
- ## cut-off leading slash and
168
- ## convert query ? =
169
- uri.request_uri[1..-1]
170
- end
171
-
172
-
173
-
174
- ### special "prettify" rule for weltfussball
175
- ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
176
- if host_dir.index( 'weltfussball.de' ) ||
177
- host_dir.index( 'worldfootball.net' )
178
- if req_path.end_with?( '/' )
179
- req_path = "#{req_path[0..-2]}.html"
180
- else
181
- puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
182
- exit 1
183
- end
184
- elsif host_dir.index( 'tipp3.at' )
185
- req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
-
187
- ## change ? to -I-
188
- ## change = to ~
189
- ## Example:
190
- ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
- ## =>
192
- ## sportwetten/classicresults-I-oddsetProgramID~888
193
- req_path = req_path.gsub( '?', '-I-' )
194
- .gsub( '=', '~')
195
-
196
- req_path = "#{req_path}.html"
197
- elsif host_dir.index( 'fbref.com' )
198
- req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
- req_path = "#{req_path}.html" # auto-add html extension
200
- elsif host_dir.index( 'football-data.co.uk' )
201
- req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
- req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
203
- elsif host_dir.index( 'football-data.org' )
204
- req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
205
-
206
- ## flattern - make a file path - for auto-save
207
- ## change ? to -I-
208
- ## change / to ~~
209
- ## change = to ~
210
- req_path = req_path.gsub( '?', '-I-' )
211
- .gsub( '/', '~~' )
212
- .gsub( '=', '~')
213
-
214
- req_path = "#{req_path}.json"
215
- elsif host_dir.index( 'api.cryptokitties.co' )
216
- ## for now always auto-add .json extensions e.g.
217
- ## kitties/1 => kitties/1.json
218
- ## cattributes => cattributes.json
219
- req_path = "#{req_path}.json"
220
- else
221
- ## no special rule
222
- end
223
-
224
- page_path = "#{host_dir}/#{req_path}"
225
- page_path
226
- end
227
- end # class DiskCache
228
-
229
-
230
- end # module Webcache
1
+
2
+
3
+ module Webcache
4
+
5
+ #####
6
+ # copied from props gem, see Env.home
7
+ # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
+ # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
+ def self.home
10
+ path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
+ ENV['HOME'] || ENV['USERPROFILE']
12
+ elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
+ "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
+ else
15
+ begin
16
+ File.expand_path('~')
17
+ rescue
18
+ if File::ALT_SEPARATOR
19
+ 'C:/'
20
+ else
21
+ '/'
22
+ end
23
+ end
24
+ end
25
+
26
+ ## note: use File.expand_path to "unify" path e.g
27
+ ## C:\Users\roman becomes
28
+ ## C:/Users/roman
29
+
30
+ File.expand_path( path )
31
+ end
32
+
33
+
34
+ class Configuration
35
+ ## root directory - todo/check: find/use a better name - why? why not?
36
+ def root() @root || "#{Webcache.home}/.cache"; end
37
+ def root=(value) @root = value; end
38
+ end # class Configuration
39
+
40
+
41
+ ## lets you use
42
+ ## Webcache.configure do |config|
43
+ ## config.root = './cache'
44
+ ## end
45
+ def self.configure() yield( config ); end
46
+ def self.config() @config ||= Configuration.new; end
47
+
48
+
49
+ ## add "high level" root convenience helpers
50
+ ## use delegate helper - why? why not?
51
+ def self.root() config.root; end
52
+ def self.root=(value) config.root = value; end
53
+
54
+
55
+ ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
56
+ def self.cache() @cache ||= DiskCache.new; end
57
+
58
+ def self.record( url, response,
59
+ path: nil,
60
+ encoding: 'UTF-8',
61
+ format: 'html' )
62
+ cache.record( url, response,
63
+ path: path,
64
+ encoding: encoding,
65
+ format: format );
66
+ end
67
+ def self.cached?( url ) cache.cached?( url ); end
68
+ class << self
69
+ alias_method :exist?, :cached?
70
+ end
71
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
72
+ def self.read( url ) cache.read( url ); end
73
+ def self.read_json( url ) cache.read_json( url ); end
74
+ def self.read_csv( url ) cache.read_csv( url ); end
75
+
76
+
77
+
78
+ class DiskCache
79
+ def cached?( url )
80
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
81
+ File.exist?( body_path )
82
+ end
83
+ alias_method :exist?, :cached?
84
+
85
+
86
+ def read( url )
87
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
88
+ File.open( body_path, 'r:utf-8' ) {|f| f.read }
89
+ end
90
+
91
+ def read_json( url )
92
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
93
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
94
+ data = JSON.parse( txt )
95
+ data
96
+ end
97
+
98
+ def read_csv( url )
99
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
100
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
101
+ data = CsvHash.parse( txt )
102
+ data
103
+ end
104
+
105
+
106
+ ## add more save / put / etc. aliases - why? why not?
107
+ ## rename to record_html - why? why not?
108
+ def record( url, response,
109
+ path: nil,
110
+ encoding: 'UTF-8',
111
+ format: 'html' )
112
+
113
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
114
+ meta_path = "#{body_path}.meta.txt"
115
+
116
+ ## make sure path exits
117
+ FileUtils.mkdir_p( File.dirname( body_path ) )
118
+
119
+
120
+ puts "[cache] saving #{body_path}..."
121
+
122
+ ## todo/check: verify content-type - why? why not?
123
+ ## note - for now respone.text always assume (converted) to utf8!!!!!!!!!
124
+ ##
125
+ ## fix: newlines - always use "unix" style" - why? why not?
126
+ ## fix: use :newline => :universal option? translates to univeral "\n"
127
+ if format == 'json'
128
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
129
+ elsif format == 'csv'
130
+ ## fix: newlines - always use "unix" style" - why? why not?
131
+ ## fix: use :newline => :universal option? translates to univeral "\n"
132
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
133
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
134
+ else ## html or txt
135
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
136
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
137
+ end
138
+
139
+
140
+ File.open( meta_path, 'w:utf-8' ) do |f|
141
+ ## todo/check:
142
+ ## do headers also need to converted (like text) if encoding is NOT utf-8 ???
143
+ response.headers.each do |key, value| # iterate all response headers
144
+ f.write( "#{key}: #{value}" )
145
+ f.write( "\n" )
146
+ end
147
+ end
148
+ end
149
+
150
+
151
+
152
+ ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
153
+ ## use file:// instead of disk:// - why? why not?
154
+ def url_to_id( str ) "disk://#{url_to_path( str )}"; end
155
+
156
+
157
+ ### helpers
158
+ def url_to_path( str, path: nil )
159
+ ## map url to file path
160
+ uri = URI.parse( str )
161
+
162
+ ## note: ignore scheme (e.g. http/https)
163
+ ## and post (e.g. 80, 8080, etc.) for now
164
+ ## always downcase for now (internet domain is case insensitive)
165
+ host_dir = uri.host.downcase
166
+
167
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
168
+ path
169
+ else
170
+ ## "/this/is/everything?query=params"
171
+ ## cut-off leading slash and
172
+ ## convert query ? =
173
+ uri.request_uri[1..-1]
174
+ end
175
+
176
+
177
+
178
+ ### special "prettify" rule for weltfussball
179
+ ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
180
+ if host_dir.index( 'weltfussball.de' ) ||
181
+ host_dir.index( 'worldfootball.net' )
182
+ if req_path.end_with?( '/' )
183
+ req_path = "#{req_path[0..-2]}.html"
184
+ else
185
+ puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
186
+ exit 1
187
+ end
188
+ elsif host_dir.index( 'tipp3.at' )
189
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
190
+
191
+ ## change ? to -I-
192
+ ## change = to ~
193
+ ## Example:
194
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
195
+ ## =>
196
+ ## sportwetten/classicresults-I-oddsetProgramID~888
197
+ req_path = req_path.gsub( '?', '-I-' )
198
+ .gsub( '=', '~')
199
+
200
+ req_path = "#{req_path}.html"
201
+ elsif host_dir.index( 'fbref.com' )
202
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
203
+ req_path = "#{req_path}.html" # auto-add html extension
204
+ elsif host_dir.index( 'football-data.co.uk' )
205
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
206
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
207
+ elsif host_dir.index( 'football-data.org' )
208
+ ## req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
209
+
210
+ ## flattern - make a file path - for auto-save
211
+ ## change ? to -I-
212
+ ## change / to ~~
213
+ ## change = to ~
214
+ req_path = req_path.gsub( '?', '-I-' )
215
+ .gsub( '/', '~~' )
216
+ .gsub( '=', '~')
217
+
218
+ req_path = "#{req_path}.json"
219
+ elsif host_dir.index( 'api.cryptokitties.co' )
220
+ ## for now always auto-add .json extensions e.g.
221
+ ## kitties/1 => kitties/1.json
222
+ ## cattributes => cattributes.json
223
+ req_path = "#{req_path}.json"
224
+ else
225
+ ## no special rule
226
+ end
227
+
228
+ page_path = "#{host_dir}/#{req_path}"
229
+ page_path
230
+ end
231
+ end # class DiskCache
232
+
233
+
234
+ end # module Webcache
data/lib/webget/webget.rb CHANGED
@@ -1,114 +1,124 @@
1
-
2
- class Webget # a web (go get) crawler
3
-
4
- class Configuration ## nested class
5
-
6
- #######################
7
- ## accessors
8
- def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
9
- def sleep=(value) @sleep = value; end
10
-
11
- end # (nested) class Configuration
12
-
13
- ## lets you use
14
- ## Webget.configure do |config|
15
- ## config.sleep = 10
16
- ## end
17
- def self.configure() yield( config ); end
18
- def self.config() @config ||= Configuration.new; end
19
-
20
-
21
-
22
- def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
- puts " sleep #{config.sleep} sec(s)..."
24
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
-
26
- response = Webclient.get( url, headers: headers )
27
-
28
- if response.status.ok? ## must be HTTP 200
29
- puts "#{response.status.code} #{response.status.message}"
30
- ## note: use format json for pretty printing and parse check!!!!
31
- Webcache.record( url, response,
32
- format: 'json' )
33
- else
34
- ## todo/check - log error
35
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
36
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
37
- end
38
-
39
- ## to be done / continued
40
- response
41
- end # method self.call
42
-
43
- ## todo/check: rename encoding to html/http-like charset - why? why not?
44
- def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
- puts " sleep #{config.sleep} sec(s)..."
46
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
-
48
- response = Webclient.get( url, headers: headers )
49
-
50
- if response.status.ok? ## must be HTTP 200
51
- puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response,
53
- encoding: encoding ) ## assumes format: html (default)
54
- else
55
- ## todo/check - log error
56
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
57
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
58
- end
59
-
60
- ## to be done / continued
61
- response
62
- end # method self.page
63
-
64
-
65
- def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
- puts " sleep #{config.sleep} sec(s)..."
67
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
-
69
- response = Webclient.get( url, headers: headers )
70
-
71
- if response.status.ok? ## must be HTTP 200
72
- puts "#{response.status.code} #{response.status.message}"
73
- ## note: like json assumes always utf-8 encoding for now !!!
74
- Webcache.record( url, response,
75
- path: path, ## optional "custom" (file)path for saving in cache
76
- format: 'txt' )
77
- else
78
- ## todo/check - log error
79
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
- end
82
-
83
- ## to be done / continued
84
- response
85
- end # method self.text
86
-
87
-
88
-
89
- ## todo/check: rename to csv or file or records or - why? why not?
90
- ## todo/check: rename encoding to html/http-like charset - why? why not?
91
- def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
- puts " sleep #{config.sleep} sec(s)..."
93
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
-
95
- response = Webclient.get( url, headers: headers )
96
-
97
- if response.status.ok? ## must be HTTP 200
98
- puts "#{response.status.code} #{response.status.message}"
99
- Webcache.record( url, response,
100
- encoding: encoding,
101
- format: 'csv' ) ## pass along csv format - why? why not?
102
- else
103
- ## todo/check - log error
104
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
- end
107
-
108
- ## to be done / continued
109
- response
110
- end # method self.dataset
111
-
112
-
113
- end # class Webget
114
-
1
+
2
+ class Webget # a web (go get) crawler
3
+
4
+ class Configuration ## nested class
5
+ #######################
6
+ ## accessors
7
+ def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
8
+ def sleep=(value) @sleep = value; end
9
+ ## add delay, delay_in_s alias - why? why not?
10
+ alias_method :delay, :sleep
11
+ alias_method :delay_in_s, :sleep
12
+ alias_method :delay=, :sleep=
13
+ alias_method :delay_in_s=, :sleep=
14
+ end # (nested) class Configuration
15
+
16
+ ## lets you use
17
+ ## Webget.configure do |config|
18
+ ## config.sleep = 10
19
+ ## end
20
+ def self.configure() yield( config ); end
21
+ def self.config() @config ||= Configuration.new; end
22
+
23
+
24
+
25
+ ## note - assumes json format
26
+ ## encoding always utf-8 by definition! - double check?)
27
+ def self.call( url, headers: {} )
28
+ response = _get( url, headers: headers )
29
+
30
+ if response.status.ok? ## must be HTTP 200
31
+ puts "#{response.status.code} #{response.status.message}"
32
+ ## note: use format json for pretty printing and parse check!!!!
33
+ Webcache.record( url, response,
34
+ format: 'json' )
35
+ else
36
+ ## todo/check - log error
37
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
38
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
39
+ end
40
+
41
+ ## to be done / continued
42
+ response
43
+ end # method self.call
44
+
45
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
46
+ ## check encoding UTF-8 or utf-8 - makes a difference?
47
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
48
+ response = _get( url, headers: headers )
49
+
50
+ if response.status.ok? ## must be HTTP 200
51
+ puts "#{response.status.code} #{response.status.message}"
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
54
+ else
55
+ ## todo/check - log error
56
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
57
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
58
+ end
59
+
60
+ ## to be done / continued
61
+ response
62
+ end # method self.page
63
+
64
+
65
+ ## assumes txt format
66
+ def self.text( url, path: nil, headers: {} )
67
+ response = _get( url, headers: headers )
68
+
69
+ if response.status.ok? ## must be HTTP 200
70
+ puts "#{response.status.code} #{response.status.message}"
71
+ ## note: like json assumes always utf-8 encoding for now !!!
72
+ Webcache.record( url, response,
73
+ path: path, ## optional "custom" (file)path for saving in cache
74
+ format: 'txt' )
75
+ else
76
+ ## todo/check - log error
77
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
78
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
79
+ end
80
+
81
+ ## to be done / continued
82
+ response
83
+ end # method self.text
84
+
85
+
86
+
87
+ ## todo/check: rename to csv or file or records or - why? why not?
88
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
89
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
90
+ response = _get( url, headers: headers )
91
+
92
+ if response.status.ok? ## must be HTTP 200
93
+ puts "#{response.status.code} #{response.status.message}"
94
+ Webcache.record( url, response,
95
+ encoding: encoding,
96
+ format: 'csv' ) ## pass along csv format - why? why not?
97
+ else
98
+ ## todo/check - log error
99
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
100
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
101
+ end
102
+
103
+ ## to be done / continued
104
+ response
105
+ end # method self.dataset
106
+
107
+
108
+
109
+ ####
110
+ ## private helpers
111
+ ## make private - why? why not?
112
+ def self._get( url, headers: {} )
113
+ @@requests ||= 0 ## track number of requests
114
+
115
+ if @@requests > 0 ## note - do NOT sleep on very first request!!!
116
+ puts " sleep #{config.sleep} sec(s)..."
117
+ sleep( config.sleep ) ## slow down - sleep x secs before each http request
118
+ end
119
+
120
+ @@requests += 1
121
+ Webclient.get( url, headers: headers ) ## returns respone
122
+ end
123
+ end # class Webget
124
+
data/lib/webget.rb CHANGED
@@ -1,25 +1,29 @@
1
- require 'webclient'
2
-
3
- ## more (our own) 3rd party libs
4
- require 'csvreader'
5
-
6
-
7
- ## our own code
8
- require 'webget/version' # let version go first
9
- require 'webget/webcache'
10
- require 'webget/webget'
11
-
12
-
13
-
14
-
15
- ############
16
- ## add convenience alias for camel case / alternate different spelling
17
- WebCache = Webcache
18
- WebGet = Webget
19
-
20
- ## use Webgo as (alias) name (keep reserver for now) - why? why not?
21
- WebGo = Webget
22
- Webgo = Webget
23
-
24
-
25
- puts Webget.banner # say hello
1
+ require 'webclient'
2
+
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
7
+ # NEW!! - require/add cocos
8
+ require 'cocos' # - note - cococs incl. webclient & cvsreader !!!!
9
+
10
+
11
+ ## our own code
12
+ require_relative 'webget/version' # let version go first
13
+ require_relative 'webget/webcache'
14
+ require_relative 'webget/webget'
15
+
16
+
17
+
18
+
19
+ ############
20
+ ## add convenience alias for camel case / alternate different spelling
21
+ WebCache = Webcache
22
+ WebGet = Webget
23
+
24
+ ## use Webgo as (alias) name (keep reserver for now) - why? why not?
25
+ WebGo = Webget
26
+ Webgo = Webget
27
+
28
+
29
+ puts Webget.banner # say hello
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-21 00:00:00.000000000 Z
11
+ date: 2024-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.2.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: cocos
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rdoc
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -64,16 +78,16 @@ dependencies:
64
78
  requirements:
65
79
  - - "~>"
66
80
  - !ruby/object:Gem::Version
67
- version: '3.22'
81
+ version: '4.1'
68
82
  type: :development
69
83
  prerelease: false
70
84
  version_requirements: !ruby/object:Gem::Requirement
71
85
  requirements:
72
86
  - - "~>"
73
87
  - !ruby/object:Gem::Version
74
- version: '3.22'
88
+ version: '4.1'
75
89
  description: webget gem - a web (go get) crawler incl. web cache
76
- email: ruby-talk@ruby-lang.org
90
+ email: gerald.bauer@gmail.com
77
91
  executables: []
78
92
  extensions: []
79
93
  extra_rdoc_files:
@@ -110,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
124
  - !ruby/object:Gem::Version
111
125
  version: '0'
112
126
  requirements: []
113
- rubygems_version: 3.1.4
127
+ rubygems_version: 3.4.10
114
128
  signing_key:
115
129
  specification_version: 4
116
130
  summary: webget gem - a web (go get) crawler incl. web cache