webget 0.2.5 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 33af918725af96b367234f9280264a7cbc0a5c5fd965f4f5cd6da07a26ac43f4
4
- data.tar.gz: a53145c4aa919e3073408decd06ebaa8262f70f2077a85bcb8c5cad1a295ec25
3
+ metadata.gz: de0060acabe176d1a11a4f2e36e8bee5090083c4907b75d9b22e9cbb9c7b22e2
4
+ data.tar.gz: fb64c2b1294932b00fa401af006cd27b263b255789bad75e4ef62bec5598fd74
5
5
  SHA512:
6
- metadata.gz: 7459a96f8235fd8a9cec9d0f12512a3d8b26a4ef6b9f6f87b768ef75e5b806f32986ba4d507a78957f068b181abab4ec4386fd059f2f7a2c6c1f1098c96f82c4
7
- data.tar.gz: 5272e8b6ce21110745d41b1c137aad4b92f99d7b1e519a1f0aade275e22d9bbc5f14b15c306eb9d92c81373519e3ef48e95b2e0dcf22367fd49514cceee5f265
6
+ metadata.gz: d1f1653c68729e7d609c3c848e0146de9668cd2932ddab2e794ab361d3f786603a10c7586bcf0029de4955255803987000b2fb35b0d1e14cfa35d0582f919be7
7
+ data.tar.gz: c6cff08b2f683bb5b8e39607735250f954115244be3e19dfcdeb29fd2d0a0f0a27be1fed5a2add9a1b75a9cbe81dbafea1e80f06b0a1bdd93d4bd3bd12d5a9a1
data/CHANGELOG.md CHANGED
@@ -1,4 +1,6 @@
1
- ### 0.0.1 / 2020-10-04
2
-
3
- * Everything is new. First release.
4
-
1
+ ### 0.3.1
2
+
3
+ ### 0.0.1 / 2020-10-04
4
+
5
+ * Everything is new. First release.
6
+
data/README.md CHANGED
@@ -1,20 +1,19 @@
1
- # webget
2
-
3
- webget gem - a web (go get) crawler incl. web cache
4
-
5
- * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
- * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
- * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
- * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
- * forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
10
-
11
-
12
- ## Usage
13
-
14
- TBD
15
-
16
-
17
- ## License
18
-
19
- The `webget` scripts are dedicated to the public domain.
20
- Use it as you please with no restrictions whatsoever.
1
+ # webget
2
+
3
+ webget gem - a web (go get) crawler incl. web cache
4
+
5
+ * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
+ * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
+ * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
+ * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
+
10
+
11
+ ## Usage
12
+
13
+ TBD
14
+
15
+
16
+ ## License
17
+
18
+ The `webget` scripts are dedicated to the public domain.
19
+ Use it as you please with no restrictions whatsoever.
data/Rakefile CHANGED
@@ -1,31 +1,32 @@
1
- require 'hoe'
2
- require './lib/webget/version.rb'
3
-
4
- Hoe.spec 'webget' do
5
-
6
- self.version = Webget::VERSION
7
-
8
- self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
- self.description = summary
10
-
11
- self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
-
13
- self.author = 'Gerald Bauer'
14
- self.email = 'ruby-talk@ruby-lang.org'
15
-
16
- # switch extension to .markdown for gihub formatting
17
- self.readme_file = 'README.md'
18
- self.history_file = 'CHANGELOG.md'
19
-
20
- self.extra_deps = [
21
- ['webclient', '>= 0.2.0'],
22
- ['csvreader', '>= 1.2.4'],
23
- ]
24
-
25
- self.licenses = ['Public Domain']
26
-
27
- self.spec_extras = {
28
- required_ruby_version: '>= 2.2.2'
29
- }
30
-
31
- end
1
+ require 'hoe'
2
+ require './lib/webget/version.rb'
3
+
4
+ Hoe.spec 'webget' do
5
+
6
+ self.version = Webget::VERSION
7
+
8
+ self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
+ self.description = summary
10
+
11
+ self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
+
13
+ self.author = 'Gerald Bauer'
14
+ self.email = 'gerald.bauer@gmail.com'
15
+
16
+ # switch extension to .markdown for github formatting
17
+ self.readme_file = 'README.md'
18
+ self.history_file = 'CHANGELOG.md'
19
+
20
+ self.extra_deps = [
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
23
+ ['cocos'],
24
+ ]
25
+
26
+ self.licenses = ['Public Domain']
27
+
28
+ self.spec_extras = {
29
+ required_ruby_version: '>= 2.2.2'
30
+ }
31
+
32
+ end
@@ -1,23 +1,21 @@
1
-
2
- class Webget
3
-
4
- MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
- MINOR = 2
6
- PATCH = 5
7
- VERSION = [MAJOR,MINOR,PATCH].join('.')
8
-
9
- def self.version
10
- VERSION
11
- end
12
-
13
- # version string for generator meta tag (includes ruby version)
14
- def self.banner
15
- "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end # module Webget
23
-
1
+
2
+ class Webget
3
+ MAJOR = 0 ## todo: namespace inside version or something - why? why not??
4
+ MINOR = 3
5
+ PATCH = 1
6
+ VERSION = [MAJOR,MINOR,PATCH].join('.')
7
+
8
+ def self.version
9
+ VERSION
10
+ end
11
+
12
+ # version string for generator meta tag (includes ruby version)
13
+ def self.banner
14
+ "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
15
+ end
16
+
17
+ def self.root
18
+ File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
19
+ end
20
+ end # class Webget
21
+
@@ -1,230 +1,234 @@
1
-
2
-
3
- module Webcache
4
-
5
- #####
6
- # copied from props gem, see Env.home
7
- # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
- # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
- def self.home
10
- path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
- ENV['HOME'] || ENV['USERPROFILE']
12
- elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
- "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
- else
15
- begin
16
- File.expand_path('~')
17
- rescue
18
- if File::ALT_SEPARATOR
19
- 'C:/'
20
- else
21
- '/'
22
- end
23
- end
24
- end
25
-
26
- ## note: use File.expand_path to "unify" path e.g
27
- ## C:\Users\roman becomes
28
- ## C:/Users/roman
29
-
30
- File.expand_path( path )
31
- end
32
-
33
-
34
- class Configuration
35
- ## root directory - todo/check: find/use a better name - why? why not?
36
- def root() @root || "#{Webcache.home}/.cache"; end
37
- def root=(value) @root = value; end
38
- end # class Configuration
39
-
40
-
41
- ## lets you use
42
- ## Webcache.configure do |config|
43
- ## config.root = './cache'
44
- ## end
45
- def self.configure() yield( config ); end
46
- def self.config() @config ||= Configuration.new; end
47
-
48
-
49
- ## add "high level" root convenience helpers
50
- def self.root() config.root; end
51
- def self.root=(value) config.root = value; end
52
-
53
-
54
- ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
- def self.cache() @cache ||= DiskCache.new; end
56
-
57
- def self.record( url, response,
58
- path: nil,
59
- encoding: 'UTF-8',
60
- format: 'html' )
61
- cache.record( url, response,
62
- path: path,
63
- encoding: encoding,
64
- format: format );
65
- end
66
- def self.cached?( url ) cache.cached?( url ); end
67
- class << self
68
- alias_method :exist?, :cached?
69
- end
70
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
71
- def self.read( url ) cache.read( url ); end
72
- def self.read_json( url ) cache.read_json( url ); end
73
- def self.read_csv( url ) cache.read_csv( url ); end
74
-
75
-
76
-
77
- class DiskCache
78
- def cached?( url )
79
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
80
- File.exist?( body_path )
81
- end
82
- alias_method :exist?, :cached?
83
-
84
-
85
- def read( url )
86
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
87
- File.open( body_path, 'r:utf-8' ) {|f| f.read }
88
- end
89
-
90
- def read_json( url )
91
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
92
- txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
93
- data = JSON.parse( txt )
94
- data
95
- end
96
-
97
- def read_csv( url )
98
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
- txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
- data = CsvHash.parse( txt )
101
- data
102
- end
103
-
104
-
105
- ## add more save / put / etc. aliases - why? why not?
106
- ## rename to record_html - why? why not?
107
- def record( url, response,
108
- path: nil,
109
- encoding: 'UTF-8',
110
- format: 'html' )
111
-
112
- body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
113
- meta_path = "#{body_path}.meta.txt"
114
-
115
- ## make sure path exits
116
- FileUtils.mkdir_p( File.dirname( body_path ) )
117
-
118
-
119
- puts "[cache] saving #{body_path}..."
120
-
121
- ## todo/check: verify content-type - why? why not?
122
- ## note - for now respone.text always assume (converted) to utf8!!!!!!!!!
123
- if format == 'json'
124
- File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
125
- elsif format == 'csv'
126
- ## fix: newlines - always use "unix" style" - why? why not?
127
- ## fix: use :newline => :universal option? translates to univeral "\n"
128
- text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
- File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
- else ## html or txt
131
- text = response.text( encoding: encoding )
132
- File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
133
- end
134
-
135
-
136
- File.open( meta_path, 'w:utf-8' ) do |f|
137
- ## todo/check:
138
- ## do headers also need to converted (like text) if encoding is NOT utf-8 ???
139
- response.headers.each do |key, value| # iterate all response headers
140
- f.write( "#{key}: #{value}" )
141
- f.write( "\n" )
142
- end
143
- end
144
- end
145
-
146
-
147
-
148
- ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
149
- ## use file:// instead of disk:// - why? why not?
150
- def url_to_id( str ) "disk://#{url_to_path( str )}"; end
151
-
152
-
153
- ### helpers
154
- def url_to_path( str, path: nil )
155
- ## map url to file path
156
- uri = URI.parse( str )
157
-
158
- ## note: ignore scheme (e.g. http/https)
159
- ## and post (e.g. 80, 8080, etc.) for now
160
- ## always downcase for now (internet domain is case insensitive)
161
- host_dir = uri.host.downcase
162
-
163
- req_path = if path ## use "custom" (file)path for cache storage if passed in
164
- path
165
- else
166
- ## "/this/is/everything?query=params"
167
- ## cut-off leading slash and
168
- ## convert query ? =
169
- uri.request_uri[1..-1]
170
- end
171
-
172
-
173
-
174
- ### special "prettify" rule for weltfussball
175
- ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
176
- if host_dir.index( 'weltfussball.de' ) ||
177
- host_dir.index( 'worldfootball.net' )
178
- if req_path.end_with?( '/' )
179
- req_path = "#{req_path[0..-2]}.html"
180
- else
181
- puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
182
- exit 1
183
- end
184
- elsif host_dir.index( 'tipp3.at' )
185
- req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
-
187
- ## change ? to -I-
188
- ## change = to ~
189
- ## Example:
190
- ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
- ## =>
192
- ## sportwetten/classicresults-I-oddsetProgramID~888
193
- req_path = req_path.gsub( '?', '-I-' )
194
- .gsub( '=', '~')
195
-
196
- req_path = "#{req_path}.html"
197
- elsif host_dir.index( 'fbref.com' )
198
- req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
- req_path = "#{req_path}.html" # auto-add html extension
200
- elsif host_dir.index( 'football-data.co.uk' )
201
- req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
- req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
203
- elsif host_dir.index( 'football-data.org' )
204
- req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
205
-
206
- ## flattern - make a file path - for auto-save
207
- ## change ? to -I-
208
- ## change / to ~~
209
- ## change = to ~
210
- req_path = req_path.gsub( '?', '-I-' )
211
- .gsub( '/', '~~' )
212
- .gsub( '=', '~')
213
-
214
- req_path = "#{req_path}.json"
215
- elsif host_dir.index( 'api.cryptokitties.co' )
216
- ## for now always auto-add .json extensions e.g.
217
- ## kitties/1 => kitties/1.json
218
- ## cattributes => cattributes.json
219
- req_path = "#{req_path}.json"
220
- else
221
- ## no special rule
222
- end
223
-
224
- page_path = "#{host_dir}/#{req_path}"
225
- page_path
226
- end
227
- end # class DiskCache
228
-
229
-
230
- end # module Webcache
1
+
2
+
3
+ module Webcache
4
+
5
+ #####
6
+ # copied from props gem, see Env.home
7
+ # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
+ # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
+ def self.home
10
+ path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
+ ENV['HOME'] || ENV['USERPROFILE']
12
+ elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
+ "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
+ else
15
+ begin
16
+ File.expand_path('~')
17
+ rescue
18
+ if File::ALT_SEPARATOR
19
+ 'C:/'
20
+ else
21
+ '/'
22
+ end
23
+ end
24
+ end
25
+
26
+ ## note: use File.expand_path to "unify" path e.g
27
+ ## C:\Users\roman becomes
28
+ ## C:/Users/roman
29
+
30
+ File.expand_path( path )
31
+ end
32
+
33
+
34
+ class Configuration
35
+ ## root directory - todo/check: find/use a better name - why? why not?
36
+ def root() @root || "#{Webcache.home}/.cache"; end
37
+ def root=(value) @root = value; end
38
+ end # class Configuration
39
+
40
+
41
+ ## lets you use
42
+ ## Webcache.configure do |config|
43
+ ## config.root = './cache'
44
+ ## end
45
+ def self.configure() yield( config ); end
46
+ def self.config() @config ||= Configuration.new; end
47
+
48
+
49
+ ## add "high level" root convenience helpers
50
+ ## use delegate helper - why? why not?
51
+ def self.root() config.root; end
52
+ def self.root=(value) config.root = value; end
53
+
54
+
55
+ ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
56
+ def self.cache() @cache ||= DiskCache.new; end
57
+
58
+ def self.record( url, response,
59
+ path: nil,
60
+ encoding: 'UTF-8',
61
+ format: 'html' )
62
+ cache.record( url, response,
63
+ path: path,
64
+ encoding: encoding,
65
+ format: format );
66
+ end
67
+ def self.cached?( url ) cache.cached?( url ); end
68
+ class << self
69
+ alias_method :exist?, :cached?
70
+ end
71
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
72
+ def self.read( url ) cache.read( url ); end
73
+ def self.read_json( url ) cache.read_json( url ); end
74
+ def self.read_csv( url ) cache.read_csv( url ); end
75
+
76
+
77
+
78
+ class DiskCache
79
+ def cached?( url )
80
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
81
+ File.exist?( body_path )
82
+ end
83
+ alias_method :exist?, :cached?
84
+
85
+
86
+ def read( url )
87
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
88
+ File.open( body_path, 'r:utf-8' ) {|f| f.read }
89
+ end
90
+
91
+ def read_json( url )
92
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
93
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
94
+ data = JSON.parse( txt )
95
+ data
96
+ end
97
+
98
+ def read_csv( url )
99
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
100
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
101
+ data = CsvHash.parse( txt )
102
+ data
103
+ end
104
+
105
+
106
+ ## add more save / put / etc. aliases - why? why not?
107
+ ## rename to record_html - why? why not?
108
+ def record( url, response,
109
+ path: nil,
110
+ encoding: 'UTF-8',
111
+ format: 'html' )
112
+
113
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
114
+ meta_path = "#{body_path}.meta.txt"
115
+
116
+ ## make sure path exists
117
+ FileUtils.mkdir_p( File.dirname( body_path ) )
118
+
119
+
120
+ puts "[cache] saving #{body_path}..."
121
+
122
+ ## todo/check: verify content-type - why? why not?
123
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
124
+ ##
125
+ ## fix: newlines - always use "unix" style" - why? why not?
126
+ ## fix: use :newline => :universal option? translates to universal "\n"
127
+ if format == 'json'
128
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
129
+ elsif format == 'csv'
130
+ ## fix: newlines - always use "unix" style" - why? why not?
131
+ ## fix: use :newline => :universal option? translates to universal "\n"
132
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
133
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
134
+ else ## html or txt
135
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
136
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
137
+ end
138
+
139
+
140
+ File.open( meta_path, 'w:utf-8' ) do |f|
141
+ ## todo/check:
142
+ ## do headers also need to converted (like text) if encoding is NOT utf-8 ???
143
+ response.headers.each do |key, value| # iterate all response headers
144
+ f.write( "#{key}: #{value}" )
145
+ f.write( "\n" )
146
+ end
147
+ end
148
+ end
149
+
150
+
151
+
152
+ ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
153
+ ## use file:// instead of disk:// - why? why not?
154
+ def url_to_id( str ) "disk://#{url_to_path( str )}"; end
155
+
156
+
157
+ ### helpers
158
+ def url_to_path( str, path: nil )
159
+ ## map url to file path
160
+ uri = URI.parse( str )
161
+
162
+ ## note: ignore scheme (e.g. http/https)
163
+ ## and port (e.g. 80, 8080, etc.) for now
164
+ ## always downcase for now (internet domain is case insensitive)
165
+ host_dir = uri.host.downcase
166
+
167
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
168
+ path
169
+ else
170
+ ## "/this/is/everything?query=params"
171
+ ## cut-off leading slash and
172
+ ## convert query ? =
173
+ uri.request_uri[1..-1]
174
+ end
175
+
176
+
177
+
178
+ ### special "prettify" rule for weltfussball
179
+ ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
180
+ if host_dir.index( 'weltfussball.de' ) ||
181
+ host_dir.index( 'worldfootball.net' )
182
+ if req_path.end_with?( '/' )
183
+ req_path = "#{req_path[0..-2]}.html"
184
+ else
185
+ puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
186
+ exit 1
187
+ end
188
+ elsif host_dir.index( 'tipp3.at' )
189
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
190
+
191
+ ## change ? to -I-
192
+ ## change = to ~
193
+ ## Example:
194
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
195
+ ## =>
196
+ ## sportwetten/classicresults-I-oddsetProgramID~888
197
+ req_path = req_path.gsub( '?', '-I-' )
198
+ .gsub( '=', '~')
199
+
200
+ req_path = "#{req_path}.html"
201
+ elsif host_dir.index( 'fbref.com' )
202
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
203
+ req_path = "#{req_path}.html" # auto-add html extension
204
+ elsif host_dir.index( 'football-data.co.uk' )
205
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
206
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
207
+ elsif host_dir.index( 'football-data.org' )
208
+ ## req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
209
+
210
+ ## flatten - make a file path - for auto-save
211
+ ## change ? to -I-
212
+ ## change / to ~~
213
+ ## change = to ~
214
+ req_path = req_path.gsub( '?', '-I-' )
215
+ .gsub( '/', '~~' )
216
+ .gsub( '=', '~')
217
+
218
+ req_path = "#{req_path}.json"
219
+ elsif host_dir.index( 'api.cryptokitties.co' )
220
+ ## for now always auto-add .json extensions e.g.
221
+ ## kitties/1 => kitties/1.json
222
+ ## cattributes => cattributes.json
223
+ req_path = "#{req_path}.json"
224
+ else
225
+ ## no special rule
226
+ end
227
+
228
+ page_path = "#{host_dir}/#{req_path}"
229
+ page_path
230
+ end
231
+ end # class DiskCache
232
+
233
+
234
+ end # module Webcache
data/lib/webget/webget.rb CHANGED
@@ -1,114 +1,124 @@
1
-
2
- class Webget # a web (go get) crawler
3
-
4
- class Configuration ## nested class
5
-
6
- #######################
7
- ## accessors
8
- def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
9
- def sleep=(value) @sleep = value; end
10
-
11
- end # (nested) class Configuration
12
-
13
- ## lets you use
14
- ## Webget.configure do |config|
15
- ## config.sleep = 10
16
- ## end
17
- def self.configure() yield( config ); end
18
- def self.config() @config ||= Configuration.new; end
19
-
20
-
21
-
22
- def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
- puts " sleep #{config.sleep} sec(s)..."
24
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
-
26
- response = Webclient.get( url, headers: headers )
27
-
28
- if response.status.ok? ## must be HTTP 200
29
- puts "#{response.status.code} #{response.status.message}"
30
- ## note: use format json for pretty printing and parse check!!!!
31
- Webcache.record( url, response,
32
- format: 'json' )
33
- else
34
- ## todo/check - log error
35
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
36
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
37
- end
38
-
39
- ## to be done / continued
40
- response
41
- end # method self.call
42
-
43
- ## todo/check: rename encoding to html/http-like charset - why? why not?
44
- def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
- puts " sleep #{config.sleep} sec(s)..."
46
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
-
48
- response = Webclient.get( url, headers: headers )
49
-
50
- if response.status.ok? ## must be HTTP 200
51
- puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response,
53
- encoding: encoding ) ## assumes format: html (default)
54
- else
55
- ## todo/check - log error
56
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
57
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
58
- end
59
-
60
- ## to be done / continued
61
- response
62
- end # method self.page
63
-
64
-
65
- def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
- puts " sleep #{config.sleep} sec(s)..."
67
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
-
69
- response = Webclient.get( url, headers: headers )
70
-
71
- if response.status.ok? ## must be HTTP 200
72
- puts "#{response.status.code} #{response.status.message}"
73
- ## note: like json assumes always utf-8 encoding for now !!!
74
- Webcache.record( url, response,
75
- path: path, ## optional "custom" (file)path for saving in cache
76
- format: 'txt' )
77
- else
78
- ## todo/check - log error
79
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
- end
82
-
83
- ## to be done / continued
84
- response
85
- end # method self.text
86
-
87
-
88
-
89
- ## todo/check: rename to csv or file or records or - why? why not?
90
- ## todo/check: rename encoding to html/http-like charset - why? why not?
91
- def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
- puts " sleep #{config.sleep} sec(s)..."
93
- sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
-
95
- response = Webclient.get( url, headers: headers )
96
-
97
- if response.status.ok? ## must be HTTP 200
98
- puts "#{response.status.code} #{response.status.message}"
99
- Webcache.record( url, response,
100
- encoding: encoding,
101
- format: 'csv' ) ## pass along csv format - why? why not?
102
- else
103
- ## todo/check - log error
104
- puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
- pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
- end
107
-
108
- ## to be done / continued
109
- response
110
- end # method self.dataset
111
-
112
-
113
- end # class Webget
114
-
1
+
2
+ class Webget # a web (go get) crawler
3
+
4
+ class Configuration ## nested class
5
+ #######################
6
+ ## accessors
7
+ def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
8
+ def sleep=(value) @sleep = value; end
9
+ ## add delay, delay_in_s alias - why? why not?
10
+ alias_method :delay, :sleep
11
+ alias_method :delay_in_s, :sleep
12
+ alias_method :delay=, :sleep=
13
+ alias_method :delay_in_s=, :sleep=
14
+ end # (nested) class Configuration
15
+
16
+ ## lets you use
17
+ ## Webget.configure do |config|
18
+ ## config.sleep = 10
19
+ ## end
20
+ def self.configure() yield( config ); end
21
+ def self.config() @config ||= Configuration.new; end
22
+
23
+
24
+
25
+ ## note - assumes json format
26
+ ## encoding always utf-8 by definition! - double check?
27
+ def self.call( url, headers: {} )
28
+ response = _get( url, headers: headers )
29
+
30
+ if response.status.ok? ## must be HTTP 200
31
+ puts "#{response.status.code} #{response.status.message}"
32
+ ## note: use format json for pretty printing and parse check!!!!
33
+ Webcache.record( url, response,
34
+ format: 'json' )
35
+ else
36
+ ## todo/check - log error
37
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
38
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
39
+ end
40
+
41
+ ## to be done / continued
42
+ response
43
+ end # method self.call
44
+
45
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
46
+ ## check encoding UTF-8 or utf-8 - makes a difference?
47
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
48
+ response = _get( url, headers: headers )
49
+
50
+ if response.status.ok? ## must be HTTP 200
51
+ puts "#{response.status.code} #{response.status.message}"
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
54
+ else
55
+ ## todo/check - log error
56
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
57
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
58
+ end
59
+
60
+ ## to be done / continued
61
+ response
62
+ end # method self.page
63
+
64
+
65
+ ## assumes txt format
66
+ def self.text( url, path: nil, headers: {} )
67
+ response = _get( url, headers: headers )
68
+
69
+ if response.status.ok? ## must be HTTP 200
70
+ puts "#{response.status.code} #{response.status.message}"
71
+ ## note: like json assumes always utf-8 encoding for now !!!
72
+ Webcache.record( url, response,
73
+ path: path, ## optional "custom" (file)path for saving in cache
74
+ format: 'txt' )
75
+ else
76
+ ## todo/check - log error
77
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
78
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
79
+ end
80
+
81
+ ## to be done / continued
82
+ response
83
+ end # method self.text
84
+
85
+
86
+
87
+ ## todo/check: rename to csv or file or records or - why? why not?
88
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
89
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
90
+ response = _get( url, headers: headers )
91
+
92
+ if response.status.ok? ## must be HTTP 200
93
+ puts "#{response.status.code} #{response.status.message}"
94
+ Webcache.record( url, response,
95
+ encoding: encoding,
96
+ format: 'csv' ) ## pass along csv format - why? why not?
97
+ else
98
+ ## todo/check - log error
99
+ puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
100
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
101
+ end
102
+
103
+ ## to be done / continued
104
+ response
105
+ end # method self.dataset
106
+
107
+
108
+
109
+ ####
110
+ ## private helpers
111
+ ## make private - why? why not?
112
+ def self._get( url, headers: {} )
113
+ @@requests ||= 0 ## track number of requests
114
+
115
+ if @@requests > 0 ## note - do NOT sleep on very first request!!!
116
+ puts " sleep #{config.sleep} sec(s)..."
117
+ sleep( config.sleep ) ## slow down - sleep x secs before each http request
118
+ end
119
+
120
+ @@requests += 1
121
+ Webclient.get( url, headers: headers ) ## returns response
122
+ end
123
+ end # class Webget
124
+
data/lib/webget.rb CHANGED
@@ -1,25 +1,29 @@
1
- require 'webclient'
2
-
3
- ## more (our own) 3rd party libs
4
- require 'csvreader'
5
-
6
-
7
- ## our own code
8
- require 'webget/version' # let version go first
9
- require 'webget/webcache'
10
- require 'webget/webget'
11
-
12
-
13
-
14
-
15
- ############
16
- ## add convenience alias for camel case / alternate different spelling
17
- WebCache = Webcache
18
- WebGet = Webget
19
-
20
- ## use Webgo as (alias) name (keep reserver for now) - why? why not?
21
- WebGo = Webget
22
- Webgo = Webget
23
-
24
-
25
- puts Webget.banner # say hello
1
+ require 'webclient'
2
+
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
5
+
6
+
7
+ # NEW!! - require/add cocos
8
+ require 'cocos' # - note - cocos incl. webclient & csvreader !!!!
9
+
10
+
11
+ ## our own code
12
+ require_relative 'webget/version' # let version go first
13
+ require_relative 'webget/webcache'
14
+ require_relative 'webget/webget'
15
+
16
+
17
+
18
+
19
+ ############
20
+ ## add convenience alias for camel case / alternate different spelling
21
+ WebCache = Webcache
22
+ WebGet = Webget
23
+
24
+ ## use Webgo as (alias) name (keep reserved for now) - why? why not?
25
+ WebGo = Webget
26
+ Webgo = Webget
27
+
28
+
29
+ puts Webget.banner # say hello
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-21 00:00:00.000000000 Z
11
+ date: 2024-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 1.2.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: cocos
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rdoc
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -64,16 +78,16 @@ dependencies:
64
78
  requirements:
65
79
  - - "~>"
66
80
  - !ruby/object:Gem::Version
67
- version: '3.22'
81
+ version: '4.1'
68
82
  type: :development
69
83
  prerelease: false
70
84
  version_requirements: !ruby/object:Gem::Requirement
71
85
  requirements:
72
86
  - - "~>"
73
87
  - !ruby/object:Gem::Version
74
- version: '3.22'
88
+ version: '4.1'
75
89
  description: webget gem - a web (go get) crawler incl. web cache
76
- email: ruby-talk@ruby-lang.org
90
+ email: gerald.bauer@gmail.com
77
91
  executables: []
78
92
  extensions: []
79
93
  extra_rdoc_files:
@@ -110,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
124
  - !ruby/object:Gem::Version
111
125
  version: '0'
112
126
  requirements: []
113
- rubygems_version: 3.1.4
127
+ rubygems_version: 3.4.10
114
128
  signing_key:
115
129
  specification_version: 4
116
130
  summary: webget gem - a web (go get) crawler incl. web cache