webget 0.2.5 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -4
- data/README.md +19 -20
- data/Rakefile +32 -31
- data/lib/webget/version.rb +21 -23
- data/lib/webget/webcache.rb +234 -230
- data/lib/webget/webget.rb +124 -114
- data/lib/webget.rb +29 -25
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: de0060acabe176d1a11a4f2e36e8bee5090083c4907b75d9b22e9cbb9c7b22e2
|
4
|
+
data.tar.gz: fb64c2b1294932b00fa401af006cd27b263b255789bad75e4ef62bec5598fd74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1f1653c68729e7d609c3c848e0146de9668cd2932ddab2e794ab361d3f786603a10c7586bcf0029de4955255803987000b2fb35b0d1e14cfa35d0582f919be7
|
7
|
+
data.tar.gz: c6cff08b2f683bb5b8e39607735250f954115244be3e19dfcdeb29fd2d0a0f0a27be1fed5a2add9a1b75a9cbe81dbafea1e80f06b0a1bdd93d4bd3bd12d5a9a1
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,20 +1,19 @@
|
|
1
|
-
# webget
|
2
|
-
|
3
|
-
webget gem - a web (go get) crawler incl. web cache
|
4
|
-
|
5
|
-
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
6
|
-
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
7
|
-
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
|
-
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
Use it as you please with no restrictions whatsoever.
|
1
|
+
# webget
|
2
|
+
|
3
|
+
webget gem - a web (go get) crawler incl. web cache
|
4
|
+
|
5
|
+
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
6
|
+
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
7
|
+
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
|
+
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
|
+
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
TBD
|
14
|
+
|
15
|
+
|
16
|
+
## License
|
17
|
+
|
18
|
+
The `webget` scripts are dedicated to the public domain.
|
19
|
+
Use it as you please with no restrictions whatsoever.
|
data/Rakefile
CHANGED
@@ -1,31 +1,32 @@
|
|
1
|
-
require 'hoe'
|
2
|
-
require './lib/webget/version.rb'
|
3
|
-
|
4
|
-
Hoe.spec 'webget' do
|
5
|
-
|
6
|
-
self.version = Webget::VERSION
|
7
|
-
|
8
|
-
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
9
|
-
self.description = summary
|
10
|
-
|
11
|
-
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
12
|
-
|
13
|
-
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
15
|
-
|
16
|
-
# switch extension to .markdown for gihub formatting
|
17
|
-
self.readme_file = 'README.md'
|
18
|
-
self.history_file = 'CHANGELOG.md'
|
19
|
-
|
20
|
-
self.extra_deps = [
|
21
|
-
['webclient', '>= 0.2.0'],
|
22
|
-
['csvreader', '>= 1.2.4'],
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
1
|
+
require 'hoe'
|
2
|
+
require './lib/webget/version.rb'
|
3
|
+
|
4
|
+
Hoe.spec 'webget' do
|
5
|
+
|
6
|
+
self.version = Webget::VERSION
|
7
|
+
|
8
|
+
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
9
|
+
self.description = summary
|
10
|
+
|
11
|
+
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
12
|
+
|
13
|
+
self.author = 'Gerald Bauer'
|
14
|
+
self.email = 'gerald.bauer@gmail.com'
|
15
|
+
|
16
|
+
# switch extension to .markdown for github formatting
|
17
|
+
self.readme_file = 'README.md'
|
18
|
+
self.history_file = 'CHANGELOG.md'
|
19
|
+
|
20
|
+
self.extra_deps = [
|
21
|
+
['webclient', '>= 0.2.0'],
|
22
|
+
['csvreader', '>= 1.2.4'],
|
23
|
+
['cocos'],
|
24
|
+
]
|
25
|
+
|
26
|
+
self.licenses = ['Public Domain']
|
27
|
+
|
28
|
+
self.spec_extras = {
|
29
|
+
required_ruby_version: '>= 2.2.2'
|
30
|
+
}
|
31
|
+
|
32
|
+
end
|
data/lib/webget/version.rb
CHANGED
@@ -1,23 +1,21 @@
|
|
1
|
-
|
2
|
-
class Webget
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end # module Webget
|
23
|
-
|
1
|
+
|
2
|
+
##
# Version information for the webget gem.
#
# Note: Webget is (re)opened as a class here; the crawler itself
#   lives in webget/webget.rb.
class Webget
  MAJOR = 0    ## todo: namespace inside version or something - why? why not??
  MINOR = 3
  PATCH = 1
  VERSION = [MAJOR, MINOR, PATCH].map { |part| part.to_s }.join( '.' )

  class << self
    ## returns the version string (same as the VERSION constant)
    def version() VERSION; end

    ## version string for generator meta tag (includes ruby version)
    def banner
      "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
    end

    ## expanded path three directory levels up from this file
    ##   note: walk up with File.dirname step-by-step (and expand last)
    ##         to keep the result the same for a relative __FILE__
    def root
      dir = __FILE__
      3.times { dir = File.dirname( dir ) }
      File.expand_path( dir )
    end
  end
end # class Webget
|
21
|
+
|
data/lib/webget/webcache.rb
CHANGED
@@ -1,230 +1,234 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
module Webcache
|
4
|
-
|
5
|
-
#####
|
6
|
-
# copied from props gem, see Env.home
|
7
|
-
# - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
|
8
|
-
# todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
|
9
|
-
def self.home
|
10
|
-
path = if( ENV['HOME'] || ENV['USERPROFILE'] )
|
11
|
-
ENV['HOME'] || ENV['USERPROFILE']
|
12
|
-
elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
|
13
|
-
"#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
|
14
|
-
else
|
15
|
-
begin
|
16
|
-
File.expand_path('~')
|
17
|
-
rescue
|
18
|
-
if File::ALT_SEPARATOR
|
19
|
-
'C:/'
|
20
|
-
else
|
21
|
-
'/'
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
## note: use File.expand_path to "unify" path e.g
|
27
|
-
## C:\Users\roman becomes
|
28
|
-
## C:/Users/roman
|
29
|
-
|
30
|
-
File.expand_path( path )
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
class Configuration
|
35
|
-
## root directory - todo/check: find/use a better name - why? why not?
|
36
|
-
def root() @root || "#{Webcache.home}/.cache"; end
|
37
|
-
def root=(value) @root = value; end
|
38
|
-
end # class Configuration
|
39
|
-
|
40
|
-
|
41
|
-
## lets you use
|
42
|
-
## Webcache.configure do |config|
|
43
|
-
## config.root = './cache'
|
44
|
-
## end
|
45
|
-
def self.configure() yield( config ); end
|
46
|
-
def self.config() @config ||= Configuration.new; end
|
47
|
-
|
48
|
-
|
49
|
-
## add "high level" root convenience helpers
|
50
|
-
|
51
|
-
def self.root
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
def self.
|
72
|
-
def self.
|
73
|
-
def self.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
data
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
data
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
##
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
##
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
def
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
##
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
##
|
192
|
-
##
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
req_path = req_path.sub( '
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
##
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
req_path =
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
##
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
end
|
1
|
+
|
2
|
+
|
3
|
+
##
# Webcache - a (disk-based) cache for recorded web responses.
#
#  The module-level methods delegate to a cache "backend"
#  (for now always a DiskCache) that maps urls to file paths
#  below Webcache.root.
module Webcache

  #####
  # copied from props gem, see Env.home
  #   - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
  #  todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
  #
  # returns the user's home directory (expanded) - looked up via
  #   HOME or USERPROFILE, then HOMEDRIVE+HOMEPATH, then '~';
  #   falls back to 'C:/' or '/' if '~' cannot be expanded
  def self.home
    path = if ENV['HOME'] || ENV['USERPROFILE']
             ENV['HOME'] || ENV['USERPROFILE']
           elsif ENV['HOMEDRIVE'] && ENV['HOMEPATH']
             "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
           else
             begin
               File.expand_path( '~' )
             rescue
               ## note: File::ALT_SEPARATOR is only set on platforms
               ##         with an alternate path separator
               File::ALT_SEPARATOR ? 'C:/' : '/'
             end
           end

    ## note: use File.expand_path to "unify" path e.g
    ##     C:\Users\roman  becomes
    ##     C:/Users/roman
    File.expand_path( path )
  end


  class Configuration
    ## root directory - todo/check: find/use a better name - why? why not?
    ##   defaults to <home>/.cache if not set
    def root()        @root || "#{Webcache.home}/.cache"; end
    def root=(value)  @root = value; end
  end # class Configuration


  ## lets you use
  ##   Webcache.configure do |config|
  ##      config.root = './cache'
  ##   end
  def self.configure() yield( config ); end
  def self.config()    @config ||= Configuration.new;  end


  ## add "high level" root convenience helpers
  ##   use delegate helper - why? why not?
  def self.root()        config.root; end
  def self.root=(value)  config.root = value; end


  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
  def self.cache() @cache ||= DiskCache.new; end

  ## delegates to cache.record (see DiskCache#record)
  def self.record( url, response,
                   path: nil,
                   encoding: 'UTF-8',
                   format: 'html' )
    cache.record( url, response,
                  path: path,
                  encoding: encoding,
                  format: format )
  end

  def self.cached?( url ) cache.cached?( url ); end
  class << self
    alias_method :exist?, :cached?
  end
  def self.url_to_id( url )  cache.url_to_id( url ); end   ## todo/check: rename to just id or something - why? why not?
  def self.read( url )       cache.read( url ); end
  def self.read_json( url )  cache.read_json( url ); end
  def self.read_csv( url )   cache.read_csv( url ); end



  ##
  # filesystem cache backend - stores every response body in
  #   <Webcache.root>/<host>/<path> plus the response headers
  #   in a <body file>.meta.txt file next to it
  class DiskCache

    ## returns true if a body file exists for the url
    def cached?( url )
      File.exist?( body_path_for( url ) )
    end
    alias_method :exist?, :cached?


    ## returns the cached body as (utf-8) text
    ##   note: raises if the body file is missing (see File.open)
    def read( url )
      File.open( body_path_for( url ), 'r:utf-8' ) {|f| f.read }
    end

    ## returns the cached body parsed with JSON.parse
    def read_json( url )
      JSON.parse( read( url ) )
    end

    ## returns the cached body parsed with CsvHash.parse
    def read_csv( url )
      CsvHash.parse( read( url ) )
    end


    ## add more save / put / etc. aliases - why? why not?
    ##  rename to record_html - why? why not?
    ##
    ## saves the response body and the response headers (.meta.txt)
    ##
    ##  url      - request url; mapped to the file path via url_to_path
    ##  response - must respond to json (used for format 'json'),
    ##               text( encoding: ) (used for all other formats)
    ##               and headers (each yielding key & value)
    ##  path     - optional "custom" (file)path replacing the url's request path
    ##  encoding - passed along to response.text
    ##  format   - 'json' gets pretty printed via JSON.pretty_generate;
    ##               everything else ('csv', 'html', 'txt') gets saved as text
    ##               with "\r\n" newlines changed to "\n"
    def record( url, response,
                path: nil,
                encoding: 'UTF-8',
                format: 'html' )

      body_path = body_path_for( url, path: path )
      meta_path = "#{body_path}.meta.txt"

      ## make sure path exists
      FileUtils.mkdir_p( File.dirname( body_path ) )

      puts "[cache] saving #{body_path}..."

      ## todo/check: verify content-type - why? why not?
      ## note - for now response.text always assume (converted) to utf8!!!!!!!!!
      ##
      ## fix: newlines - always use "unix" style" - why? why not?
      ## fix:  use :newline => :universal option? translates to universal "\n"
      ##
      ## note: build the body BEFORE opening the file - if the response
      ##   cannot get parsed / converted (and raises) no empty body file
      ##   is left behind (that cached? would report as cached)
      body = if format == 'json'
               JSON.pretty_generate( response.json )
             else   ## csv, html or txt
               response.text( encoding: encoding ).gsub( "\r\n", "\n" )
             end

      File.open( body_path, 'w:utf-8' ) {|f| f.write( body ) }

      File.open( meta_path, 'w:utf-8' ) do |f|
        ## todo/check:
        ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???
        response.headers.each do |key, value|  # iterate all response headers
          f.write( "#{key}: #{value}\n" )
        end
      end
    end



    ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
    ##    use file:// instead of disk:// - why? why not?
    def url_to_id( str ) "disk://#{url_to_path( str )}"; end


    ### helpers

    ## maps a url to a (relative) file path, that is, <host>/<request path>
    ##
    ##  str  - the url (parsed with URI.parse)
    ##  path - optional "custom" (file)path used instead of the url's
    ##           request path (the host-specific rules below still apply)
    ##
    ##  note: for weltfussball.de / worldfootball.net a request path
    ##         NOT ending with '/' prints an error and exits (status 1)
    def url_to_path( str, path: nil )
      ## map url to file path
      uri = URI.parse( str )

      ## note: ignore scheme (e.g. http/https)
      ##         and port (e.g. 80, 8080, etc.) for now
      ##    always downcase for now (internet domain is case insensitive)
      host_dir = uri.host.downcase

      ## use "custom" (file)path for cache storage if passed in
      ##   otherwise use "/this/is/everything?query=params"
      ##     with the leading slash cut-off
      req_path = path || uri.request_uri[1..-1]


      ### special "prettify" rule for weltfussball
      ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
      if host_dir.include?( 'weltfussball.de' ) ||
         host_dir.include?( 'worldfootball.net' )
        if req_path.end_with?( '/' )
          req_path = "#{req_path[0..-2]}.html"
        else
          puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
          exit 1
        end
      elsif host_dir.include?( 'tipp3.at' )
        req_path = req_path.sub( '.jsp', '' )  # shorten - cut off .jsp extension

        ##   change ? to -I-
        ##   change = to ~
        ##   Example:
        ##   sportwetten/classicresults.jsp?oddsetProgramID=888
        ##     =>
        ##   sportwetten/classicresults-I-oddsetProgramID~888
        req_path = req_path.gsub( '?', '-I-' )
                           .gsub( '=', '~' )

        req_path = "#{req_path}.html"
      elsif host_dir.include?( 'fbref.com' )
        ## note: anchor to the start - only cut off a LEADING en/
        ##   (an unanchored sub also mangles e.g. de/mannschaften/...)
        req_path = req_path.sub( %r{\Aen/}, '' )    # shorten - cut off en/
        req_path = "#{req_path}.html"               # auto-add html extension
      elsif host_dir.include?( 'football-data.co.uk' )
        ## note: anchor to the start - only cut off LEADING dirs
        req_path = req_path.sub( %r{\Ammz4281/}, '' )  # shorten - cut off mmz4281/
        req_path = req_path.sub( %r{\Anew/}, '' )      # shorten - cut off new/
      elsif host_dir.include?( 'football-data.org' )
        ## req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/

        ## flatten - make a file path - for auto-save
        ##   change ? to -I-
        ##   change / to ~~
        ##   change = to ~
        req_path = req_path.gsub( '?', '-I-' )
                           .gsub( '/', '~~' )
                           .gsub( '=', '~' )

        req_path = "#{req_path}.json"
      elsif host_dir.include?( 'api.cryptokitties.co' )
        ## for now always auto-add .json extensions e.g.
        ##     kitties/1  => kitties/1.json
        ##     cattributes => cattributes.json
        req_path = "#{req_path}.json"
      else
        ## no special rule
      end

      "#{host_dir}/#{req_path}"
    end


    private

    ## full path of the body file for the url below the cache root
    def body_path_for( url, path: nil )
      "#{Webcache.root}/#{url_to_path( url, path: path )}"
    end
  end # class DiskCache


end # module Webcache
|
data/lib/webget/webget.rb
CHANGED
@@ -1,114 +1,124 @@
|
|
1
|
-
|
2
|
-
class Webget # a web (go get) crawler
|
3
|
-
|
4
|
-
class Configuration ## nested class
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def sleep()
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
##
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
response =
|
49
|
-
|
50
|
-
if response.status.ok? ## must be HTTP 200
|
51
|
-
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response,
|
53
|
-
encoding: encoding ) ## assumes format: html (default)
|
54
|
-
else
|
55
|
-
## todo/check - log error
|
56
|
-
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
57
|
-
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
58
|
-
end
|
59
|
-
|
60
|
-
## to be done / continued
|
61
|
-
response
|
62
|
-
end # method self.page
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
response
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
##
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
1
|
+
|
2
|
+
##
# Webget - a web (go get) crawler
#
#  Every fetch helper (call, page, text, dataset) gets the url
#  via Webclient.get and - if the response is HTTP 200 OK - records
#  the response in the Webcache; the response always gets returned.
class Webget

  class Configuration  ## nested class
     #######################
     ## accessors
     ##   sleep - secs to wait before every request (but the first);
     ##             defaults to 3
     def sleep()       @sleep || 3; end     ### todo/check: use delay / wait or such?
     def sleep=(value) @sleep = value; end
     ## add delay, delay_in_s alias - why? why not?
     alias_method :delay,       :sleep
     alias_method :delay_in_s,  :sleep
     alias_method :delay=,      :sleep=
     alias_method :delay_in_s=, :sleep=
  end # (nested) class Configuration

  ## lets you use
  ##   Webget.configure do |config|
  ##      config.sleep = 10
  ##   end
  def self.configure() yield( config ); end
  def self.config()    @config ||= Configuration.new;  end



  ## note - assumes json format
  ##          encoding always utf-8 by definition! - double check?)
  ##
  ##  returns the response (recorded with format json if ok)
  def self.call( url, headers: {} )
    ## note: use format json for pretty printing and parse check!!!!
    _record( url, _get( url, headers: headers ),
                  format: 'json' )
  end  # method self.call

  ## todo/check: rename encoding to html/http-like charset - why? why not?
  ##   check encoding UTF-8 or utf-8 - makes a difference?
  ##
  ##  returns the response (recorded with the default format, that is, html if ok)
  def self.page( url, encoding: 'UTF-8', headers: {} )   ## assumes html format
    _record( url, _get( url, headers: headers ),
                  encoding: encoding )   ## assumes format: html (default)
  end  # method self.page


  ## assumes txt format
  ##
  ##  returns the response (recorded with format txt if ok)
  def self.text( url, path: nil, headers: {} )
    ## note: like json assumes always utf-8 encoding for now !!!
    _record( url, _get( url, headers: headers ),
                  path: path,   ## optional "custom" (file)path for saving in cache
                  format: 'txt' )
  end  # method self.text



  ## todo/check: rename to csv or file or records or - why? why not?
  ## todo/check: rename encoding to html/http-like charset - why? why not?
  ##
  ##  returns the response (recorded with format csv if ok)
  def self.dataset( url, encoding: 'UTF-8', headers: {} )  ## assumes csv format
    _record( url, _get( url, headers: headers ),
                  encoding: encoding,
                  format: 'csv' )   ## pass along csv format - why? why not?
  end  # method self.dataset



  ####
  # private helpers
  #   make private - why? why not?

  ## gets the url via Webclient.get; sleeps config.sleep secs
  ##   before every request but the very first one
  def self._get( url, headers: {} )
    @@requests ||= 0    ## track number of requests

    if @@requests > 0  ## note - do NOT sleep on very first request!!!
      puts "sleep #{config.sleep} sec(s)..."
      sleep( config.sleep )   ## slow down - sleep x secs before each http request
    end

    @@requests += 1
    Webclient.get( url, headers: headers )  ## returns response
  end

  ## shared response handling for call, page, text and dataset:
  ##   prints the status and records the response in the Webcache
  ##   if ok; otherwise prints an error and dumps the raw response
  ##
  ##  cache_options - keywords passed along as-is to Webcache.record
  ##                    (e.g. path, encoding, format)
  ##
  ##  returns the response (always)
  def self._record( url, response, **cache_options )
    if response.status.ok?   ## must be HTTP 200
      puts "#{response.status.code} #{response.status.message}"
      Webcache.record( url, response, **cache_options )
    else
      ## todo/check - log error
      puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
      pp response.raw   ## note: dump inner (raw) response (NOT the wrapped)
    end

    ## to be done / continued
    response
  end
end  # class Webget
|
124
|
+
|
data/lib/webget.rb
CHANGED
@@ -1,25 +1,29 @@
|
|
1
|
-
require 'webclient'
|
2
|
-
|
3
|
-
## more (our own) 3rd party libs
|
4
|
-
require 'csvreader'
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
require '
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
##
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
1
|
+
require 'webclient'
|
2
|
+
|
3
|
+
## more (our own) 3rd party libs
|
4
|
+
require 'csvreader'
|
5
|
+
|
6
|
+
|
7
|
+
# NEW!! - require/add cocos
|
8
|
+
require 'cocos'    # - note - cocos incl. webclient & csvreader !!!!
|
9
|
+
|
10
|
+
|
11
|
+
## our own code
|
12
|
+
require_relative 'webget/version' # let version go first
|
13
|
+
require_relative 'webget/webcache'
|
14
|
+
require_relative 'webget/webget'
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
############
|
20
|
+
## add convenience alias for camel case / alternate different spelling
|
21
|
+
WebCache = Webcache
|
22
|
+
WebGet = Webget
|
23
|
+
|
24
|
+
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
25
|
+
WebGo = Webget
|
26
|
+
Webgo = Webget
|
27
|
+
|
28
|
+
|
29
|
+
puts Webget.banner # say hello
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 1.2.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: cocos
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rdoc
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -64,16 +78,16 @@ dependencies:
|
|
64
78
|
requirements:
|
65
79
|
- - "~>"
|
66
80
|
- !ruby/object:Gem::Version
|
67
|
-
version: '
|
81
|
+
version: '4.1'
|
68
82
|
type: :development
|
69
83
|
prerelease: false
|
70
84
|
version_requirements: !ruby/object:Gem::Requirement
|
71
85
|
requirements:
|
72
86
|
- - "~>"
|
73
87
|
- !ruby/object:Gem::Version
|
74
|
-
version: '
|
88
|
+
version: '4.1'
|
75
89
|
description: webget gem - a web (go get) crawler incl. web cache
|
76
|
-
email:
|
90
|
+
email: gerald.bauer@gmail.com
|
77
91
|
executables: []
|
78
92
|
extensions: []
|
79
93
|
extra_rdoc_files:
|
@@ -110,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
124
|
- !ruby/object:Gem::Version
|
111
125
|
version: '0'
|
112
126
|
requirements: []
|
113
|
-
rubygems_version: 3.
|
127
|
+
rubygems_version: 3.4.10
|
114
128
|
signing_key:
|
115
129
|
specification_version: 4
|
116
130
|
summary: webget gem - a web (go get) crawler incl. web cache
|