webget 0.2.5 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -4
- data/README.md +19 -20
- data/Rakefile +32 -31
- data/lib/webget/version.rb +21 -23
- data/lib/webget/webcache.rb +234 -230
- data/lib/webget/webget.rb +124 -114
- data/lib/webget.rb +29 -25
- metadata +20 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: de0060acabe176d1a11a4f2e36e8bee5090083c4907b75d9b22e9cbb9c7b22e2
|
|
4
|
+
data.tar.gz: fb64c2b1294932b00fa401af006cd27b263b255789bad75e4ef62bec5598fd74
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d1f1653c68729e7d609c3c848e0146de9668cd2932ddab2e794ab361d3f786603a10c7586bcf0029de4955255803987000b2fb35b0d1e14cfa35d0582f919be7
|
|
7
|
+
data.tar.gz: c6cff08b2f683bb5b8e39607735250f954115244be3e19dfcdeb29fd2d0a0f0a27be1fed5a2add9a1b75a9cbe81dbafea1e80f06b0a1bdd93d4bd3bd12d5a9a1
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -1,20 +1,19 @@
|
|
|
1
|
-
# webget
|
|
2
|
-
|
|
3
|
-
webget gem - a web (go get) crawler incl. web cache
|
|
4
|
-
|
|
5
|
-
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
|
6
|
-
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
|
7
|
-
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
|
8
|
-
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
Use it as you please with no restrictions whatsoever.
|
|
1
|
+
# webget
|
|
2
|
+
|
|
3
|
+
webget gem - a web (go get) crawler incl. web cache
|
|
4
|
+
|
|
5
|
+
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
|
6
|
+
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
|
7
|
+
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
|
8
|
+
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
TBD
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## License
|
|
17
|
+
|
|
18
|
+
The `webget` scripts are dedicated to the public domain.
|
|
19
|
+
Use it as you please with no restrictions whatsoever.
|
data/Rakefile
CHANGED
|
@@ -1,31 +1,32 @@
|
|
|
1
|
-
require 'hoe'
|
|
2
|
-
require './lib/webget/version.rb'
|
|
3
|
-
|
|
4
|
-
Hoe.spec 'webget' do
|
|
5
|
-
|
|
6
|
-
self.version = Webget::VERSION
|
|
7
|
-
|
|
8
|
-
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
|
9
|
-
self.description = summary
|
|
10
|
-
|
|
11
|
-
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
|
12
|
-
|
|
13
|
-
self.author = 'Gerald Bauer'
|
|
14
|
-
self.email = '
|
|
15
|
-
|
|
16
|
-
# switch extension to .markdown for github formatting
|
|
17
|
-
self.readme_file = 'README.md'
|
|
18
|
-
self.history_file = 'CHANGELOG.md'
|
|
19
|
-
|
|
20
|
-
self.extra_deps = [
|
|
21
|
-
['webclient', '>= 0.2.0'],
|
|
22
|
-
['csvreader', '>= 1.2.4'],
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
1
|
+
require 'hoe'
|
|
2
|
+
require './lib/webget/version.rb'
|
|
3
|
+
|
|
4
|
+
Hoe.spec 'webget' do
|
|
5
|
+
|
|
6
|
+
self.version = Webget::VERSION
|
|
7
|
+
|
|
8
|
+
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
|
9
|
+
self.description = summary
|
|
10
|
+
|
|
11
|
+
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
|
12
|
+
|
|
13
|
+
self.author = 'Gerald Bauer'
|
|
14
|
+
self.email = 'gerald.bauer@gmail.com'
|
|
15
|
+
|
|
16
|
+
# switch extension to .markdown for github formatting
|
|
17
|
+
self.readme_file = 'README.md'
|
|
18
|
+
self.history_file = 'CHANGELOG.md'
|
|
19
|
+
|
|
20
|
+
self.extra_deps = [
|
|
21
|
+
['webclient', '>= 0.2.0'],
|
|
22
|
+
['csvreader', '>= 1.2.4'],
|
|
23
|
+
['cocos'],
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
self.licenses = ['Public Domain']
|
|
27
|
+
|
|
28
|
+
self.spec_extras = {
|
|
29
|
+
required_ruby_version: '>= 2.2.2'
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
end
|
data/lib/webget/version.rb
CHANGED
|
@@ -1,23 +1,21 @@
|
|
|
1
|
-
|
|
2
|
-
class Webget
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
end # class Webget
|
|
23
|
-
|
|
1
|
+
|
|
2
|
+
class Webget
|
|
3
|
+
MAJOR = 0 ## todo: namespace inside version or something - why? why not??
|
|
4
|
+
MINOR = 3
|
|
5
|
+
PATCH = 1
|
|
6
|
+
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
|
7
|
+
|
|
8
|
+
def self.version
|
|
9
|
+
VERSION
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
# version string for generator meta tag (includes ruby version)
|
|
13
|
+
def self.banner
|
|
14
|
+
"webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def self.root
|
|
18
|
+
File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
|
|
19
|
+
end
|
|
20
|
+
end # class Webget
|
|
21
|
+
|
data/lib/webget/webcache.rb
CHANGED
|
@@ -1,230 +1,234 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
module Webcache
|
|
4
|
-
|
|
5
|
-
#####
|
|
6
|
-
# copied from props gem, see Env.home
|
|
7
|
-
# - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
|
|
8
|
-
# todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
|
|
9
|
-
def self.home
|
|
10
|
-
path = if( ENV['HOME'] || ENV['USERPROFILE'] )
|
|
11
|
-
ENV['HOME'] || ENV['USERPROFILE']
|
|
12
|
-
elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
|
|
13
|
-
"#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
|
|
14
|
-
else
|
|
15
|
-
begin
|
|
16
|
-
File.expand_path('~')
|
|
17
|
-
rescue
|
|
18
|
-
if File::ALT_SEPARATOR
|
|
19
|
-
'C:/'
|
|
20
|
-
else
|
|
21
|
-
'/'
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
## note: use File.expand_path to "unify" path e.g
|
|
27
|
-
## C:\Users\roman becomes
|
|
28
|
-
## C:/Users/roman
|
|
29
|
-
|
|
30
|
-
File.expand_path( path )
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class Configuration
|
|
35
|
-
## root directory - todo/check: find/use a better name - why? why not?
|
|
36
|
-
def root() @root || "#{Webcache.home}/.cache"; end
|
|
37
|
-
def root=(value) @root = value; end
|
|
38
|
-
end # class Configuration
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
## lets you use
|
|
42
|
-
## Webcache.configure do |config|
|
|
43
|
-
## config.root = './cache'
|
|
44
|
-
## end
|
|
45
|
-
def self.configure() yield( config ); end
|
|
46
|
-
def self.config() @config ||= Configuration.new; end
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
## add "high level" root convenience helpers
|
|
50
|
-
|
|
51
|
-
def self.root
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def self.
|
|
72
|
-
def self.
|
|
73
|
-
def self.
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
data
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
data
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
##
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
##
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
##
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
##
|
|
192
|
-
##
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
req_path = req_path.sub( '
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
##
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
req_path =
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
##
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
end
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
module Webcache
|
|
4
|
+
|
|
5
|
+
#####
|
|
6
|
+
# copied from props gem, see Env.home
|
|
7
|
+
# - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
|
|
8
|
+
# todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
|
|
9
|
+
def self.home
|
|
10
|
+
path = if( ENV['HOME'] || ENV['USERPROFILE'] )
|
|
11
|
+
ENV['HOME'] || ENV['USERPROFILE']
|
|
12
|
+
elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
|
|
13
|
+
"#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
|
|
14
|
+
else
|
|
15
|
+
begin
|
|
16
|
+
File.expand_path('~')
|
|
17
|
+
rescue
|
|
18
|
+
if File::ALT_SEPARATOR
|
|
19
|
+
'C:/'
|
|
20
|
+
else
|
|
21
|
+
'/'
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
## note: use File.expand_path to "unify" path e.g
|
|
27
|
+
## C:\Users\roman becomes
|
|
28
|
+
## C:/Users/roman
|
|
29
|
+
|
|
30
|
+
File.expand_path( path )
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Configuration
|
|
35
|
+
## root directory - todo/check: find/use a better name - why? why not?
|
|
36
|
+
def root() @root || "#{Webcache.home}/.cache"; end
|
|
37
|
+
def root=(value) @root = value; end
|
|
38
|
+
end # class Configuration
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
## lets you use
|
|
42
|
+
## Webcache.configure do |config|
|
|
43
|
+
## config.root = './cache'
|
|
44
|
+
## end
|
|
45
|
+
def self.configure() yield( config ); end
|
|
46
|
+
def self.config() @config ||= Configuration.new; end
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
## add "high level" root convenience helpers
|
|
50
|
+
## use delegate helper - why? why not?
|
|
51
|
+
def self.root() config.root; end
|
|
52
|
+
def self.root=(value) config.root = value; end
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
|
56
|
+
def self.cache() @cache ||= DiskCache.new; end
|
|
57
|
+
|
|
58
|
+
def self.record( url, response,
|
|
59
|
+
path: nil,
|
|
60
|
+
encoding: 'UTF-8',
|
|
61
|
+
format: 'html' )
|
|
62
|
+
cache.record( url, response,
|
|
63
|
+
path: path,
|
|
64
|
+
encoding: encoding,
|
|
65
|
+
format: format );
|
|
66
|
+
end
|
|
67
|
+
def self.cached?( url ) cache.cached?( url ); end
|
|
68
|
+
class << self
|
|
69
|
+
alias_method :exist?, :cached?
|
|
70
|
+
end
|
|
71
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
|
72
|
+
def self.read( url ) cache.read( url ); end
|
|
73
|
+
def self.read_json( url ) cache.read_json( url ); end
|
|
74
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DiskCache
|
|
79
|
+
def cached?( url )
|
|
80
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
81
|
+
File.exist?( body_path )
|
|
82
|
+
end
|
|
83
|
+
alias_method :exist?, :cached?
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def read( url )
|
|
87
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
88
|
+
File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def read_json( url )
|
|
92
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
93
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
|
94
|
+
data = JSON.parse( txt )
|
|
95
|
+
data
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def read_csv( url )
|
|
99
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
|
100
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
|
101
|
+
data = CsvHash.parse( txt )
|
|
102
|
+
data
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
## add more save / put / etc. aliases - why? why not?
|
|
107
|
+
## rename to record_html - why? why not?
|
|
108
|
+
def record( url, response,
|
|
109
|
+
path: nil,
|
|
110
|
+
encoding: 'UTF-8',
|
|
111
|
+
format: 'html' )
|
|
112
|
+
|
|
113
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
|
114
|
+
meta_path = "#{body_path}.meta.txt"
|
|
115
|
+
|
|
116
|
+
## make sure path exists
|
|
117
|
+
FileUtils.mkdir_p( File.dirname( body_path ) )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
puts "[cache] saving #{body_path}..."
|
|
121
|
+
|
|
122
|
+
## todo/check: verify content-type - why? why not?
|
|
123
|
+
## note - for now response.text is always assumed to be (converted to) utf-8!
|
|
124
|
+
##
|
|
125
|
+
## fix: newlines - always use "unix" style - why? why not?
|
|
126
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
|
127
|
+
if format == 'json'
|
|
128
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
|
129
|
+
elsif format == 'csv'
|
|
130
|
+
## fix: newlines - always use "unix" style - why? why not?
|
|
131
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
|
132
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
|
133
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
|
134
|
+
else ## html or txt
|
|
135
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
|
136
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
File.open( meta_path, 'w:utf-8' ) do |f|
|
|
141
|
+
## todo/check:
|
|
142
|
+
## do headers also need to converted (like text) if encoding is NOT utf-8 ???
|
|
143
|
+
response.headers.each do |key, value| # iterate all response headers
|
|
144
|
+
f.write( "#{key}: #{value}" )
|
|
145
|
+
f.write( "\n" )
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
|
|
153
|
+
## use file:// instead of disk:// - why? why not?
|
|
154
|
+
def url_to_id( str ) "disk://#{url_to_path( str )}"; end
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
### helpers
|
|
158
|
+
def url_to_path( str, path: nil )
|
|
159
|
+
## map url to file path
|
|
160
|
+
uri = URI.parse( str )
|
|
161
|
+
|
|
162
|
+
## note: ignore scheme (e.g. http/https)
|
|
163
|
+
## and port (e.g. 80, 8080, etc.) for now
|
|
164
|
+
## always downcase for now (internet domain is case insensitive)
|
|
165
|
+
host_dir = uri.host.downcase
|
|
166
|
+
|
|
167
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
|
168
|
+
path
|
|
169
|
+
else
|
|
170
|
+
## "/this/is/everything?query=params"
|
|
171
|
+
## cut-off leading slash and
|
|
172
|
+
## convert query ? =
|
|
173
|
+
uri.request_uri[1..-1]
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
### special "prettify" rule for weltfussball
|
|
179
|
+
## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
|
|
180
|
+
if host_dir.index( 'weltfussball.de' ) ||
|
|
181
|
+
host_dir.index( 'worldfootball.net' )
|
|
182
|
+
if req_path.end_with?( '/' )
|
|
183
|
+
req_path = "#{req_path[0..-2]}.html"
|
|
184
|
+
else
|
|
185
|
+
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
|
186
|
+
exit 1
|
|
187
|
+
end
|
|
188
|
+
elsif host_dir.index( 'tipp3.at' )
|
|
189
|
+
req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
|
|
190
|
+
|
|
191
|
+
## change ? to -I-
|
|
192
|
+
## change = to ~
|
|
193
|
+
## Example:
|
|
194
|
+
## sportwetten/classicresults.jsp?oddsetProgramID=888
|
|
195
|
+
## =>
|
|
196
|
+
## sportwetten/classicresults-I-oddsetProgramID~888
|
|
197
|
+
req_path = req_path.gsub( '?', '-I-' )
|
|
198
|
+
.gsub( '=', '~')
|
|
199
|
+
|
|
200
|
+
req_path = "#{req_path}.html"
|
|
201
|
+
elsif host_dir.index( 'fbref.com' )
|
|
202
|
+
req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
|
|
203
|
+
req_path = "#{req_path}.html" # auto-add html extension
|
|
204
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
|
205
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
|
206
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
|
207
|
+
elsif host_dir.index( 'football-data.org' )
|
|
208
|
+
## req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
|
209
|
+
|
|
210
|
+
## flatten - make a file path - for auto-save
|
|
211
|
+
## change ? to -I-
|
|
212
|
+
## change / to ~~
|
|
213
|
+
## change = to ~
|
|
214
|
+
req_path = req_path.gsub( '?', '-I-' )
|
|
215
|
+
.gsub( '/', '~~' )
|
|
216
|
+
.gsub( '=', '~')
|
|
217
|
+
|
|
218
|
+
req_path = "#{req_path}.json"
|
|
219
|
+
elsif host_dir.index( 'api.cryptokitties.co' )
|
|
220
|
+
## for now always auto-add .json extensions e.g.
|
|
221
|
+
## kitties/1 => kitties/1.json
|
|
222
|
+
## cattributes => cattributes.json
|
|
223
|
+
req_path = "#{req_path}.json"
|
|
224
|
+
else
|
|
225
|
+
## no special rule
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
page_path = "#{host_dir}/#{req_path}"
|
|
229
|
+
page_path
|
|
230
|
+
end
|
|
231
|
+
end # class DiskCache
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
end # module Webcache
|
data/lib/webget/webget.rb
CHANGED
|
@@ -1,114 +1,124 @@
|
|
|
1
|
-
|
|
2
|
-
class Webget # a web (go get) crawler
|
|
3
|
-
|
|
4
|
-
class Configuration ## nested class
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def sleep()
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
##
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
response =
|
|
49
|
-
|
|
50
|
-
if response.status.ok? ## must be HTTP 200
|
|
51
|
-
puts "#{response.status.code} #{response.status.message}"
|
|
52
|
-
Webcache.record( url, response,
|
|
53
|
-
encoding: encoding ) ## assumes format: html (default)
|
|
54
|
-
else
|
|
55
|
-
## todo/check - log error
|
|
56
|
-
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
|
57
|
-
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
## to be done / continued
|
|
61
|
-
response
|
|
62
|
-
end # method self.page
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
response
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
##
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
1
|
+
|
|
2
|
+
class Webget # a web (go get) crawler
|
|
3
|
+
|
|
4
|
+
class Configuration ## nested class
|
|
5
|
+
#######################
|
|
6
|
+
## accessors
|
|
7
|
+
def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
|
|
8
|
+
def sleep=(value) @sleep = value; end
|
|
9
|
+
## add delay, delay_in_s alias - why? why not?
|
|
10
|
+
alias_method :delay, :sleep
|
|
11
|
+
alias_method :delay_in_s, :sleep
|
|
12
|
+
alias_method :delay=, :sleep=
|
|
13
|
+
alias_method :delay_in_s=, :sleep=
|
|
14
|
+
end # (nested) class Configuration
|
|
15
|
+
|
|
16
|
+
## lets you use
|
|
17
|
+
## Webget.configure do |config|
|
|
18
|
+
## config.sleep = 10
|
|
19
|
+
## end
|
|
20
|
+
def self.configure() yield( config ); end
|
|
21
|
+
def self.config() @config ||= Configuration.new; end
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## note - assumes json format
|
|
26
|
+
## (encoding always utf-8 by definition! - double check?)
|
|
27
|
+
def self.call( url, headers: {} )
|
|
28
|
+
response = _get( url, headers: headers )
|
|
29
|
+
|
|
30
|
+
if response.status.ok? ## must be HTTP 200
|
|
31
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
32
|
+
## note: use format json for pretty printing and parse check!!!!
|
|
33
|
+
Webcache.record( url, response,
|
|
34
|
+
format: 'json' )
|
|
35
|
+
else
|
|
36
|
+
## todo/check - log error
|
|
37
|
+
puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
|
|
38
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
## to be done / continued
|
|
42
|
+
response
|
|
43
|
+
end # method self.call
|
|
44
|
+
|
|
45
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
|
46
|
+
## check encoding UTF-8 or utf-8 - makes a difference?
|
|
47
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
|
48
|
+
response = _get( url, headers: headers )
|
|
49
|
+
|
|
50
|
+
if response.status.ok? ## must be HTTP 200
|
|
51
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
52
|
+
Webcache.record( url, response,
|
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
|
54
|
+
else
|
|
55
|
+
## todo/check - log error
|
|
56
|
+
puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
|
|
57
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
## to be done / continued
|
|
61
|
+
response
|
|
62
|
+
end # method self.page
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## assumes txt format
|
|
66
|
+
def self.text( url, path: nil, headers: {} )
|
|
67
|
+
response = _get( url, headers: headers )
|
|
68
|
+
|
|
69
|
+
if response.status.ok? ## must be HTTP 200
|
|
70
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
71
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
|
72
|
+
Webcache.record( url, response,
|
|
73
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
|
74
|
+
format: 'txt' )
|
|
75
|
+
else
|
|
76
|
+
## todo/check - log error
|
|
77
|
+
puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
|
|
78
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
## to be done / continued
|
|
82
|
+
response
|
|
83
|
+
end # method self.text
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
|
88
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
|
89
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
|
90
|
+
response = _get( url, headers: headers )
|
|
91
|
+
|
|
92
|
+
if response.status.ok? ## must be HTTP 200
|
|
93
|
+
puts "#{response.status.code} #{response.status.message}"
|
|
94
|
+
Webcache.record( url, response,
|
|
95
|
+
encoding: encoding,
|
|
96
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
|
97
|
+
else
|
|
98
|
+
## todo/check - log error
|
|
99
|
+
puts "!! HTTP ERROR - #{response.status.code} #{response.status.message}:"
|
|
100
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
## to be done / continued
|
|
104
|
+
response
|
|
105
|
+
end # method self.dataset
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
####
|
|
110
|
+
## private helpers
|
|
111
|
+
## make private - why? why not?
|
|
112
|
+
def self._get( url, headers: {} )
|
|
113
|
+
@@requests ||= 0 ## track number of requests
|
|
114
|
+
|
|
115
|
+
if @@requests > 0 ## note - do NOT sleep on very first request!!!
|
|
116
|
+
puts " sleep #{config.sleep} sec(s)..."
|
|
117
|
+
sleep( config.sleep ) ## slow down - sleep x secs before each http request
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
@@requests += 1
|
|
121
|
+
Webclient.get( url, headers: headers ) ## returns response
|
|
122
|
+
end
|
|
123
|
+
end # class Webget
|
|
124
|
+
|
data/lib/webget.rb
CHANGED
|
@@ -1,25 +1,29 @@
|
|
|
1
|
-
require 'webclient'
|
|
2
|
-
|
|
3
|
-
## more (our own) 3rd party libs
|
|
4
|
-
require 'csvreader'
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
require '
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
##
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
1
|
+
require 'webclient'
|
|
2
|
+
|
|
3
|
+
## more (our own) 3rd party libs
|
|
4
|
+
require 'csvreader'
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# NEW!! - require/add cocos
|
|
8
|
+
require 'cocos' # - note - cocos incl. webclient & csvreader !!!!
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## our own code
|
|
12
|
+
require_relative 'webget/version' # let version go first
|
|
13
|
+
require_relative 'webget/webcache'
|
|
14
|
+
require_relative 'webget/webget'
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
############
|
|
20
|
+
## add convenience alias for camel case / alternate different spelling
|
|
21
|
+
WebCache = Webcache
|
|
22
|
+
WebGet = Webget
|
|
23
|
+
|
|
24
|
+
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
|
25
|
+
WebGo = Webget
|
|
26
|
+
Webgo = Webget
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
puts Webget.banner # say hello
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: webget
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gerald Bauer
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2024-07-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: webclient
|
|
@@ -38,6 +38,20 @@ dependencies:
|
|
|
38
38
|
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: 1.2.4
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: cocos
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
41
55
|
- !ruby/object:Gem::Dependency
|
|
42
56
|
name: rdoc
|
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -64,16 +78,16 @@ dependencies:
|
|
|
64
78
|
requirements:
|
|
65
79
|
- - "~>"
|
|
66
80
|
- !ruby/object:Gem::Version
|
|
67
|
-
version: '
|
|
81
|
+
version: '4.1'
|
|
68
82
|
type: :development
|
|
69
83
|
prerelease: false
|
|
70
84
|
version_requirements: !ruby/object:Gem::Requirement
|
|
71
85
|
requirements:
|
|
72
86
|
- - "~>"
|
|
73
87
|
- !ruby/object:Gem::Version
|
|
74
|
-
version: '
|
|
88
|
+
version: '4.1'
|
|
75
89
|
description: webget gem - a web (go get) crawler incl. web cache
|
|
76
|
-
email:
|
|
90
|
+
email: gerald.bauer@gmail.com
|
|
77
91
|
executables: []
|
|
78
92
|
extensions: []
|
|
79
93
|
extra_rdoc_files:
|
|
@@ -110,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
110
124
|
- !ruby/object:Gem::Version
|
|
111
125
|
version: '0'
|
|
112
126
|
requirements: []
|
|
113
|
-
rubygems_version: 3.
|
|
127
|
+
rubygems_version: 3.4.10
|
|
114
128
|
signing_key:
|
|
115
129
|
specification_version: 4
|
|
116
130
|
summary: webget gem - a web (go get) crawler incl. web cache
|