webget 0.1.1 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Manifest.txt +0 -1
- data/README.md +4 -3
- data/Rakefile +6 -3
- data/lib/webget.rb +3 -13
- data/lib/webget/version.rb +2 -2
- data/lib/webget/webcache.rb +62 -13
- data/lib/webget/webget.rb +54 -4
- metadata +37 -11
- data/lib/webget/webclient.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
|
4
|
+
data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
|
7
|
+
data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# webget
|
2
2
|
|
3
|
-
webget gem -
|
3
|
+
webget gem - a web (go get) crawler incl. web cache
|
4
4
|
|
5
|
-
* home :: [github.com/rubycoco/
|
6
|
-
* bugs :: [github.com/rubycoco/
|
5
|
+
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
6
|
+
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
7
7
|
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
8
|
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
|
+
* forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
|
9
10
|
|
10
11
|
|
11
12
|
## Usage
|
data/Rakefile
CHANGED
@@ -5,10 +5,10 @@ Hoe.spec 'webget' do
|
|
5
5
|
|
6
6
|
self.version = Webget::VERSION
|
7
7
|
|
8
|
-
self.summary = 'webget gem -
|
8
|
+
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
9
9
|
self.description = summary
|
10
10
|
|
11
|
-
self.urls = { home: 'https://github.com/rubycoco/
|
11
|
+
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
14
|
self.email = 'ruby-talk@ruby-lang.org'
|
@@ -17,7 +17,10 @@ Hoe.spec 'webget' do
|
|
17
17
|
self.readme_file = 'README.md'
|
18
18
|
self.history_file = 'CHANGELOG.md'
|
19
19
|
|
20
|
-
self.extra_deps = [
|
20
|
+
self.extra_deps = [
|
21
|
+
['webclient', '>= 0.2.0'],
|
22
|
+
['csvreader', '>= 1.2.4'],
|
23
|
+
]
|
21
24
|
|
22
25
|
self.licenses = ['Public Domain']
|
23
26
|
|
data/lib/webget.rb
CHANGED
@@ -1,20 +1,11 @@
|
|
1
|
-
require '
|
2
|
-
require 'time'
|
3
|
-
require 'date'
|
4
|
-
require 'fileutils'
|
5
|
-
|
6
|
-
require 'uri'
|
7
|
-
require 'net/http'
|
8
|
-
require 'net/https'
|
9
|
-
|
10
|
-
require 'json'
|
11
|
-
require 'yaml'
|
1
|
+
require 'webclient'
|
12
2
|
|
3
|
+
## more (our own) 3rd party libs
|
4
|
+
require 'csvreader'
|
13
5
|
|
14
6
|
|
15
7
|
## our own code
|
16
8
|
require 'webget/version' # let version go first
|
17
|
-
require 'webget/webclient'
|
18
9
|
require 'webget/webcache'
|
19
10
|
require 'webget/webget'
|
20
11
|
|
@@ -24,7 +15,6 @@ require 'webget/webget'
|
|
24
15
|
############
|
25
16
|
## add convenience alias for camel case / alternate different spelling
|
26
17
|
WebCache = Webcache
|
27
|
-
WebClient = Webclient
|
28
18
|
WebGet = Webget
|
29
19
|
|
30
20
|
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,16 +54,24 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response,
|
58
|
-
|
57
|
+
def self.record( url, response,
|
58
|
+
path: nil,
|
59
|
+
encoding: 'UTF-8',
|
60
|
+
format: 'html' )
|
61
|
+
cache.record( url, response,
|
62
|
+
path: path,
|
63
|
+
encoding: encoding,
|
64
|
+
format: format );
|
59
65
|
end
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
61
67
|
class << self
|
62
68
|
alias_method :exist?, :cached?
|
63
69
|
end
|
64
|
-
def self.url_to_id( url )
|
70
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
65
71
|
def self.read( url ) cache.read( url ); end
|
66
72
|
def self.read_json( url ) cache.read_json( url ); end
|
73
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
74
|
+
|
67
75
|
|
68
76
|
|
69
77
|
class DiskCache
|
@@ -86,13 +94,22 @@ class DiskCache
|
|
86
94
|
data
|
87
95
|
end
|
88
96
|
|
97
|
+
def read_csv( url )
|
98
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
99
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
100
|
+
data = CsvHash.parse( txt )
|
101
|
+
data
|
102
|
+
end
|
89
103
|
|
90
104
|
|
91
105
|
## add more save / put / etc. aliases - why? why not?
|
92
106
|
## rename to record_html - why? why not?
|
93
|
-
def record( url, response,
|
107
|
+
def record( url, response,
|
108
|
+
path: nil,
|
109
|
+
encoding: 'UTF-8',
|
110
|
+
format: 'html' )
|
94
111
|
|
95
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
96
113
|
meta_path = "#{body_path}.meta.txt"
|
97
114
|
|
98
115
|
## make sure path exists
|
@@ -102,14 +119,23 @@ class DiskCache
|
|
102
119
|
puts "[cache] saving #{body_path}..."
|
103
120
|
|
104
121
|
## todo/check: verify content-type - why? why not?
|
122
|
+
## note - for now response.text is always assumed (converted) to utf8!!!!!!!!!
|
105
123
|
if format == 'json'
|
106
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
107
|
-
|
108
|
-
##
|
109
|
-
|
125
|
+
elsif format == 'csv'
|
126
|
+
## fix: newlines - always use "unix" style - why? why not?
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
128
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
129
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
130
|
+
else ## html or txt
|
131
|
+
text = response.text( encoding: encoding )
|
132
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
110
133
|
end
|
111
134
|
|
135
|
+
|
112
136
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
137
|
+
## todo/check:
|
138
|
+
## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
|
113
139
|
response.headers.each do |key, value| # iterate all response headers
|
114
140
|
f.write( "#{key}: #{value}" )
|
115
141
|
f.write( "\n" )
|
@@ -125,7 +151,7 @@ class DiskCache
|
|
125
151
|
|
126
152
|
|
127
153
|
### helpers
|
128
|
-
def url_to_path( str )
|
154
|
+
def url_to_path( str, path: nil )
|
129
155
|
## map url to file path
|
130
156
|
uri = URI.parse( str )
|
131
157
|
|
@@ -134,10 +160,14 @@ class DiskCache
|
|
134
160
|
## always downcase for now (internet domain is case insensitive)
|
135
161
|
host_dir = uri.host.downcase
|
136
162
|
|
137
|
-
## "
|
138
|
-
|
139
|
-
|
140
|
-
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
164
|
+
path
|
165
|
+
else
|
166
|
+
## "/this/is/everything?query=params"
|
167
|
+
## cut-off leading slash and
|
168
|
+
## convert query ? =
|
169
|
+
uri.request_uri[1..-1]
|
170
|
+
end
|
141
171
|
|
142
172
|
|
143
173
|
|
@@ -151,6 +181,25 @@ class DiskCache
|
|
151
181
|
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
152
182
|
exit 1
|
153
183
|
end
|
184
|
+
elsif host_dir.index( 'tipp3.at' )
|
185
|
+
req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
|
186
|
+
|
187
|
+
## change ? to -I-
|
188
|
+
## change = to ~
|
189
|
+
## Example:
|
190
|
+
## sportwetten/classicresults.jsp?oddsetProgramID=888
|
191
|
+
## =>
|
192
|
+
## sportwetten/classicresults-I-oddsetProgramID~888
|
193
|
+
req_path = req_path.gsub( '?', '-I-' )
|
194
|
+
.gsub( '=', '~')
|
195
|
+
|
196
|
+
req_path = "#{req_path}.html"
|
197
|
+
elsif host_dir.index( 'fbref.com' )
|
198
|
+
req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
|
199
|
+
req_path = "#{req_path}.html" # auto-add html extension
|
200
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
201
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
202
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
154
203
|
elsif host_dir.index( 'football-data.org' )
|
155
204
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
156
205
|
|
data/lib/webget/webget.rb
CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
19
19
|
|
20
20
|
|
21
21
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
25
25
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
40
40
|
response
|
41
41
|
end # method self.call
|
42
42
|
|
43
|
-
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
47
47
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
49
49
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response
|
52
|
+
Webcache.record( url, response,
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
53
54
|
else
|
54
55
|
## todo/check - log error
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
|
|
60
61
|
response
|
61
62
|
end # method self.page
|
62
63
|
|
64
|
+
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
68
|
+
|
69
|
+
response = Webclient.get( url, headers: headers )
|
70
|
+
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
74
|
+
Webcache.record( url, response,
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
76
|
+
format: 'txt' )
|
77
|
+
else
|
78
|
+
## todo/check - log error
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
81
|
+
end
|
82
|
+
|
83
|
+
## to be done / continued
|
84
|
+
response
|
85
|
+
end # method self.text
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
90
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
91
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
92
|
+
puts " sleep #{config.sleep} sec(s)..."
|
93
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
94
|
+
|
95
|
+
response = Webclient.get( url, headers: headers )
|
96
|
+
|
97
|
+
if response.status.ok? ## must be HTTP 200
|
98
|
+
puts "#{response.status.code} #{response.status.message}"
|
99
|
+
Webcache.record( url, response,
|
100
|
+
encoding: encoding,
|
101
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
102
|
+
else
|
103
|
+
## todo/check - log error
|
104
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
105
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
106
|
+
end
|
107
|
+
|
108
|
+
## to be done / continued
|
109
|
+
response
|
110
|
+
end # method self.dataset
|
111
|
+
|
112
|
+
|
63
113
|
end # class Webget
|
64
114
|
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: webclient
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csvreader
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.4
|
13
41
|
- !ruby/object:Gem::Dependency
|
14
42
|
name: rdoc
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,7 +72,7 @@ dependencies:
|
|
44
72
|
- - "~>"
|
45
73
|
- !ruby/object:Gem::Version
|
46
74
|
version: '3.22'
|
47
|
-
description: webget gem -
|
75
|
+
description: webget gem - a web (go get) crawler incl. web cache
|
48
76
|
email: ruby-talk@ruby-lang.org
|
49
77
|
executables: []
|
50
78
|
extensions: []
|
@@ -60,13 +88,12 @@ files:
|
|
60
88
|
- lib/webget.rb
|
61
89
|
- lib/webget/version.rb
|
62
90
|
- lib/webget/webcache.rb
|
63
|
-
- lib/webget/webclient.rb
|
64
91
|
- lib/webget/webget.rb
|
65
|
-
homepage: https://github.com/rubycoco/
|
92
|
+
homepage: https://github.com/rubycoco/webclient
|
66
93
|
licenses:
|
67
94
|
- Public Domain
|
68
95
|
metadata: {}
|
69
|
-
post_install_message:
|
96
|
+
post_install_message:
|
70
97
|
rdoc_options:
|
71
98
|
- "--main"
|
72
99
|
- README.md
|
@@ -83,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
110
|
- !ruby/object:Gem::Version
|
84
111
|
version: '0'
|
85
112
|
requirements: []
|
86
|
-
|
87
|
-
|
88
|
-
signing_key:
|
113
|
+
rubygems_version: 3.1.4
|
114
|
+
signing_key:
|
89
115
|
specification_version: 4
|
90
|
-
summary: webget gem -
|
116
|
+
summary: webget gem - a web (go get) crawler incl. web cache
|
91
117
|
test_files: []
|
data/lib/webget/webclient.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
|
2
|
-
class Webclient
|
3
|
-
|
4
|
-
class Response # nested class - wrap Net::HTTP::Response
|
5
|
-
def initialize( response )
|
6
|
-
@response = response
|
7
|
-
end
|
8
|
-
def raw() @response; end
|
9
|
-
|
10
|
-
|
11
|
-
def text
|
12
|
-
# note: Net::HTTP will NOT set encoding UTF-8 etc.
|
13
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
14
|
-
# thus, set/force encoding to utf-8
|
15
|
-
text = @response.body.to_s
|
16
|
-
text = text.force_encoding( Encoding::UTF_8 )
|
17
|
-
text
|
18
|
-
end
|
19
|
-
|
20
|
-
## convenience helper; returns parsed json data
|
21
|
-
def json() JSON.parse( text ); end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
class Headers # nested (nested) class
|
26
|
-
def initialize( response )
|
27
|
-
@response = response
|
28
|
-
end
|
29
|
-
def each( &blk )
|
30
|
-
@response.each_header do |key, value| # Iterate all response headers
|
31
|
-
blk.call( key, value )
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
def headers() @headers ||= Headers.new( @response ); end
|
36
|
-
|
37
|
-
class Status # nested (nested) class
|
38
|
-
def initialize( response )
|
39
|
-
@response = response
|
40
|
-
end
|
41
|
-
def code() @response.code.to_i; end
|
42
|
-
def ok?() code == 200; end
|
43
|
-
def nok?() code != 200; end
|
44
|
-
def message() @response.message; end
|
45
|
-
end
|
46
|
-
def status() @status ||= Status.new( @response ); end
|
47
|
-
end # (nested) class Response
|
48
|
-
|
49
|
-
|
50
|
-
def self.get( url, headers: {} )
|
51
|
-
|
52
|
-
uri = URI.parse( url )
|
53
|
-
http = Net::HTTP.new( uri.host, uri.port )
|
54
|
-
|
55
|
-
if uri.instance_of? URI::HTTPS
|
56
|
-
http.use_ssl = true
|
57
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
58
|
-
end
|
59
|
-
|
60
|
-
request = Net::HTTP::Get.new( uri.request_uri )
|
61
|
-
|
62
|
-
### add (custom) headers if any
|
63
|
-
## check/todo: is there a more idiomatic way for Net::HTTP ???
|
64
|
-
## use
|
65
|
-
## request = Net::HTTP::Get.new( uri.request_uri, headers )
|
66
|
-
## why? why not?
|
67
|
-
## instead of e.g.
|
68
|
-
## request['X-Auth-Token'] = 'xxxxxxx'
|
69
|
-
## request['User-Agent'] = 'ruby'
|
70
|
-
## request['Accept'] = '*/*'
|
71
|
-
if headers && headers.size > 0
|
72
|
-
headers.each do |key,value|
|
73
|
-
request[ key ] = value
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
response = http.request( request )
|
79
|
-
|
80
|
-
## note: return "unified" wrapped response
|
81
|
-
Response.new( response )
|
82
|
-
end # method self.get
|
83
|
-
|
84
|
-
end # class Webclient
|
85
|
-
|