webget 0.1.1 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Manifest.txt +0 -1
- data/README.md +4 -3
- data/Rakefile +6 -3
- data/lib/webget.rb +3 -13
- data/lib/webget/version.rb +2 -2
- data/lib/webget/webcache.rb +62 -13
- data/lib/webget/webget.rb +54 -4
- metadata +37 -11
- data/lib/webget/webclient.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
|
4
|
+
data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
|
7
|
+
data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# webget
|
2
2
|
|
3
|
-
webget gem -
|
3
|
+
webget gem - a web (go get) crawler incl. web cache
|
4
4
|
|
5
|
-
* home :: [github.com/rubycoco/
|
6
|
-
* bugs :: [github.com/rubycoco/
|
5
|
+
* home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
|
6
|
+
* bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
|
7
7
|
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
8
|
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
|
+
* forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
|
9
10
|
|
10
11
|
|
11
12
|
## Usage
|
data/Rakefile
CHANGED
@@ -5,10 +5,10 @@ Hoe.spec 'webget' do
|
|
5
5
|
|
6
6
|
self.version = Webget::VERSION
|
7
7
|
|
8
|
-
self.summary = 'webget gem -
|
8
|
+
self.summary = 'webget gem - a web (go get) crawler incl. web cache'
|
9
9
|
self.description = summary
|
10
10
|
|
11
|
-
self.urls = { home: 'https://github.com/rubycoco/
|
11
|
+
self.urls = { home: 'https://github.com/rubycoco/webclient' }
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
14
|
self.email = 'ruby-talk@ruby-lang.org'
|
@@ -17,7 +17,10 @@ Hoe.spec 'webget' do
|
|
17
17
|
self.readme_file = 'README.md'
|
18
18
|
self.history_file = 'CHANGELOG.md'
|
19
19
|
|
20
|
-
self.extra_deps = [
|
20
|
+
self.extra_deps = [
|
21
|
+
['webclient', '>= 0.2.0'],
|
22
|
+
['csvreader', '>= 1.2.4'],
|
23
|
+
]
|
21
24
|
|
22
25
|
self.licenses = ['Public Domain']
|
23
26
|
|
data/lib/webget.rb
CHANGED
@@ -1,20 +1,11 @@
|
|
1
|
-
require '
|
2
|
-
require 'time'
|
3
|
-
require 'date'
|
4
|
-
require 'fileutils'
|
5
|
-
|
6
|
-
require 'uri'
|
7
|
-
require 'net/http'
|
8
|
-
require 'net/https'
|
9
|
-
|
10
|
-
require 'json'
|
11
|
-
require 'yaml'
|
1
|
+
require 'webclient'
|
12
2
|
|
3
|
+
## more (our own) 3rd party libs
|
4
|
+
require 'csvreader'
|
13
5
|
|
14
6
|
|
15
7
|
## our own code
|
16
8
|
require 'webget/version' # let version go first
|
17
|
-
require 'webget/webclient'
|
18
9
|
require 'webget/webcache'
|
19
10
|
require 'webget/webget'
|
20
11
|
|
@@ -24,7 +15,6 @@ require 'webget/webget'
|
|
24
15
|
############
|
25
16
|
## add convenience alias for camel case / alternate different spelling
|
26
17
|
WebCache = Webcache
|
27
|
-
WebClient = Webclient
|
28
18
|
WebGet = Webget
|
29
19
|
|
30
20
|
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,16 +54,24 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response,
|
58
|
-
|
57
|
+
def self.record( url, response,
|
58
|
+
path: nil,
|
59
|
+
encoding: 'UTF-8',
|
60
|
+
format: 'html' )
|
61
|
+
cache.record( url, response,
|
62
|
+
path: path,
|
63
|
+
encoding: encoding,
|
64
|
+
format: format );
|
59
65
|
end
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
61
67
|
class << self
|
62
68
|
alias_method :exist?, :cached?
|
63
69
|
end
|
64
|
-
def self.url_to_id( url )
|
70
|
+
def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
|
65
71
|
def self.read( url ) cache.read( url ); end
|
66
72
|
def self.read_json( url ) cache.read_json( url ); end
|
73
|
+
def self.read_csv( url ) cache.read_csv( url ); end
|
74
|
+
|
67
75
|
|
68
76
|
|
69
77
|
class DiskCache
|
@@ -86,13 +94,22 @@ class DiskCache
|
|
86
94
|
data
|
87
95
|
end
|
88
96
|
|
97
|
+
def read_csv( url )
|
98
|
+
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
99
|
+
txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
|
100
|
+
data = CsvHash.parse( txt )
|
101
|
+
data
|
102
|
+
end
|
89
103
|
|
90
104
|
|
91
105
|
## add more save / put / etc. aliases - why? why not?
|
92
106
|
## rename to record_html - why? why not?
|
93
|
-
def record( url, response,
|
107
|
+
def record( url, response,
|
108
|
+
path: nil,
|
109
|
+
encoding: 'UTF-8',
|
110
|
+
format: 'html' )
|
94
111
|
|
95
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
96
113
|
meta_path = "#{body_path}.meta.txt"
|
97
114
|
|
98
115
|
## make sure path exists
|
@@ -102,14 +119,23 @@ class DiskCache
|
|
102
119
|
puts "[cache] saving #{body_path}..."
|
103
120
|
|
104
121
|
## todo/check: verify content-type - why? why not?
|
122
|
+
## note - for now response.text is always assumed to be (converted to) utf-8!
|
105
123
|
if format == 'json'
|
106
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
107
|
-
|
108
|
-
##
|
109
|
-
|
125
|
+
elsif format == 'csv'
|
126
|
+
## fix: newlines - always use "unix" style - why? why not?
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
128
|
+
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
129
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
130
|
+
else ## html or txt
|
131
|
+
text = response.text( encoding: encoding )
|
132
|
+
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
110
133
|
end
|
111
134
|
|
135
|
+
|
112
136
|
File.open( meta_path, 'w:utf-8' ) do |f|
|
137
|
+
## todo/check:
|
138
|
+
## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
|
113
139
|
response.headers.each do |key, value| # iterate all response headers
|
114
140
|
f.write( "#{key}: #{value}" )
|
115
141
|
f.write( "\n" )
|
@@ -125,7 +151,7 @@ class DiskCache
|
|
125
151
|
|
126
152
|
|
127
153
|
### helpers
|
128
|
-
def url_to_path( str )
|
154
|
+
def url_to_path( str, path: nil )
|
129
155
|
## map url to file path
|
130
156
|
uri = URI.parse( str )
|
131
157
|
|
@@ -134,10 +160,14 @@ class DiskCache
|
|
134
160
|
## always downcase for now (internet domain is case insensitive)
|
135
161
|
host_dir = uri.host.downcase
|
136
162
|
|
137
|
-
## "
|
138
|
-
|
139
|
-
|
140
|
-
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
164
|
+
path
|
165
|
+
else
|
166
|
+
## "/this/is/everything?query=params"
|
167
|
+
## cut-off leading slash and
|
168
|
+
## convert query ? =
|
169
|
+
uri.request_uri[1..-1]
|
170
|
+
end
|
141
171
|
|
142
172
|
|
143
173
|
|
@@ -151,6 +181,25 @@ class DiskCache
|
|
151
181
|
puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
|
152
182
|
exit 1
|
153
183
|
end
|
184
|
+
elsif host_dir.index( 'tipp3.at' )
|
185
|
+
req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
|
186
|
+
|
187
|
+
## change ? to -I-
|
188
|
+
## change = to ~
|
189
|
+
## Example:
|
190
|
+
## sportwetten/classicresults.jsp?oddsetProgramID=888
|
191
|
+
## =>
|
192
|
+
## sportwetten/classicresults-I-oddsetProgramID~888
|
193
|
+
req_path = req_path.gsub( '?', '-I-' )
|
194
|
+
.gsub( '=', '~')
|
195
|
+
|
196
|
+
req_path = "#{req_path}.html"
|
197
|
+
elsif host_dir.index( 'fbref.com' )
|
198
|
+
req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
|
199
|
+
req_path = "#{req_path}.html" # auto-add html extension
|
200
|
+
elsif host_dir.index( 'football-data.co.uk' )
|
201
|
+
req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
|
202
|
+
req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
|
154
203
|
elsif host_dir.index( 'football-data.org' )
|
155
204
|
req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
|
156
205
|
|
data/lib/webget/webget.rb
CHANGED
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
|
|
19
19
|
|
20
20
|
|
21
21
|
|
22
|
-
def self.call( url, headers: {} ) ## assumes json format
|
22
|
+
def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
|
23
23
|
puts " sleep #{config.sleep} sec(s)..."
|
24
24
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
25
25
|
|
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
|
|
40
40
|
response
|
41
41
|
end # method self.call
|
42
42
|
|
43
|
-
|
44
|
-
def self.page( url, headers: {} ) ## assumes html format
|
43
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
44
|
+
def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
|
45
45
|
puts " sleep #{config.sleep} sec(s)..."
|
46
46
|
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
47
47
|
|
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
|
|
49
49
|
|
50
50
|
if response.status.ok? ## must be HTTP 200
|
51
51
|
puts "#{response.status.code} #{response.status.message}"
|
52
|
-
Webcache.record( url, response
|
52
|
+
Webcache.record( url, response,
|
53
|
+
encoding: encoding ) ## assumes format: html (default)
|
53
54
|
else
|
54
55
|
## todo/check - log error
|
55
56
|
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
|
|
60
61
|
response
|
61
62
|
end # method self.page
|
62
63
|
|
64
|
+
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
68
|
+
|
69
|
+
response = Webclient.get( url, headers: headers )
|
70
|
+
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
74
|
+
Webcache.record( url, response,
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
76
|
+
format: 'txt' )
|
77
|
+
else
|
78
|
+
## todo/check - log error
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
81
|
+
end
|
82
|
+
|
83
|
+
## to be done / continued
|
84
|
+
response
|
85
|
+
end # method self.text
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
## todo/check: rename to csv or file or records or - why? why not?
|
90
|
+
## todo/check: rename encoding to html/http-like charset - why? why not?
|
91
|
+
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
92
|
+
puts " sleep #{config.sleep} sec(s)..."
|
93
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
94
|
+
|
95
|
+
response = Webclient.get( url, headers: headers )
|
96
|
+
|
97
|
+
if response.status.ok? ## must be HTTP 200
|
98
|
+
puts "#{response.status.code} #{response.status.message}"
|
99
|
+
Webcache.record( url, response,
|
100
|
+
encoding: encoding,
|
101
|
+
format: 'csv' ) ## pass along csv format - why? why not?
|
102
|
+
else
|
103
|
+
## todo/check - log error
|
104
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
105
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
106
|
+
end
|
107
|
+
|
108
|
+
## to be done / continued
|
109
|
+
response
|
110
|
+
end # method self.dataset
|
111
|
+
|
112
|
+
|
63
113
|
end # class Webget
|
64
114
|
|
metadata
CHANGED
@@ -1,15 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: webclient
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csvreader
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.2.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.2.4
|
13
41
|
- !ruby/object:Gem::Dependency
|
14
42
|
name: rdoc
|
15
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,7 +72,7 @@ dependencies:
|
|
44
72
|
- - "~>"
|
45
73
|
- !ruby/object:Gem::Version
|
46
74
|
version: '3.22'
|
47
|
-
description: webget gem -
|
75
|
+
description: webget gem - a web (go get) crawler incl. web cache
|
48
76
|
email: ruby-talk@ruby-lang.org
|
49
77
|
executables: []
|
50
78
|
extensions: []
|
@@ -60,13 +88,12 @@ files:
|
|
60
88
|
- lib/webget.rb
|
61
89
|
- lib/webget/version.rb
|
62
90
|
- lib/webget/webcache.rb
|
63
|
-
- lib/webget/webclient.rb
|
64
91
|
- lib/webget/webget.rb
|
65
|
-
homepage: https://github.com/rubycoco/
|
92
|
+
homepage: https://github.com/rubycoco/webclient
|
66
93
|
licenses:
|
67
94
|
- Public Domain
|
68
95
|
metadata: {}
|
69
|
-
post_install_message:
|
96
|
+
post_install_message:
|
70
97
|
rdoc_options:
|
71
98
|
- "--main"
|
72
99
|
- README.md
|
@@ -83,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
110
|
- !ruby/object:Gem::Version
|
84
111
|
version: '0'
|
85
112
|
requirements: []
|
86
|
-
|
87
|
-
|
88
|
-
signing_key:
|
113
|
+
rubygems_version: 3.1.4
|
114
|
+
signing_key:
|
89
115
|
specification_version: 4
|
90
|
-
summary: webget gem -
|
116
|
+
summary: webget gem - a web (go get) crawler incl. web cache
|
91
117
|
test_files: []
|
data/lib/webget/webclient.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
|
2
|
-
class Webclient
|
3
|
-
|
4
|
-
class Response # nested class - wrap Net::HTTP::Response
|
5
|
-
def initialize( response )
|
6
|
-
@response = response
|
7
|
-
end
|
8
|
-
def raw() @response; end
|
9
|
-
|
10
|
-
|
11
|
-
def text
|
12
|
-
# note: Net::HTTP will NOT set encoding UTF-8 etc.
|
13
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
14
|
-
# thus, set/force encoding to utf-8
|
15
|
-
text = @response.body.to_s
|
16
|
-
text = text.force_encoding( Encoding::UTF_8 )
|
17
|
-
text
|
18
|
-
end
|
19
|
-
|
20
|
-
## convenience helper; returns parsed json data
|
21
|
-
def json() JSON.parse( text ); end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
class Headers # nested (nested) class
|
26
|
-
def initialize( response )
|
27
|
-
@response = response
|
28
|
-
end
|
29
|
-
def each( &blk )
|
30
|
-
@response.each_header do |key, value| # Iterate all response headers
|
31
|
-
blk.call( key, value )
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
def headers() @headers ||= Headers.new( @response ); end
|
36
|
-
|
37
|
-
class Status # nested (nested) class
|
38
|
-
def initialize( response )
|
39
|
-
@response = response
|
40
|
-
end
|
41
|
-
def code() @response.code.to_i; end
|
42
|
-
def ok?() code == 200; end
|
43
|
-
def nok?() code != 200; end
|
44
|
-
def message() @response.message; end
|
45
|
-
end
|
46
|
-
def status() @status ||= Status.new( @response ); end
|
47
|
-
end # (nested) class Response
|
48
|
-
|
49
|
-
|
50
|
-
def self.get( url, headers: {} )
|
51
|
-
|
52
|
-
uri = URI.parse( url )
|
53
|
-
http = Net::HTTP.new( uri.host, uri.port )
|
54
|
-
|
55
|
-
if uri.instance_of? URI::HTTPS
|
56
|
-
http.use_ssl = true
|
57
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
58
|
-
end
|
59
|
-
|
60
|
-
request = Net::HTTP::Get.new( uri.request_uri )
|
61
|
-
|
62
|
-
### add (custom) headers if any
|
63
|
-
## check/todo: is there a more idiomatic way for Net::HTTP ???
|
64
|
-
## use
|
65
|
-
## request = Net::HTTP::Get.new( uri.request_uri, headers )
|
66
|
-
## why? why not?
|
67
|
-
## instead of e.g.
|
68
|
-
## request['X-Auth-Token'] = 'xxxxxxx'
|
69
|
-
## request['User-Agent'] = 'ruby'
|
70
|
-
## request['Accept'] = '*/*'
|
71
|
-
if headers && headers.size > 0
|
72
|
-
headers.each do |key,value|
|
73
|
-
request[ key ] = value
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
response = http.request( request )
|
79
|
-
|
80
|
-
## note: return "unified" wrapped response
|
81
|
-
Response.new( response )
|
82
|
-
end # method self.get
|
83
|
-
|
84
|
-
end # class Webclient
|
85
|
-
|