webget 0.1.1 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f30b181118ed0b78e2617246be67187afbc82b06
4
- data.tar.gz: a70be1694115592004e896f2b762db231896fdf2
2
+ SHA256:
3
+ metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
4
+ data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
5
5
  SHA512:
6
- metadata.gz: 2279af9b0b84949ef8e175acdda2e91cdf2d41bda3a6048bfd43e1759731fe3f92f8f9f4008137017659644d4d013257bb551fb7d823c3217f1808470df3a8cf
7
- data.tar.gz: 5551d7778d4c8e2664a34aa6f873189be48fdf64681e4a6b84eb9e73f11ec5109ccf5e5b6a0cdba9ee6bcea7f35bc9a1337723d5fe9ff1b3c8ca74eacaed7963
6
+ metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
7
+ data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
@@ -5,5 +5,4 @@ Rakefile
5
5
  lib/webget.rb
6
6
  lib/webget/version.rb
7
7
  lib/webget/webcache.rb
8
- lib/webget/webclient.rb
9
8
  lib/webget/webget.rb
data/README.md CHANGED
@@ -1,11 +1,12 @@
1
1
  # webget
2
2
 
3
- webget gem - yet (another) network client for world wide web (www) requests via HTTP
3
+ webget gem - a web (go get) crawler incl. web cache
4
4
 
5
- * home :: [github.com/rubycoco/fetcher](https://github.com/rubycoco/fetcher)
6
- * bugs :: [github.com/rubycoco/fetcher/issues](https://github.com/rubycoco/fetcher/issues)
5
+ * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
+ * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
7
  * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
8
  * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
+ * forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
9
10
 
10
11
 
11
12
  ## Usage
data/Rakefile CHANGED
@@ -5,10 +5,10 @@ Hoe.spec 'webget' do
5
5
 
6
6
  self.version = Webget::VERSION
7
7
 
8
- self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
8
+ self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
9
  self.description = summary
10
10
 
11
- self.urls = { home: 'https://github.com/rubycoco/fetcher' }
11
+ self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
12
 
13
13
  self.author = 'Gerald Bauer'
14
14
  self.email = 'ruby-talk@ruby-lang.org'
@@ -17,7 +17,10 @@ Hoe.spec 'webget' do
17
17
  self.readme_file = 'README.md'
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
- self.extra_deps = []
20
+ self.extra_deps = [
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
23
+ ]
21
24
 
22
25
  self.licenses = ['Public Domain']
23
26
 
@@ -1,20 +1,11 @@
1
- require 'pp'
2
- require 'time'
3
- require 'date'
4
- require 'fileutils'
5
-
6
- require 'uri'
7
- require 'net/http'
8
- require 'net/https'
9
-
10
- require 'json'
11
- require 'yaml'
1
+ require 'webclient'
12
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
13
5
 
14
6
 
15
7
  ## our own code
16
8
  require 'webget/version' # let version go first
17
- require 'webget/webclient'
18
9
  require 'webget/webcache'
19
10
  require 'webget/webget'
20
11
 
@@ -24,7 +15,6 @@ require 'webget/webget'
24
15
  ############
25
16
  ## add convenience alias for camel case / alternate different spelling
26
17
  WebCache = Webcache
27
- WebClient = Webclient
28
18
  WebGet = Webget
29
19
 
30
20
  ## use Webgo as (alias) name (keep reserved for now) - why? why not?
@@ -2,8 +2,8 @@
2
2
  class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
- MINOR = 1
6
- PATCH = 1
5
+ MINOR = 2
6
+ PATCH = 4
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,24 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
62
68
  alias_method :exist?, :cached?
63
69
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
70
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
71
  def self.read( url ) cache.read( url ); end
66
72
  def self.read_json( url ) cache.read_json( url ); end
73
+ def self.read_csv( url ) cache.read_csv( url ); end
74
+
67
75
 
68
76
 
69
77
  class DiskCache
@@ -86,13 +94,22 @@ class DiskCache
86
94
  data
87
95
  end
88
96
 
97
+ def read_csv( url )
98
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
+ data = CsvHash.parse( txt )
101
+ data
102
+ end
89
103
 
90
104
 
91
105
  ## add more save / put / etc. aliases - why? why not?
92
106
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
94
111
 
95
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
96
113
  meta_path = "#{body_path}.meta.txt"
97
114
 
98
115
  ## make sure path exists
@@ -102,14 +119,23 @@ class DiskCache
102
119
  puts "[cache] saving #{body_path}..."
103
120
 
104
121
  ## todo/check: verify content-type - why? why not?
122
+ ## note - for now response.text is always assumed to be (converted to) utf8!!!!!!!!!
105
123
  if format == 'json'
106
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
107
- else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
125
+ elsif format == 'csv'
126
+ ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
128
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
+ else ## html or txt
131
+ text = response.text( encoding: encoding )
132
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
133
  end
111
134
 
135
+
112
136
  File.open( meta_path, 'w:utf-8' ) do |f|
137
+ ## todo/check:
138
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
139
  response.headers.each do |key, value| # iterate all response headers
114
140
  f.write( "#{key}: #{value}" )
115
141
  f.write( "\n" )
@@ -125,7 +151,7 @@ class DiskCache
125
151
 
126
152
 
127
153
  ### helpers
128
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
129
155
  ## map url to file path
130
156
  uri = URI.parse( str )
131
157
 
@@ -134,10 +160,14 @@ class DiskCache
134
160
  ## always downcase for now (internet domain is case insensitive)
135
161
  host_dir = uri.host.downcase
136
162
 
137
- ## "/this/is/everything?query=params"
138
- ## cut-off leading slash and
139
- ## convert query ? =
140
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
141
171
 
142
172
 
143
173
 
@@ -151,6 +181,25 @@ class DiskCache
151
181
  puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
152
182
  exit 1
153
183
  end
184
+ elsif host_dir.index( 'tipp3.at' )
185
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
+
187
+ ## change ? to -I-
188
+ ## change = to ~
189
+ ## Example:
190
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
+ ## =>
192
+ ## sportwetten/classicresults-I-oddsetProgramID~888
193
+ req_path = req_path.gsub( '?', '-I-' )
194
+ .gsub( '=', '~')
195
+
196
+ req_path = "#{req_path}.html"
197
+ elsif host_dir.index( 'fbref.com' )
198
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
+ req_path = "#{req_path}.html" # auto-add html extension
200
+ elsif host_dir.index( 'football-data.co.uk' )
201
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
154
203
  elsif host_dir.index( 'football-data.org' )
155
204
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
156
205
 
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
89
+ ## todo/check: rename to csv or file or records or - why? why not?
90
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
91
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
+ puts " sleep #{config.sleep} sec(s)..."
93
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
+
95
+ response = Webclient.get( url, headers: headers )
96
+
97
+ if response.status.ok? ## must be HTTP 200
98
+ puts "#{response.status.code} #{response.status.message}"
99
+ Webcache.record( url, response,
100
+ encoding: encoding,
101
+ format: 'csv' ) ## pass along csv format - why? why not?
102
+ else
103
+ ## todo/check - log error
104
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
+ end
107
+
108
+ ## to be done / continued
109
+ response
110
+ end # method self.dataset
111
+
112
+
63
113
  end # class Webget
64
114
 
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-05 00:00:00.000000000 Z
11
+ date: 2020-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: webclient
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: rdoc
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,7 +72,7 @@ dependencies:
44
72
  - - "~>"
45
73
  - !ruby/object:Gem::Version
46
74
  version: '3.22'
47
- description: webget gem - yet (another) network client for world wide web (www) requests
75
+ description: webget gem - a web (go get) crawler incl. web cache
48
76
  email: ruby-talk@ruby-lang.org
49
77
  executables: []
50
78
  extensions: []
@@ -60,13 +88,12 @@ files:
60
88
  - lib/webget.rb
61
89
  - lib/webget/version.rb
62
90
  - lib/webget/webcache.rb
63
- - lib/webget/webclient.rb
64
91
  - lib/webget/webget.rb
65
- homepage: https://github.com/rubycoco/fetcher
92
+ homepage: https://github.com/rubycoco/webclient
66
93
  licenses:
67
94
  - Public Domain
68
95
  metadata: {}
69
- post_install_message:
96
+ post_install_message:
70
97
  rdoc_options:
71
98
  - "--main"
72
99
  - README.md
@@ -83,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
110
  - !ruby/object:Gem::Version
84
111
  version: '0'
85
112
  requirements: []
86
- rubyforge_project:
87
- rubygems_version: 2.5.2
88
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
89
115
  specification_version: 4
90
- summary: webget gem - yet (another) network client for world wide web (www) requests
116
+ summary: webget gem - a web (go get) crawler incl. web cache
91
117
  test_files: []
@@ -1,85 +0,0 @@
1
-
2
- class Webclient
3
-
4
- class Response # nested class - wrap Net::HTTP::Response
5
- def initialize( response )
6
- @response = response
7
- end
8
- def raw() @response; end
9
-
10
-
11
- def text
12
- # note: Net::HTTP will NOT set encoding UTF-8 etc.
13
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
14
- # thus, set/force encoding to utf-8
15
- text = @response.body.to_s
16
- text = text.force_encoding( Encoding::UTF_8 )
17
- text
18
- end
19
-
20
- ## convenience helper; returns parsed json data
21
- def json() JSON.parse( text ); end
22
-
23
-
24
-
25
- class Headers # nested (nested) class
26
- def initialize( response )
27
- @response = response
28
- end
29
- def each( &blk )
30
- @response.each_header do |key, value| # Iterate all response headers
31
- blk.call( key, value )
32
- end
33
- end
34
- end
35
- def headers() @headers ||= Headers.new( @response ); end
36
-
37
- class Status # nested (nested) class
38
- def initialize( response )
39
- @response = response
40
- end
41
- def code() @response.code.to_i; end
42
- def ok?() code == 200; end
43
- def nok?() code != 200; end
44
- def message() @response.message; end
45
- end
46
- def status() @status ||= Status.new( @response ); end
47
- end # (nested) class Response
48
-
49
-
50
- def self.get( url, headers: {} )
51
-
52
- uri = URI.parse( url )
53
- http = Net::HTTP.new( uri.host, uri.port )
54
-
55
- if uri.instance_of? URI::HTTPS
56
- http.use_ssl = true
57
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
58
- end
59
-
60
- request = Net::HTTP::Get.new( uri.request_uri )
61
-
62
- ### add (custom) headers if any
63
- ## check/todo: is there are more idiomatic way for Net::HTTP ???
64
- ## use
65
- ## request = Net::HTTP::Get.new( uri.request_uri, headers )
66
- ## why? why not?
67
- ## instead of e.g.
68
- ## request['X-Auth-Token'] = 'xxxxxxx'
69
- ## request['User-Agent'] = 'ruby'
70
- ## request['Accept'] = '*/*'
71
- if headers && headers.size > 0
72
- headers.each do |key,value|
73
- request[ key ] = value
74
- end
75
- end
76
-
77
-
78
- response = http.request( request )
79
-
80
- ## note: return "unified" wrapped response
81
- Response.new( response )
82
- end # method self.get
83
-
84
- end # class Webclient
85
-