webget 0.1.1 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f30b181118ed0b78e2617246be67187afbc82b06
4
- data.tar.gz: a70be1694115592004e896f2b762db231896fdf2
2
+ SHA256:
3
+ metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
4
+ data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
5
5
  SHA512:
6
- metadata.gz: 2279af9b0b84949ef8e175acdda2e91cdf2d41bda3a6048bfd43e1759731fe3f92f8f9f4008137017659644d4d013257bb551fb7d823c3217f1808470df3a8cf
7
- data.tar.gz: 5551d7778d4c8e2664a34aa6f873189be48fdf64681e4a6b84eb9e73f11ec5109ccf5e5b6a0cdba9ee6bcea7f35bc9a1337723d5fe9ff1b3c8ca74eacaed7963
6
+ metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
7
+ data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
@@ -5,5 +5,4 @@ Rakefile
5
5
  lib/webget.rb
6
6
  lib/webget/version.rb
7
7
  lib/webget/webcache.rb
8
- lib/webget/webclient.rb
9
8
  lib/webget/webget.rb
data/README.md CHANGED
@@ -1,11 +1,12 @@
1
1
  # webget
2
2
 
3
- webget gem - yet (another) network client for world wide web (www) requests via HTTP
3
+ webget gem - a web (go get) crawler incl. web cache
4
4
 
5
- * home :: [github.com/rubycoco/fetcher](https://github.com/rubycoco/fetcher)
6
- * bugs :: [github.com/rubycoco/fetcher/issues](https://github.com/rubycoco/fetcher/issues)
5
+ * home :: [github.com/rubycoco/webclient](https://github.com/rubycoco/webclient)
6
+ * bugs :: [github.com/rubycoco/webclient/issues](https://github.com/rubycoco/webclient/issues)
7
7
  * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
8
  * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
+ * forum :: [groups.google.com/group/wwwmake](https://groups.google.com/group/wwwmake)
9
10
 
10
11
 
11
12
  ## Usage
data/Rakefile CHANGED
@@ -5,10 +5,10 @@ Hoe.spec 'webget' do
5
5
 
6
6
  self.version = Webget::VERSION
7
7
 
8
- self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
8
+ self.summary = 'webget gem - a web (go get) crawler incl. web cache'
9
9
  self.description = summary
10
10
 
11
- self.urls = { home: 'https://github.com/rubycoco/fetcher' }
11
+ self.urls = { home: 'https://github.com/rubycoco/webclient' }
12
12
 
13
13
  self.author = 'Gerald Bauer'
14
14
  self.email = 'ruby-talk@ruby-lang.org'
@@ -17,7 +17,10 @@ Hoe.spec 'webget' do
17
17
  self.readme_file = 'README.md'
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
- self.extra_deps = []
20
+ self.extra_deps = [
21
+ ['webclient', '>= 0.2.0'],
22
+ ['csvreader', '>= 1.2.4'],
23
+ ]
21
24
 
22
25
  self.licenses = ['Public Domain']
23
26
 
@@ -1,20 +1,11 @@
1
- require 'pp'
2
- require 'time'
3
- require 'date'
4
- require 'fileutils'
5
-
6
- require 'uri'
7
- require 'net/http'
8
- require 'net/https'
9
-
10
- require 'json'
11
- require 'yaml'
1
+ require 'webclient'
12
2
 
3
+ ## more (our own) 3rd party libs
4
+ require 'csvreader'
13
5
 
14
6
 
15
7
  ## our own code
16
8
  require 'webget/version' # let version go first
17
- require 'webget/webclient'
18
9
  require 'webget/webcache'
19
10
  require 'webget/webget'
20
11
 
@@ -24,7 +15,6 @@ require 'webget/webget'
24
15
  ############
25
16
  ## add convenience alias for camel case / alternate different spelling
26
17
  WebCache = Webcache
27
- WebClient = Webclient
28
18
  WebGet = Webget
29
19
 
30
20
  ## use Webgo as (alias) name (keep reserved for now) - why? why not?
@@ -2,8 +2,8 @@
2
2
  class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
- MINOR = 1
6
- PATCH = 1
5
+ MINOR = 2
6
+ PATCH = 4
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,16 +54,24 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, format: 'html' )
58
- cache.record( url, response, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
62
68
  alias_method :exist?, :cached?
63
69
  end
64
- def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
70
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
71
  def self.read( url ) cache.read( url ); end
66
72
  def self.read_json( url ) cache.read_json( url ); end
73
+ def self.read_csv( url ) cache.read_csv( url ); end
74
+
67
75
 
68
76
 
69
77
  class DiskCache
@@ -86,13 +94,22 @@ class DiskCache
86
94
  data
87
95
  end
88
96
 
97
+ def read_csv( url )
98
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
99
+ txt = File.open( body_path, 'r:utf-8' ) {|f| f.read }
100
+ data = CsvHash.parse( txt )
101
+ data
102
+ end
89
103
 
90
104
 
91
105
  ## add more save / put / etc. aliases - why? why not?
92
106
  ## rename to record_html - why? why not?
93
- def record( url, response, format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
94
111
 
95
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
96
113
  meta_path = "#{body_path}.meta.txt"
97
114
 
98
115
  ## make sure path exists
@@ -102,14 +119,23 @@ class DiskCache
102
119
  puts "[cache] saving #{body_path}..."
103
120
 
104
121
  ## todo/check: verify content-type - why? why not?
122
+ ## note - for now response.text is always assumed (converted) to utf8!!!!!!!!!
105
123
  if format == 'json'
106
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
107
- else
108
- ## note - for now always assume utf8!!!!!!!!!
109
- File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
125
+ elsif format == 'csv'
126
+ ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
128
+ text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
129
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
130
+ else ## html or txt
131
+ text = response.text( encoding: encoding )
132
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
110
133
  end
111
134
 
135
+
112
136
  File.open( meta_path, 'w:utf-8' ) do |f|
137
+ ## todo/check:
138
+ ## do headers also need to be converted (like text) if encoding is NOT utf-8 ???
113
139
  response.headers.each do |key, value| # iterate all response headers
114
140
  f.write( "#{key}: #{value}" )
115
141
  f.write( "\n" )
@@ -125,7 +151,7 @@ class DiskCache
125
151
 
126
152
 
127
153
  ### helpers
128
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
129
155
  ## map url to file path
130
156
  uri = URI.parse( str )
131
157
 
@@ -134,10 +160,14 @@ class DiskCache
134
160
  ## always downcase for now (internet domain is case insensitive)
135
161
  host_dir = uri.host.downcase
136
162
 
137
- ## "/this/is/everything?query=params"
138
- ## cut-off leading slash and
139
- ## convert query ? =
140
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
141
171
 
142
172
 
143
173
 
@@ -151,6 +181,25 @@ class DiskCache
151
181
  puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
152
182
  exit 1
153
183
  end
184
+ elsif host_dir.index( 'tipp3.at' )
185
+ req_path = req_path.sub( '.jsp', '' ) # shorten - cut off .jsp extension
186
+
187
+ ## change ? to -I-
188
+ ## change = to ~
189
+ ## Example:
190
+ ## sportwetten/classicresults.jsp?oddsetProgramID=888
191
+ ## =>
192
+ ## sportwetten/classicresults-I-oddsetProgramID~888
193
+ req_path = req_path.gsub( '?', '-I-' )
194
+ .gsub( '=', '~')
195
+
196
+ req_path = "#{req_path}.html"
197
+ elsif host_dir.index( 'fbref.com' )
198
+ req_path = req_path.sub( 'en/', '' ) # shorten - cut off en/
199
+ req_path = "#{req_path}.html" # auto-add html extension
200
+ elsif host_dir.index( 'football-data.co.uk' )
201
+ req_path = req_path.sub( 'mmz4281/', '' ) # shorten - cut off mmz4281/
202
+ req_path = req_path.sub( 'new/', '' ) # shorten - cut off new/
154
203
  elsif host_dir.index( 'football-data.org' )
155
204
  req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
156
205
 
@@ -19,7 +19,7 @@ class Webget # a web (go get) crawler
19
19
 
20
20
 
21
21
 
22
- def self.call( url, headers: {} ) ## assumes json format
22
+ def self.call( url, headers: {} ) ## assumes json format (note - encoding always utf-8 by definition! - double check?)
23
23
  puts " sleep #{config.sleep} sec(s)..."
24
24
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
25
 
@@ -40,8 +40,8 @@ class Webget # a web (go get) crawler
40
40
  response
41
41
  end # method self.call
42
42
 
43
-
44
- def self.page( url, headers: {} ) ## assumes html format
43
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
44
+ def self.page( url, encoding: 'UTF-8', headers: {} ) ## assumes html format
45
45
  puts " sleep #{config.sleep} sec(s)..."
46
46
  sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
47
 
@@ -49,7 +49,8 @@ class Webget # a web (go get) crawler
49
49
 
50
50
  if response.status.ok? ## must be HTTP 200
51
51
  puts "#{response.status.code} #{response.status.message}"
52
- Webcache.record( url, response ) ## assumes format: html (default)
52
+ Webcache.record( url, response,
53
+ encoding: encoding ) ## assumes format: html (default)
53
54
  else
54
55
  ## todo/check - log error
55
56
  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
@@ -60,5 +61,54 @@ class Webget # a web (go get) crawler
60
61
  response
61
62
  end # method self.page
62
63
 
64
+
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
89
+ ## todo/check: rename to csv or file or records or - why? why not?
90
+ ## todo/check: rename encoding to html/http-like charset - why? why not?
91
+ def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
92
+ puts " sleep #{config.sleep} sec(s)..."
93
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
94
+
95
+ response = Webclient.get( url, headers: headers )
96
+
97
+ if response.status.ok? ## must be HTTP 200
98
+ puts "#{response.status.code} #{response.status.message}"
99
+ Webcache.record( url, response,
100
+ encoding: encoding,
101
+ format: 'csv' ) ## pass along csv format - why? why not?
102
+ else
103
+ ## todo/check - log error
104
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
105
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
106
+ end
107
+
108
+ ## to be done / continued
109
+ response
110
+ end # method self.dataset
111
+
112
+
63
113
  end # class Webget
64
114
 
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-05 00:00:00.000000000 Z
11
+ date: 2020-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: webclient
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csvreader
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.2.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.2.4
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: rdoc
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -44,7 +72,7 @@ dependencies:
44
72
  - - "~>"
45
73
  - !ruby/object:Gem::Version
46
74
  version: '3.22'
47
- description: webget gem - yet (another) network client for world wide web (www) requests
75
+ description: webget gem - a web (go get) crawler incl. web cache
48
76
  email: ruby-talk@ruby-lang.org
49
77
  executables: []
50
78
  extensions: []
@@ -60,13 +88,12 @@ files:
60
88
  - lib/webget.rb
61
89
  - lib/webget/version.rb
62
90
  - lib/webget/webcache.rb
63
- - lib/webget/webclient.rb
64
91
  - lib/webget/webget.rb
65
- homepage: https://github.com/rubycoco/fetcher
92
+ homepage: https://github.com/rubycoco/webclient
66
93
  licenses:
67
94
  - Public Domain
68
95
  metadata: {}
69
- post_install_message:
96
+ post_install_message:
70
97
  rdoc_options:
71
98
  - "--main"
72
99
  - README.md
@@ -83,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
110
  - !ruby/object:Gem::Version
84
111
  version: '0'
85
112
  requirements: []
86
- rubyforge_project:
87
- rubygems_version: 2.5.2
88
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
89
115
  specification_version: 4
90
- summary: webget gem - yet (another) network client for world wide web (www) requests
116
+ summary: webget gem - a web (go get) crawler incl. web cache
91
117
  test_files: []
@@ -1,85 +0,0 @@
1
-
2
- class Webclient
3
-
4
- class Response # nested class - wrap Net::HTTP::Response
5
- def initialize( response )
6
- @response = response
7
- end
8
- def raw() @response; end
9
-
10
-
11
- def text
12
- # note: Net::HTTP will NOT set encoding UTF-8 etc.
13
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
14
- # thus, set/force encoding to utf-8
15
- text = @response.body.to_s
16
- text = text.force_encoding( Encoding::UTF_8 )
17
- text
18
- end
19
-
20
- ## convenience helper; returns parsed json data
21
- def json() JSON.parse( text ); end
22
-
23
-
24
-
25
- class Headers # nested (nested) class
26
- def initialize( response )
27
- @response = response
28
- end
29
- def each( &blk )
30
- @response.each_header do |key, value| # Iterate all response headers
31
- blk.call( key, value )
32
- end
33
- end
34
- end
35
- def headers() @headers ||= Headers.new( @response ); end
36
-
37
- class Status # nested (nested) class
38
- def initialize( response )
39
- @response = response
40
- end
41
- def code() @response.code.to_i; end
42
- def ok?() code == 200; end
43
- def nok?() code != 200; end
44
- def message() @response.message; end
45
- end
46
- def status() @status ||= Status.new( @response ); end
47
- end # (nested) class Response
48
-
49
-
50
- def self.get( url, headers: {} )
51
-
52
- uri = URI.parse( url )
53
- http = Net::HTTP.new( uri.host, uri.port )
54
-
55
- if uri.instance_of? URI::HTTPS
56
- http.use_ssl = true
57
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
58
- end
59
-
60
- request = Net::HTTP::Get.new( uri.request_uri )
61
-
62
- ### add (custom) headers if any
63
- ## check/todo: is there a more idiomatic way for Net::HTTP ???
64
- ## use
65
- ## request = Net::HTTP::Get.new( uri.request_uri, headers )
66
- ## why? why not?
67
- ## instead of e.g.
68
- ## request['X-Auth-Token'] = 'xxxxxxx'
69
- ## request['User-Agent'] = 'ruby'
70
- ## request['Accept'] = '*/*'
71
- if headers && headers.size > 0
72
- headers.each do |key,value|
73
- request[ key ] = value
74
- end
75
- end
76
-
77
-
78
- response = http.request( request )
79
-
80
- ## note: return "unified" wrapped response
81
- Response.new( response )
82
- end # method self.get
83
-
84
- end # class Webclient
85
-