webget 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 02e86218c6a57c74af512d34f813f15ad11f71a9
4
- data.tar.gz: 7342247ad14c9b7567b129a029ab01ed5147aac5
2
+ SHA256:
3
+ metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
4
+ data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
5
5
  SHA512:
6
- metadata.gz: 05a823f3a5918013c085099482055d42d29ed163a48a67563bda48da164d3e86a11969a1ab8c873895c9880f41321be3e05b904272c5d78a787ddb04924fb388
7
- data.tar.gz: 7b412faa9fd940d788072c2dcac2eff90f68538c730b273f087f1195fd146673023eee3c32ae2c0e48fd28cfa575bc00dcc2a34795e7570272fb38c57a9dd7b7
6
+ metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
7
+ data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 3
6
+ PATCH = 4
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,8 +54,14 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, encoding: 'UTF-8', format: 'html' )
58
- cache.record( url, response, encoding: encoding, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
@@ -98,9 +104,12 @@ class DiskCache
98
104
 
99
105
  ## add more save / put / etc. aliases - why? why not?
100
106
  ## rename to record_html - why? why not?
101
- def record( url, response, encoding: 'UTF-8', format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
102
111
 
103
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
104
113
  meta_path = "#{body_path}.meta.txt"
105
114
 
106
115
  ## make sure path exists
@@ -115,9 +124,10 @@ class DiskCache
115
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
116
125
  elsif format == 'csv'
117
126
  ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
118
128
  text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
119
129
  File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
120
- else
130
+ else ## html or txt
121
131
  text = response.text( encoding: encoding )
122
132
  File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
123
133
  end
@@ -141,7 +151,7 @@ class DiskCache
141
151
 
142
152
 
143
153
  ### helpers
144
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
145
155
  ## map url to file path
146
156
  uri = URI.parse( str )
147
157
 
@@ -150,10 +160,14 @@ class DiskCache
150
160
  ## always downcase for now (internet domain is case insensitive)
151
161
  host_dir = uri.host.downcase
152
162
 
153
- ## "/this/is/everything?query=params"
154
- ## cut-off leading slash and
155
- ## convert query ? =
156
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
157
171
 
158
172
 
159
173
 
@@ -62,6 +62,30 @@ class Webget # a web (go get) crawler
62
62
  end # method self.page
63
63
 
64
64
 
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
65
89
  ## todo/check: rename to csv or file or records or - why? why not?
66
90
  ## todo/check: rename encoding to html/http-like charset - why? why not?
67
91
  def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-20 00:00:00.000000000 Z
11
+ date: 2020-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -93,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
93
93
  licenses:
94
94
  - Public Domain
95
95
  metadata: {}
96
- post_install_message:
96
+ post_install_message:
97
97
  rdoc_options:
98
98
  - "--main"
99
99
  - README.md
@@ -110,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
110
  - !ruby/object:Gem::Version
111
111
  version: '0'
112
112
  requirements: []
113
- rubyforge_project:
114
- rubygems_version: 2.5.2
115
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
116
115
  specification_version: 4
117
116
  summary: webget gem - a web (go get) crawler incl. web cache
118
117
  test_files: []