webget 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 02e86218c6a57c74af512d34f813f15ad11f71a9
4
- data.tar.gz: 7342247ad14c9b7567b129a029ab01ed5147aac5
2
+ SHA256:
3
+ metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
4
+ data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
5
5
  SHA512:
6
- metadata.gz: 05a823f3a5918013c085099482055d42d29ed163a48a67563bda48da164d3e86a11969a1ab8c873895c9880f41321be3e05b904272c5d78a787ddb04924fb388
7
- data.tar.gz: 7b412faa9fd940d788072c2dcac2eff90f68538c730b273f087f1195fd146673023eee3c32ae2c0e48fd28cfa575bc00dcc2a34795e7570272fb38c57a9dd7b7
6
+ metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
7
+ data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
@@ -3,7 +3,7 @@ class Webget
3
3
 
4
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
5
5
  MINOR = 2
6
- PATCH = 3
6
+ PATCH = 4
7
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
8
 
9
9
  def self.version
@@ -54,8 +54,14 @@ module Webcache
54
54
  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
55
  def self.cache() @cache ||= DiskCache.new; end
56
56
 
57
- def self.record( url, response, encoding: 'UTF-8', format: 'html' )
58
- cache.record( url, response, encoding: encoding, format: format );
57
+ def self.record( url, response,
58
+ path: nil,
59
+ encoding: 'UTF-8',
60
+ format: 'html' )
61
+ cache.record( url, response,
62
+ path: path,
63
+ encoding: encoding,
64
+ format: format );
59
65
  end
60
66
  def self.cached?( url ) cache.cached?( url ); end
61
67
  class << self
@@ -98,9 +104,12 @@ class DiskCache
98
104
 
99
105
  ## add more save / put / etc. aliases - why? why not?
100
106
  ## rename to record_html - why? why not?
101
- def record( url, response, encoding: 'UTF-8', format: 'html' )
107
+ def record( url, response,
108
+ path: nil,
109
+ encoding: 'UTF-8',
110
+ format: 'html' )
102
111
 
103
- body_path = "#{Webcache.root}/#{url_to_path( url )}"
112
+ body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
104
113
  meta_path = "#{body_path}.meta.txt"
105
114
 
106
115
  ## make sure path exists
@@ -115,9 +124,10 @@ class DiskCache
115
124
  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
116
125
  elsif format == 'csv'
117
126
  ## fix: newlines - always use "unix" style - why? why not?
127
+ ## fix: use :newline => :universal option? translates to universal "\n"
118
128
  text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
119
129
  File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
120
- else
130
+ else ## html or txt
121
131
  text = response.text( encoding: encoding )
122
132
  File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
123
133
  end
@@ -141,7 +151,7 @@ class DiskCache
141
151
 
142
152
 
143
153
  ### helpers
144
- def url_to_path( str )
154
+ def url_to_path( str, path: nil )
145
155
  ## map url to file path
146
156
  uri = URI.parse( str )
147
157
 
@@ -150,10 +160,14 @@ class DiskCache
150
160
  ## always downcase for now (internet domain is case insensitive)
151
161
  host_dir = uri.host.downcase
152
162
 
153
- ## "/this/is/everything?query=params"
154
- ## cut-off leading slash and
155
- ## convert query ? =
156
- req_path = uri.request_uri[1..-1]
163
+ req_path = if path ## use "custom" (file)path for cache storage if passed in
164
+ path
165
+ else
166
+ ## "/this/is/everything?query=params"
167
+ ## cut-off leading slash and
168
+ ## convert query ? =
169
+ uri.request_uri[1..-1]
170
+ end
157
171
 
158
172
 
159
173
 
@@ -62,6 +62,30 @@ class Webget # a web (go get) crawler
62
62
  end # method self.page
63
63
 
64
64
 
65
+ def self.text( url, path: nil, headers: {} ) ## assumes txt format
66
+ puts " sleep #{config.sleep} sec(s)..."
67
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
68
+
69
+ response = Webclient.get( url, headers: headers )
70
+
71
+ if response.status.ok? ## must be HTTP 200
72
+ puts "#{response.status.code} #{response.status.message}"
73
+ ## note: like json assumes always utf-8 encoding for now !!!
74
+ Webcache.record( url, response,
75
+ path: path, ## optional "custom" (file)path for saving in cache
76
+ format: 'txt' )
77
+ else
78
+ ## todo/check - log error
79
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
80
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
81
+ end
82
+
83
+ ## to be done / continued
84
+ response
85
+ end # method self.text
86
+
87
+
88
+
65
89
  ## todo/check: rename to csv or file or records or - why? why not?
66
90
  ## todo/check: rename encoding to html/http-like charset - why? why not?
67
91
  def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-20 00:00:00.000000000 Z
11
+ date: 2020-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: webclient
@@ -93,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
93
93
  licenses:
94
94
  - Public Domain
95
95
  metadata: {}
96
- post_install_message:
96
+ post_install_message:
97
97
  rdoc_options:
98
98
  - "--main"
99
99
  - README.md
@@ -110,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
110
  - !ruby/object:Gem::Version
111
111
  version: '0'
112
112
  requirements: []
113
- rubyforge_project:
114
- rubygems_version: 2.5.2
115
- signing_key:
113
+ rubygems_version: 3.1.4
114
+ signing_key:
116
115
  specification_version: 4
117
116
  summary: webget gem - a web (go get) crawler incl. web cache
118
117
  test_files: []