webget 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +24 -10
- data/lib/webget/webget.rb +24 -0
- metadata +6 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
|
4
|
+
data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
|
7
|
+
data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
|
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,8 +54,14 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response,
|
58
|
-
|
57
|
+
def self.record( url, response,
|
58
|
+
path: nil,
|
59
|
+
encoding: 'UTF-8',
|
60
|
+
format: 'html' )
|
61
|
+
cache.record( url, response,
|
62
|
+
path: path,
|
63
|
+
encoding: encoding,
|
64
|
+
format: format );
|
59
65
|
end
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
61
67
|
class << self
|
@@ -98,9 +104,12 @@ class DiskCache
|
|
98
104
|
|
99
105
|
## add more save / put / etc. aliases - why? why not?
|
100
106
|
## rename to record_html - why? why not?
|
101
|
-
def record( url, response,
|
107
|
+
def record( url, response,
|
108
|
+
path: nil,
|
109
|
+
encoding: 'UTF-8',
|
110
|
+
format: 'html' )
|
102
111
|
|
103
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
104
113
|
meta_path = "#{body_path}.meta.txt"
|
105
114
|
|
106
115
|
## make sure path exists
|
@@ -115,9 +124,10 @@ class DiskCache
|
|
115
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
116
125
|
elsif format == 'csv'
|
117
126
|
## fix: newlines - always use "unix" style - why? why not?
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
118
128
|
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
119
129
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
120
|
-
else
|
130
|
+
else ## html or txt
|
121
131
|
text = response.text( encoding: encoding )
|
122
132
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
123
133
|
end
|
@@ -141,7 +151,7 @@ class DiskCache
|
|
141
151
|
|
142
152
|
|
143
153
|
### helpers
|
144
|
-
def url_to_path( str )
|
154
|
+
def url_to_path( str, path: nil )
|
145
155
|
## map url to file path
|
146
156
|
uri = URI.parse( str )
|
147
157
|
|
@@ -150,10 +160,14 @@ class DiskCache
|
|
150
160
|
## always downcase for now (internet domain is case insensitive)
|
151
161
|
host_dir = uri.host.downcase
|
152
162
|
|
153
|
-
## "
|
154
|
-
|
155
|
-
|
156
|
-
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
164
|
+
path
|
165
|
+
else
|
166
|
+
## "/this/is/everything?query=params"
|
167
|
+
## cut-off leading slash and
|
168
|
+
## convert query ? =
|
169
|
+
uri.request_uri[1..-1]
|
170
|
+
end
|
157
171
|
|
158
172
|
|
159
173
|
|
data/lib/webget/webget.rb
CHANGED
@@ -62,6 +62,30 @@ class Webget # a web (go get) crawler
|
|
62
62
|
end # method self.page
|
63
63
|
|
64
64
|
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
68
|
+
|
69
|
+
response = Webclient.get( url, headers: headers )
|
70
|
+
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
74
|
+
Webcache.record( url, response,
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
76
|
+
format: 'txt' )
|
77
|
+
else
|
78
|
+
## todo/check - log error
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
81
|
+
end
|
82
|
+
|
83
|
+
## to be done / continued
|
84
|
+
response
|
85
|
+
end # method self.text
|
86
|
+
|
87
|
+
|
88
|
+
|
65
89
|
## todo/check: rename to csv or file or records or - why? why not?
|
66
90
|
## todo/check: rename encoding to html/http-like charset - why? why not?
|
67
91
|
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -93,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
|
|
93
93
|
licenses:
|
94
94
|
- Public Domain
|
95
95
|
metadata: {}
|
96
|
-
post_install_message:
|
96
|
+
post_install_message:
|
97
97
|
rdoc_options:
|
98
98
|
- "--main"
|
99
99
|
- README.md
|
@@ -110,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
110
|
- !ruby/object:Gem::Version
|
111
111
|
version: '0'
|
112
112
|
requirements: []
|
113
|
-
|
114
|
-
|
115
|
-
signing_key:
|
113
|
+
rubygems_version: 3.1.4
|
114
|
+
signing_key:
|
116
115
|
specification_version: 4
|
117
116
|
summary: webget gem - a web (go get) crawler incl. web cache
|
118
117
|
test_files: []
|