webget 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/webget/version.rb +1 -1
- data/lib/webget/webcache.rb +24 -10
- data/lib/webget/webget.rb +24 -0
- metadata +6 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e171b577175334da7546a23003e44796c19da96b37c35a21947e5f025772cf19
|
4
|
+
data.tar.gz: c3a9a39e443edef2b702a15096cf5f5135a4aaccfc9fe1ef9fe8b0e9e9ce9296
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0c524fd19e2444a98df172ccf217d1c9a6201849dc3d947e2885229294271fc865df00cb329c2840a149a72e819facf224c07b2b555ef161874af2cb03c3e1
|
7
|
+
data.tar.gz: 44532a070b15e02cba5a861de875d25e16484d05dc1c0b8793b4e0874deaa94a464dafe8be047bc36284f9f1c9358760c6738f41e9828e2a15e0845e8f09c9d2
|
data/lib/webget/version.rb
CHANGED
data/lib/webget/webcache.rb
CHANGED
@@ -54,8 +54,14 @@ module Webcache
|
|
54
54
|
### "interface" for "generic" cache storage (might be sqlite database or filesystem)
|
55
55
|
def self.cache() @cache ||= DiskCache.new; end
|
56
56
|
|
57
|
-
def self.record( url, response,
|
58
|
-
|
57
|
+
def self.record( url, response,
|
58
|
+
path: nil,
|
59
|
+
encoding: 'UTF-8',
|
60
|
+
format: 'html' )
|
61
|
+
cache.record( url, response,
|
62
|
+
path: path,
|
63
|
+
encoding: encoding,
|
64
|
+
format: format );
|
59
65
|
end
|
60
66
|
def self.cached?( url ) cache.cached?( url ); end
|
61
67
|
class << self
|
@@ -98,9 +104,12 @@ class DiskCache
|
|
98
104
|
|
99
105
|
## add more save / put / etc. aliases - why? why not?
|
100
106
|
## rename to record_html - why? why not?
|
101
|
-
def record( url, response,
|
107
|
+
def record( url, response,
|
108
|
+
path: nil,
|
109
|
+
encoding: 'UTF-8',
|
110
|
+
format: 'html' )
|
102
111
|
|
103
|
-
body_path = "#{Webcache.root}/#{url_to_path( url )}"
|
112
|
+
body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
|
104
113
|
meta_path = "#{body_path}.meta.txt"
|
105
114
|
|
106
115
|
## make sure path exists
|
@@ -115,9 +124,10 @@ class DiskCache
|
|
115
124
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
|
116
125
|
elsif format == 'csv'
|
117
126
|
## fix: newlines - always use "unix" style - why? why not?
|
127
|
+
## fix: use :newline => :universal option? translates to universal "\n"
|
118
128
|
text = response.text( encoding: encoding ).gsub( "\r\n", "\n" )
|
119
129
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
120
|
-
else
|
130
|
+
else ## html or txt
|
121
131
|
text = response.text( encoding: encoding )
|
122
132
|
File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }
|
123
133
|
end
|
@@ -141,7 +151,7 @@ class DiskCache
|
|
141
151
|
|
142
152
|
|
143
153
|
### helpers
|
144
|
-
def url_to_path( str )
|
154
|
+
def url_to_path( str, path: nil )
|
145
155
|
## map url to file path
|
146
156
|
uri = URI.parse( str )
|
147
157
|
|
@@ -150,10 +160,14 @@ class DiskCache
|
|
150
160
|
## always downcase for now (internet domain is case insensitive)
|
151
161
|
host_dir = uri.host.downcase
|
152
162
|
|
153
|
-
## "
|
154
|
-
|
155
|
-
|
156
|
-
|
163
|
+
req_path = if path ## use "custom" (file)path for cache storage if passed in
|
164
|
+
path
|
165
|
+
else
|
166
|
+
## "/this/is/everything?query=params"
|
167
|
+
## cut-off leading slash and
|
168
|
+
## convert query ? =
|
169
|
+
uri.request_uri[1..-1]
|
170
|
+
end
|
157
171
|
|
158
172
|
|
159
173
|
|
data/lib/webget/webget.rb
CHANGED
@@ -62,6 +62,30 @@ class Webget # a web (go get) crawler
|
|
62
62
|
end # method self.page
|
63
63
|
|
64
64
|
|
65
|
+
def self.text( url, path: nil, headers: {} ) ## assumes txt format
|
66
|
+
puts " sleep #{config.sleep} sec(s)..."
|
67
|
+
sleep( config.sleep ) ## slow down - sleep 3secs before each http request
|
68
|
+
|
69
|
+
response = Webclient.get( url, headers: headers )
|
70
|
+
|
71
|
+
if response.status.ok? ## must be HTTP 200
|
72
|
+
puts "#{response.status.code} #{response.status.message}"
|
73
|
+
## note: like json assumes always utf-8 encoding for now !!!
|
74
|
+
Webcache.record( url, response,
|
75
|
+
path: path, ## optional "custom" (file)path for saving in cache
|
76
|
+
format: 'txt' )
|
77
|
+
else
|
78
|
+
## todo/check - log error
|
79
|
+
puts "!! ERROR - #{response.status.code} #{response.status.message}:"
|
80
|
+
pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
|
81
|
+
end
|
82
|
+
|
83
|
+
## to be done / continued
|
84
|
+
response
|
85
|
+
end # method self.text
|
86
|
+
|
87
|
+
|
88
|
+
|
65
89
|
## todo/check: rename to csv or file or records or - why? why not?
|
66
90
|
## todo/check: rename encoding to html/http-like charset - why? why not?
|
67
91
|
def self.dataset( url, encoding: 'UTF-8', headers: {} ) ## assumes csv format
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: webclient
|
@@ -93,7 +93,7 @@ homepage: https://github.com/rubycoco/webclient
|
|
93
93
|
licenses:
|
94
94
|
- Public Domain
|
95
95
|
metadata: {}
|
96
|
-
post_install_message:
|
96
|
+
post_install_message:
|
97
97
|
rdoc_options:
|
98
98
|
- "--main"
|
99
99
|
- README.md
|
@@ -110,9 +110,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
110
110
|
- !ruby/object:Gem::Version
|
111
111
|
version: '0'
|
112
112
|
requirements: []
|
113
|
-
|
114
|
-
|
115
|
-
signing_key:
|
113
|
+
rubygems_version: 3.1.4
|
114
|
+
signing_key:
|
116
115
|
specification_version: 4
|
117
116
|
summary: webget gem - a web (go get) crawler incl. web cache
|
118
117
|
test_files: []
|