webget 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +4 -4
- data/Rakefile +1 -1
- data/lib/webget.rb +32 -4
- data/lib/webget/version.rb +6 -3
- data/lib/webget/webcache.rb +167 -0
- data/lib/webget/webclient.rb +85 -0
- data/lib/webget/webget.rb +64 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e0feb02f55f01692b3353f5aaf15ab2201dfb02
|
4
|
+
data.tar.gz: da3d146f2fc6db90e2a34c9c385d7e9e9d3c272c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49fb39362398e09eac27d929f0689a604234818f8ff6fc2422217977e6312fe46bf0273059d9310ba9b40dea62b75bc437b6e90cc36d5e39456d1927e36739ee
|
7
|
+
data.tar.gz: b81245bc4a7029eae5712aa94f48784240b6666d48ddc270c360980bc6f5599508ff49c7d128bafdff8419b378ab1c6d83282090a6ec10ad1a77a228fb70232a
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -2,10 +2,10 @@
|
|
2
2
|
|
3
3
|
webget gem - yet (another) network client for world wide web (www) requests via HTTP
|
4
4
|
|
5
|
-
* home :: [github.com/
|
6
|
-
* bugs :: [github.com/
|
7
|
-
* gem :: [rubygems.org/gems/
|
8
|
-
* rdoc :: [rubydoc.info/gems/
|
5
|
+
* home :: [github.com/rubycoco/webget](https://github.com/rubycoco/webget)
|
6
|
+
* bugs :: [github.com/rubycoco/webget/issues](https://github.com/rubycoco/webget/issues)
|
7
|
+
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
|
+
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
9
|
|
10
10
|
|
11
11
|
## Usage
|
data/Rakefile
CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'webget' do
|
|
5
5
|
|
6
6
|
self.version = Webget::VERSION
|
7
7
|
|
8
|
-
self.summary = 'webget gem - yet (another) network client for world wide web (www) requests
|
8
|
+
self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
|
9
9
|
self.description = summary
|
10
10
|
|
11
11
|
self.urls = { home: 'https://github.com/rubycoco/fetcher' }
|
data/lib/webget.rb
CHANGED
@@ -1,7 +1,35 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'time'
|
3
|
+
require 'date'
|
4
|
+
require 'fileutils'
|
1
5
|
|
2
|
-
|
3
|
-
require '
|
6
|
+
require 'uri'
|
7
|
+
require 'net/http'
|
8
|
+
require 'net/https'
|
4
9
|
|
10
|
+
require 'json'
|
11
|
+
require 'yaml'
|
5
12
|
|
6
|
-
|
7
|
-
|
13
|
+
|
14
|
+
|
15
|
+
## our own code
|
16
|
+
require 'webget/version' # let version go first
|
17
|
+
require 'webget/webclient'
|
18
|
+
require 'webget/webcache'
|
19
|
+
require 'webget/webget'
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
############
|
25
|
+
## add convenience alias for camel case / alternate different spelling
|
26
|
+
WebCache = Webcache
|
27
|
+
WebClient = Webclient
|
28
|
+
WebGet = Webget
|
29
|
+
|
30
|
+
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
31
|
+
WebGo = Webget
|
32
|
+
Webgo = Webget
|
33
|
+
|
34
|
+
|
35
|
+
puts Webget.banner # say hello
|
data/lib/webget/version.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
|
2
2
|
class Webget
|
3
|
+
|
3
4
|
MAJOR = 0 ## todo: namespace inside version or something - why? why not??
|
4
|
-
MINOR =
|
5
|
-
PATCH =
|
5
|
+
MINOR = 1
|
6
|
+
PATCH = 0
|
6
7
|
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
7
8
|
|
8
9
|
def self.version
|
9
10
|
VERSION
|
10
11
|
end
|
11
12
|
|
13
|
+
# version string for generator meta tag (includes ruby version)
|
12
14
|
def self.banner
|
13
15
|
"webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
|
14
16
|
end
|
@@ -16,5 +18,6 @@ class Webget
|
|
16
18
|
def self.root
|
17
19
|
"#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
|
18
20
|
end
|
19
|
-
|
21
|
+
|
22
|
+
end # class Webget
|
20
23
|
|
@@ -0,0 +1,167 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Webcache

  #####
  # Returns the current user's home directory as a normalized absolute path.
  #
  # Lookup order: ENV['HOME'], ENV['USERPROFILE'],
  # ENV['HOMEDRIVE'] + ENV['HOMEPATH'], then File.expand_path('~');
  # if that raises, falls back to 'C:/' (when File::ALT_SEPARATOR is set)
  # or '/'.
  #
  # copied from props gem, see Env.home
  #  - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
  #  todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
  def self.home
    path = ENV['HOME'] || ENV['USERPROFILE']

    if path.nil? && ENV['HOMEDRIVE'] && ENV['HOMEPATH']
      path = "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
    end

    path ||= begin
      File.expand_path( '~' )
    rescue StandardError
      File::ALT_SEPARATOR ? 'C:/' : '/'
    end

    ## note: use File.expand_path to "unify" path e.g
    ##     C:\Users\roman  becomes
    ##     C:/Users/roman
    File.expand_path( path )
  end


  # Holds the cache settings; currently only the root directory.
  class Configuration
    ## root directory - todo/check: find/use a better name - why? why not?
    ## defaults to "<home>/.cache" until a value is assigned
    def root
      @root || "#{Webcache.home}/.cache"
    end

    def root=(value)
      @root = value
    end
  end # class Configuration


  ## lets you use
  ##   Webcache.configure do |config|
  ##      config.root = './cache'
  ##   end
  def self.configure() yield( config ); end
  def self.config()    @config ||= Configuration.new; end


  ## add "high level" root convenience helpers
  def self.root()        config.root; end
  def self.root=(value)  config.root = value; end


  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
  def self.cache() @cache ||= DiskCache.new; end

  # Stores the response for url in the cache (see DiskCache#record).
  def self.record( url, response, format: 'html' )
    cache.record( url, response, format: format )
  end

  def self.cached?( url ) cache.cached?( url ); end
  class << self
    alias_method :exist?, :cached?
  end

  def self.url_to_id( url ) cache.url_to_id( url ); end   ## todo/check: rename to just id or something - why? why not?
  def self.read( url )      cache.read( url ); end


  # File-system backed cache: one body file (plus a ".meta.txt" file with the
  # response headers) per url, stored below Webcache.root.
  class DiskCache

    # Returns true if a body file for url exists below Webcache.root.
    def cached?( url )
      File.exist?( body_path( url ) )
    end
    alias_method :exist?, :cached?


    # Returns the cached body for url, read as utf-8.
    # Raises Errno::ENOENT (from File.open) if the url was never recorded.
    def read( url )
      File.open( body_path( url ), 'r:utf-8' ) {|f| f.read }
    end


    ## add more save / put / etc. aliases - why? why not?
    ##  rename to record_html - why? why not?
    #
    # Writes the response body and headers for url to disk.
    #
    # url      - absolute http(s) url; mapped to a file path via url_to_path
    # response - object responding to #text, #json and #headers (#each |key, value|)
    # format   - 'json' pretty-prints the parsed response.json;
    #            anything else writes response.text as is
    #
    # Raises whatever response.json raises (e.g. JSON::ParserError) BEFORE any
    # file gets created - otherwise a failed parse would leave an empty body
    # file behind that cached? reports as a cache hit.
    def record( url, response, format: 'html' )
      path      = body_path( url )
      meta_path = "#{path}.meta.txt"

      ## todo/check: verify content-type - why? why not?
      ## note - for now always assume utf8!!!!!!!!!
      content = if format == 'json'
                  JSON.pretty_generate( response.json )
                else
                  response.text
                end

      ## make sure path exists
      FileUtils.mkdir_p( File.dirname( path ) )

      puts "[cache] saving #{path}..."

      File.open( path, 'w:utf-8' ) {|f| f.write( content ) }

      File.open( meta_path, 'w:utf-8' ) do |f|
        response.headers.each do |key, value|   # iterate all response headers
          f.write( "#{key}: #{value}\n" )
        end
      end
    end


    ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
    ##   use file:// instead of disk:// - why? why not?
    def url_to_id( str ) "disk://#{url_to_path( str )}"; end


    ### helpers

    # Maps a url to a file path relative to the cache root.
    #
    # Raises ArgumentError if str is not an absolute http(s) url with a host,
    # or if a weltfussball.de / worldfootball.net url does not end with '/'.
    # (Raising - instead of calling exit - lets the caller decide what to do.)
    def url_to_path( str )
      uri = URI.parse( str )

      unless uri.is_a?( URI::HTTP ) && uri.host && !uri.host.empty?
        raise ArgumentError, "expected absolute http(s) url with host; got: >#{str}<"
      end

      ## note: ignore scheme (e.g. http/https)
      ##         and port (e.g. 80, 8080, etc.) for now
      ## always downcase for now (internet domain is case insensitive)
      host_dir = uri.host.downcase

      ## "/this/is/everything?query=params"
      ##   cut-off leading slash and
      ##    convert query ? =
      req_path = uri.request_uri[1..-1]

      if host_dir.include?( 'weltfussball.de' ) ||
         host_dir.include?( 'worldfootball.net' )
        ### special "prettify" rule for weltfussball
        ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
        unless req_path.end_with?( '/' )
          raise ArgumentError, "expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
        end
        req_path = "#{req_path[0..-2]}.html"
      elsif host_dir.include?( 'football-data.org' )
        req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/

        ## flatten - make a file path - for auto-save
        ##   change ? to -I-
        ##   change / to ~~
        ##   change = to ~
        req_path = req_path.gsub( '?', '-I-' )
                           .gsub( '/', '~~' )
                           .gsub( '=', '~' )

        req_path = "#{req_path}.json"
      end
      ## else: no special rule

      "#{host_dir}/#{req_path}"
    end


    private

    # Absolute path of the body file for url below the configured cache root.
    def body_path( url )
      "#{Webcache.root}/#{url_to_path( url )}"
    end
  end # class DiskCache

end # module Webcache
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
class Webclient

  # Thin wrapper around a Net::HTTPResponse offering text / json / headers /
  # status helpers.
  class Response   # nested class - wrap Net::HTTP::Response
    def initialize( response )
      @response = response
    end

    # Returns the wrapped (raw) Net::HTTP response object.
    def raw() @response; end


    # Returns the response body as a utf-8 tagged string ('' if there is no body).
    #
    # note: Net::HTTP will NOT set encoding UTF-8 etc.
    #   will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
    #   thus, set/force encoding to utf-8
    #
    # Works on a copy: force_encoding changes its receiver in place, which
    # would re-tag the raw response body and raises FrozenError for the frozen
    # string returned by nil.to_s (Ruby 2.7+) when the response has no body.
    def text
      @response.body.to_s.dup.force_encoding( Encoding::UTF_8 )
    end

    ## convenience helper; returns parsed json data
    ##   raises JSON::ParserError if the body is not valid json
    def json() JSON.parse( text ); end


    # Iterates the response headers as (key, value) pairs.
    class Headers # nested (nested) class
      include Enumerable

      def initialize( response )
        @response = response
      end

      # Yields key and value for every response header
      # (delegates to Net::HTTPHeader#each_header).
      def each( &blk )
        @response.each_header( &blk )
      end
    end
    def headers() @headers ||= Headers.new( @response ); end


    # Status line helpers (numeric code, message, ok? / nok?).
    class Status  # nested (nested) class
      def initialize( response )
        @response = response
      end
      def code()    @response.code.to_i; end
      def ok?()     code == 200; end   ## note: ONLY HTTP 200 counts as ok
      def nok?()    !ok?; end
      def message() @response.message; end
    end
    def status() @status ||= Status.new( @response ); end
  end # (nested) class Response


  # Performs a HTTP GET request and returns the wrapped response.
  #
  # url        - absolute http or https url (string)
  # headers    - optional hash of (custom) request headers e.g.
  #                { 'X-Auth-Token' => 'xxxxxxx', 'User-Agent' => 'ruby' }
  # verify_ssl - verify the server certificate for https urls (default: true);
  #              pass false ONLY for hosts with a known-broken certificate
  #              chain - without verification the connection is open to
  #              man-in-the-middle attacks.
  #
  # Returns a Webclient::Response; network errors raised by Net::HTTP
  # are passed through unchanged.
  def self.get( url, headers: {}, verify_ssl: true )
    uri  = URI.parse( url )
    http = Net::HTTP.new( uri.host, uri.port )

    if uri.is_a?( URI::HTTPS )
      http.use_ssl     = true
      http.verify_mode = verify_ssl ? OpenSSL::SSL::VERIFY_PEER : OpenSSL::SSL::VERIFY_NONE
    end

    request = Net::HTTP::Get.new( uri.request_uri )

    ### add (custom) headers if any
    ##  check/todo: is there a more idiomatic way for Net::HTTP ???
    ##   use
    ##     request = Net::HTTP::Get.new( uri.request_uri, headers )
    ##    why? why not?
    (headers || {}).each do |key, value|
      request[ key ] = value
    end

    ## note: return "unified" wrapped response
    Response.new( http.request( request ) )
  end # method self.get

end # class Webclient
|
85
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
|
2
|
+
class Webget   # a web (go get) crawler

  # Holds the crawler settings; currently only the delay before each request.
  class Configuration  ## nested class

    #######################
    ## accessors
    ##   seconds to wait before each http request; defaults to 3
    def sleep()       @sleep || 3; end     ### todo/check: use delay / wait or such?
    def sleep=(value) @sleep = value; end

  end # (nested) class Configuration

  ## lets you use
  ##   Webget.configure do |config|
  ##      config.sleep = 10
  ##   end
  def self.configure() yield( config ); end
  def self.config()    @config ||= Configuration.new; end


  # Fetches url and, on HTTP 200, records the response in Webcache as json
  # (pretty printed; doubles as a parse check). Returns the response either way.
  def self.call( url, headers: {} )  ## assumes json format
    fetch_and_record( url, headers: headers, format: 'json' )
  end # method self.call


  # Fetches url and, on HTTP 200, records the response in Webcache as html.
  # Returns the response either way.
  def self.page( url, headers: {} )  ## assumes html format
    fetch_and_record( url, headers: headers, format: 'html' )
  end # method self.page


  # Shared implementation of call / page:
  # waits config.sleep seconds, issues the GET request via Webclient and
  # records the response in Webcache with the given format if the status is ok;
  # otherwise prints the status line and dumps the raw response.
  def self.fetch_and_record( url, headers:, format: )
    puts "  sleep #{config.sleep} sec(s)..."
    sleep( config.sleep )   ## slow down - sleep 3secs before each http request

    response = Webclient.get( url, headers: headers )

    if response.status.ok?  ## must be HTTP 200
      puts "#{response.status.code} #{response.status.message}"
      Webcache.record( url, response, format: format )
    else
      ## todo/check - log error
      puts "!! ERROR - #{response.status.code} #{response.status.message}:"
      pp response.raw   ## note: dump inner (raw) response (NOT the wrapped)
    end

    ## to be done / continued
    response
  end
  private_class_method :fetch_and_record

end # class Webget
|
64
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '3.22'
|
47
|
-
description:
|
47
|
+
description: webget gem - yet (another) network client for world wide web (www) requests
|
48
48
|
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
50
50
|
extensions: []
|
@@ -59,6 +59,9 @@ files:
|
|
59
59
|
- Rakefile
|
60
60
|
- lib/webget.rb
|
61
61
|
- lib/webget/version.rb
|
62
|
+
- lib/webget/webcache.rb
|
63
|
+
- lib/webget/webclient.rb
|
64
|
+
- lib/webget/webget.rb
|
62
65
|
homepage: https://github.com/rubycoco/fetcher
|
63
66
|
licenses:
|
64
67
|
- Public Domain
|