webget 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +4 -4
- data/Rakefile +1 -1
- data/lib/webget.rb +32 -4
- data/lib/webget/version.rb +6 -3
- data/lib/webget/webcache.rb +167 -0
- data/lib/webget/webclient.rb +85 -0
- data/lib/webget/webget.rb +64 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e0feb02f55f01692b3353f5aaf15ab2201dfb02
|
4
|
+
data.tar.gz: da3d146f2fc6db90e2a34c9c385d7e9e9d3c272c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49fb39362398e09eac27d929f0689a604234818f8ff6fc2422217977e6312fe46bf0273059d9310ba9b40dea62b75bc437b6e90cc36d5e39456d1927e36739ee
|
7
|
+
data.tar.gz: b81245bc4a7029eae5712aa94f48784240b6666d48ddc270c360980bc6f5599508ff49c7d128bafdff8419b378ab1c6d83282090a6ec10ad1a77a228fb70232a
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -2,10 +2,10 @@
|
|
2
2
|
|
3
3
|
webget gem - yet (another) network client for world wide web (www) requests via HTTP
|
4
4
|
|
5
|
-
* home :: [github.com/
|
6
|
-
* bugs :: [github.com/
|
7
|
-
* gem :: [rubygems.org/gems/
|
8
|
-
* rdoc :: [rubydoc.info/gems/
|
5
|
+
* home :: [github.com/rubycoco/webget](https://github.com/rubycoco/webget)
|
6
|
+
* bugs :: [github.com/rubycoco/webget/issues](https://github.com/rubycoco/webget/issues)
|
7
|
+
* gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
|
8
|
+
* rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
|
9
9
|
|
10
10
|
|
11
11
|
## Usage
|
data/Rakefile
CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'webget' do
|
|
5
5
|
|
6
6
|
self.version = Webget::VERSION
|
7
7
|
|
8
|
-
self.summary = 'webget gem - yet (another) network client for world wide web (www) requests
|
8
|
+
self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
|
9
9
|
self.description = summary
|
10
10
|
|
11
11
|
self.urls = { home: 'https://github.com/rubycoco/fetcher' }
|
data/lib/webget.rb
CHANGED
@@ -1,7 +1,35 @@
|
|
1
|
+
require 'pp'
|
2
|
+
require 'time'
|
3
|
+
require 'date'
|
4
|
+
require 'fileutils'
|
1
5
|
|
2
|
-
|
3
|
-
require '
|
6
|
+
require 'uri'
|
7
|
+
require 'net/http'
|
8
|
+
require 'net/https'
|
4
9
|
|
10
|
+
require 'json'
|
11
|
+
require 'yaml'
|
5
12
|
|
6
|
-
|
7
|
-
|
13
|
+
|
14
|
+
|
15
|
+
## our own code
|
16
|
+
require 'webget/version' # let version go first
|
17
|
+
require 'webget/webclient'
|
18
|
+
require 'webget/webcache'
|
19
|
+
require 'webget/webget'
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
############
|
25
|
+
## add convenience alias for camel case / alternate different spelling
|
26
|
+
WebCache = Webcache
|
27
|
+
WebClient = Webclient
|
28
|
+
WebGet = Webget
|
29
|
+
|
30
|
+
## use Webgo as (alias) name (keep reserved for now) - why? why not?
|
31
|
+
WebGo = Webget
|
32
|
+
Webgo = Webget
|
33
|
+
|
34
|
+
|
35
|
+
puts Webget.banner # say hello
|
data/lib/webget/version.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
|
2
2
|
class Webget
|
3
|
+
|
3
4
|
MAJOR = 0 ## todo: namespace inside version or something - why? why not??
|
4
|
-
MINOR =
|
5
|
-
PATCH =
|
5
|
+
MINOR = 1
|
6
|
+
PATCH = 0
|
6
7
|
VERSION = [MAJOR,MINOR,PATCH].join('.')
|
7
8
|
|
8
9
|
def self.version
|
9
10
|
VERSION
|
10
11
|
end
|
11
12
|
|
13
|
+
# version string for generator meta tag (includes ruby version)
|
12
14
|
def self.banner
|
13
15
|
"webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
|
14
16
|
end
|
@@ -16,5 +18,6 @@ class Webget
|
|
16
18
|
def self.root
|
17
19
|
"#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
|
18
20
|
end
|
19
|
-
|
21
|
+
|
22
|
+
end # class Webget
|
20
23
|
|
@@ -0,0 +1,167 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# Webcache - records web responses (body plus response headers) in a local
# cache and answers lookups by url. All module-level helpers delegate to the
# storage object returned by Webcache.cache (a DiskCache).
module Webcache

  #####
  # copied from props gem, see Env.home
  #  - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
  #   todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
  #
  # Returns the user's home directory as an expanded path.
  # Lookup order: ENV['HOME'], ENV['USERPROFILE'], ENV['HOMEDRIVE'] + ENV['HOMEPATH'],
  # File.expand_path('~') and - if that raises - 'C:/' or '/'.
  def self.home
    path = ENV['HOME'] || ENV['USERPROFILE']
    path ||= "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}" if ENV['HOMEDRIVE'] && ENV['HOMEPATH']
    path ||= begin
               File.expand_path( '~' )
             rescue StandardError
               ## File::ALT_SEPARATOR is only set where an alternate separator (\) exists
               File::ALT_SEPARATOR ? 'C:/' : '/'
             end

    ## note: use File.expand_path to "unify" path e.g
    ##    C:\Users\roman  becomes
    ##    C:/Users/roman
    File.expand_path( path )
  end


  # Holds the cache settings; for now only the root directory.
  class Configuration
    ## root directory - todo/check: find/use a better name - why? why not?
    ##   defaults to <home>/.cache if not set
    def root
      @root || "#{Webcache.home}/.cache"
    end

    def root=( value )
      @root = value
    end
  end # class Configuration


  ## lets you use
  ##   Webcache.configure do |config|
  ##      config.root = './cache'
  ##   end
  def self.configure
    yield( config )
  end

  ## returns the (lazily created) shared configuration
  def self.config
    @config ||= Configuration.new
  end


  ## add "high level" root convenience helpers
  def self.root
    config.root
  end

  def self.root=( value )
    config.root = value
  end


  ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
  def self.cache
    @cache ||= DiskCache.new
  end

  ## store the response for url; format is 'html' (default) or 'json'
  def self.record( url, response, format: 'html' )
    cache.record( url, response, format: format )
  end

  ## true if a body for url is in the cache
  def self.cached?( url )
    cache.cached?( url )
  end

  class << self
    alias_method :exist?, :cached?
  end

  ## todo/check: rename to just id or something - why? why not?
  def self.url_to_id( url )
    cache.url_to_id( url )
  end

  ## returns the cached body for url as a utf-8 string
  def self.read( url )
    cache.read( url )
  end


  # Filesystem-backed cache: every url maps to one body file below
  # Webcache.root plus a "<body file>.meta.txt" holding the response headers.
  class DiskCache
    ## true if the body file for url exists
    def cached?( url )
      File.exist?( body_path_for( url ) )
    end
    alias_method :exist?, :cached?


    ## returns the content of the body file for url (read as utf-8);
    ##   File.open raises (e.g. Errno::ENOENT) if nothing is cached
    def read( url )
      File.open( body_path_for( url ), 'r:utf-8' ) { |f| f.read }
    end


    ## add more save / put / etc. aliases - why? why not?
    ##  rename to record_html - why? why not?
    ##
    ## Writes the response body and a .meta.txt file with one "key: value"
    ## line per response header.
    ##   response must respond to text, json and headers (with each)
    ##   format 'json' pretty prints the parsed json; anything else writes response.text
    def record( url, response, format: 'html' )
      body_path = body_path_for( url )
      meta_path = "#{body_path}.meta.txt"

      ## make sure path exists
      FileUtils.mkdir_p( File.dirname( body_path ) )

      puts "[cache] saving #{body_path}..."

      ## todo/check: verify content-type - why? why not?
      ## note: build the body BEFORE opening the file - a json parse error
      ##         must not leave an empty (truncated) cache file behind
      ## note - for now always assume utf8!!!!!!!!!
      body = format == 'json' ? JSON.pretty_generate( response.json ) : response.text
      File.open( body_path, 'w:utf-8' ) { |f| f.write( body ) }

      File.open( meta_path, 'w:utf-8' ) do |f|
        response.headers.each do |key, value|   # iterate all response headers
          f.write( "#{key}: #{value}\n" )
        end
      end
    end


    ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
    ##   use file:// instead of disk:// - why? why not?
    def url_to_id( str )
      "disk://#{url_to_path( str )}"
    end


    ### helpers

    ## Maps an absolute http(s) url to a relative file path "<host>/<request path>".
    ##   raises ArgumentError if the url has no host / is not http(s) or
    ##   if a weltfussball / worldfootball url does not end with '/'
    ##   (URI.parse itself raises URI::InvalidURIError for malformed input)
    ##
    ## NOTE(review): the request path is used as-is - '..' segments or an empty
    ##   path are not normalized here; confirm if untrusted urls can get passed in
    def url_to_path( str )
      uri = URI.parse( str )

      ## was: NoMethodError on nil host / missing request_uri
      unless uri.is_a?( URI::HTTP ) && !uri.host.to_s.empty?
        raise ArgumentError, "expected absolute http(s) url with host; got: >#{str}<"
      end

      ## note: ignore scheme (e.g. http/https)
      ##         and port (e.g. 80, 8080, etc.) for now
      ## always downcase for now (internet domain is case insensitive)
      host_dir = uri.host.downcase

      ## "/this/is/everything?query=params"
      ##   cut-off leading slash and
      ##    convert query ? =
      req_path = uri.request_uri[1..-1]

      if host_dir.include?( 'weltfussball.de' ) ||
         host_dir.include?( 'worldfootball.net' )
        ### special "prettify" rule for weltfussball
        ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
        unless req_path.end_with?( '/' )
          ## note: raise instead of exit 1 - a library must not terminate the host process
          raise ArgumentError, "expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
        end
        req_path = "#{req_path[0..-2]}.html"
      elsif host_dir.include?( 'football-data.org' )
        req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/

        ## flatten - make a file path - for auto-save
        ##   change ? to -I-
        ##   change / to ~~
        ##   change = to ~
        req_path = req_path.gsub( '?', '-I-' )
                           .gsub( '/', '~~' )
                           .gsub( '=', '~' )

        req_path = "#{req_path}.json"
      end
      ## else: no special rule

      "#{host_dir}/#{req_path}"
    end

    private

    ## full path of the body file for url (below the configured cache root)
    def body_path_for( url )
      "#{Webcache.root}/#{url_to_path( url )}"
    end
  end # class DiskCache

end # module Webcache
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
# Webclient - minimal HTTP GET client on top of Net::HTTP;
# Webclient.get returns a Webclient::Response wrapper.
class Webclient

  # nested class - wrap Net::HTTP::Response
  class Response
    def initialize( response )
      @response = response
    end

    ## the wrapped (raw) response object
    def raw
      @response
    end


    ## Returns the response body as a (new) utf-8 string; '' if there is no body.
    def text
      # note: Net::HTTP will NOT set encoding UTF-8 etc.
      #   will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
      #   thus, set/force encoding to utf-8
      #
      # note: work on a copy (dup) - nil.to_s returns a frozen string (ruby 2.7+)
      #   and force_encoding on the original would change raw.body in place
      @response.body.to_s.dup.force_encoding( Encoding::UTF_8 )
    end

    ## convenience helper; returns parsed json data
    ##   (JSON.parse raises JSON::ParserError if text is not valid json)
    def json
      JSON.parse( text )
    end


    # nested (nested) class - iterates the response headers
    class Headers
      def initialize( response )
        @response = response
      end

      ## yields key, value for every response header;
      ##   the block gets passed on as-is to each_header of the wrapped response
      def each( &blk )
        @response.each_header( &blk )
      end
    end

    def headers
      @headers ||= Headers.new( @response )
    end


    # nested (nested) class - status code and message of the response
    class Status
      def initialize( response )
        @response = response
      end

      ## status code as integer e.g. 200
      def code
        @response.code.to_i
      end

      ## note: ok means exactly HTTP 200 (NOT any 2xx)
      def ok?
        code == 200
      end

      def nok?
        code != 200
      end

      def message
        @response.message
      end
    end

    def status
      @status ||= Status.new( @response )
    end
  end # (nested) class Response


  ## Sends a GET request to url and returns the wrapped response.
  ##   url     - absolute http or https url (string)
  ##   headers - optional request headers e.g. { 'User-Agent' => 'ruby' }
  ## note: non-200 responses are returned too (check response.status.ok?);
  ##   errors raised by URI.parse or Net::HTTP are NOT rescued here
  def self.get( url, headers: {} )
    uri  = URI.parse( url )
    http = Net::HTTP.new( uri.host, uri.port )

    if uri.is_a?( URI::HTTPS )
      http.use_ssl = true
      ## note: always verify the server certificate -
      ##   VERIFY_NONE would silently accept any (forged) certificate
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
    end

    request = Net::HTTP::Get.new( uri.request_uri )

    ### add (custom) headers if any
    ##  check/todo: is there a more idiomatic way for Net::HTTP ???
    ##   use
    ##     request = Net::HTTP::Get.new( uri.request_uri, headers )
    ##    why? why not?
    ##  instead of e.g.
    ##   request['X-Auth-Token'] = 'xxxxxxx'
    ##   request['User-Agent']   = 'ruby'
    ##   request['Accept']       = '*/*'
    (headers || {}).each do |key, value|
      request[ key ] = value
    end

    response = http.request( request )

    ## note: return "unified" wrapped response
    Response.new( response )
  end # method self.get

end # class Webclient
|
85
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
|
2
|
+
class Webget  # a web (go get) crawler

  class Configuration  ## nested class

    #######################
    ## accessors

    ## pause in seconds before every request; defaults to 3 if not set
    ### todo/check: use delay / wait or such?
    def sleep
      @sleep || 3
    end

    def sleep=( value )
      @sleep = value
    end

  end # (nested) class Configuration

  ## lets you use
  ##   Webget.configure do |config|
  ##      config.sleep = 10
  ##   end
  def self.configure
    yield( config )
  end

  ## returns the (lazily created) shared configuration
  def self.config
    @config ||= Configuration.new
  end


  ## Gets url and - if the status is HTTP 200 - records the response
  ## in the web cache as json. Always returns the (wrapped) response.
  def self.call( url, headers: {} )  ## assumes json format
    ## note: use format json for pretty printing and parse check!!!!
    fetch_and_record( url, headers: headers, format: 'json' )
  end # method self.call


  ## Gets url and - if the status is HTTP 200 - records the response
  ## in the web cache as html. Always returns the (wrapped) response.
  def self.page( url, headers: {} )  ## assumes html format
    fetch_and_record( url, headers: headers, format: 'html' )
  end # method self.page


  ## shared workhorse for call and page: sleep, get, record if ok, else dump error
  def self.fetch_and_record( url, headers:, format: )
    puts " sleep #{config.sleep} sec(s)..."
    sleep( config.sleep )   ## slow down - sleep 3secs before each http request

    response = Webclient.get( url, headers: headers )

    if response.status.ok?  ## must be HTTP 200
      puts "#{response.status.code} #{response.status.message}"
      Webcache.record( url, response, format: format )
    else
      ## todo/check - log error
      puts "!! ERROR - #{response.status.code} #{response.status.message}:"
      pp response.raw   ## note: dump inner (raw) response (NOT the wrapped)
    end

    ## to be done / continued
    response
  end
  private_class_method :fetch_and_record

end # class Webget
|
64
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
@@ -44,7 +44,7 @@ dependencies:
|
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '3.22'
|
47
|
-
description:
|
47
|
+
description: webget gem - yet (another) network client for world wide web (www) requests
|
48
48
|
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
50
50
|
extensions: []
|
@@ -59,6 +59,9 @@ files:
|
|
59
59
|
- Rakefile
|
60
60
|
- lib/webget.rb
|
61
61
|
- lib/webget/version.rb
|
62
|
+
- lib/webget/webcache.rb
|
63
|
+
- lib/webget/webclient.rb
|
64
|
+
- lib/webget/webget.rb
|
62
65
|
homepage: https://github.com/rubycoco/fetcher
|
63
66
|
licenses:
|
64
67
|
- Public Domain
|