webget 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7cf1205972f496fc25b62a33bc47146e4395ca9d
4
- data.tar.gz: 292bf57c29732fecacd7dae0579cef6cdcba2852
3
+ metadata.gz: 6e0feb02f55f01692b3353f5aaf15ab2201dfb02
4
+ data.tar.gz: da3d146f2fc6db90e2a34c9c385d7e9e9d3c272c
5
5
  SHA512:
6
- metadata.gz: 91395bb6fe0cc5f15cf0ec3f58fd1c06a7f6c97c211a04985910a27dd0826f8ff3a3cf5f336f602cbb6d776169c4900be07405da8ec332f3b9b6dbcc16365e70
7
- data.tar.gz: 4c834cdc685210163ce9fadadfdda4224a888886826e6a6ec17f5d07ee3cd787fa109b8da4e952b322f5e15a91087ffd740ac8f91e35d13e1608571ba0d3a590
6
+ metadata.gz: 49fb39362398e09eac27d929f0689a604234818f8ff6fc2422217977e6312fe46bf0273059d9310ba9b40dea62b75bc437b6e90cc36d5e39456d1927e36739ee
7
+ data.tar.gz: b81245bc4a7029eae5712aa94f48784240b6666d48ddc270c360980bc6f5599508ff49c7d128bafdff8419b378ab1c6d83282090a6ec10ad1a77a228fb70232a
@@ -4,3 +4,6 @@ README.md
4
4
  Rakefile
5
5
  lib/webget.rb
6
6
  lib/webget/version.rb
7
+ lib/webget/webcache.rb
8
+ lib/webget/webclient.rb
9
+ lib/webget/webget.rb
data/README.md CHANGED
@@ -2,10 +2,10 @@
2
2
 
3
3
  webget gem - yet (another) network client for world wide web (www) requests via HTTP
4
4
 
5
- * home :: [github.com/rubylibs/fotos](https://github.com/rubylibs/fotos)
6
- * bugs :: [github.com/rubylibs/fotos/issues](https://github.com/rubylibs/fotos/issues)
7
- * gem :: [rubygems.org/gems/fotos](https://rubygems.org/gems/fotos)
8
- * rdoc :: [rubydoc.info/gems/fotos](http://rubydoc.info/gems/fotos)
5
+ * home :: [github.com/rubycoco/webget](https://github.com/rubycoco/webget)
6
+ * bugs :: [github.com/rubycoco/webget/issues](https://github.com/rubycoco/webget/issues)
7
+ * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
+ * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
9
 
10
10
 
11
11
  ## Usage
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'webget' do
5
5
 
6
6
  self.version = Webget::VERSION
7
7
 
8
- self.summary = 'webget gem - yet (another) network client for world wide web (www) requests '
8
+ self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
9
9
  self.description = summary
10
10
 
11
11
  self.urls = { home: 'https://github.com/rubycoco/fetcher' }
@@ -1,7 +1,35 @@
1
+ require 'pp'
2
+ require 'time'
3
+ require 'date'
4
+ require 'fileutils'
1
5
 
2
- # our own code
3
- require 'webget/version' # note: let version always go first
6
+ require 'uri'
7
+ require 'net/http'
8
+ require 'net/https'
4
9
 
10
+ require 'json'
11
+ require 'yaml'
5
12
 
6
- # say hello
7
- puts Webget.banner if defined?( $RUBYLIBS_DEBUG )
13
+
14
+
15
+ ## our own code
16
+ require 'webget/version' # let version go first
17
+ require 'webget/webclient'
18
+ require 'webget/webcache'
19
+ require 'webget/webget'
20
+
21
+
22
+
23
+
24
+ ############
25
+ ## add convenience alias for camel case / alternate different spelling
26
+ WebCache = Webcache
27
+ WebClient = Webclient
28
+ WebGet = Webget
29
+
30
+ ## use Webgo as (alias) name (keep reserved for now) - why? why not?
31
+ WebGo = Webget
32
+ Webgo = Webget
33
+
34
+
35
+ puts Webget.banner # say hello
@@ -1,14 +1,16 @@
1
1
 
2
2
  class Webget
3
+
3
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
4
- MINOR = 0
5
- PATCH = 1
5
+ MINOR = 1
6
+ PATCH = 0
6
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
7
8
 
8
9
  def self.version
9
10
  VERSION
10
11
  end
11
12
 
13
+ # version string for generator meta tag (includes ruby version)
12
14
  def self.banner
13
15
  "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
14
16
  end
@@ -16,5 +18,6 @@ class Webget
16
18
  def self.root
17
19
  "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
18
20
  end
19
- end # module Webget
21
+
22
+ end # class Webget
20
23
 
@@ -0,0 +1,167 @@
1
+
2
+
3
+ module Webcache
4
+
5
+ #####
6
+ # copied from props gem, see Env.home
7
+ # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
+ # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
+ def self.home
10
+ path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
+ ENV['HOME'] || ENV['USERPROFILE']
12
+ elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
+ "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
+ else
15
+ begin
16
+ File.expand_path('~')
17
+ rescue
18
+ if File::ALT_SEPARATOR
19
+ 'C:/'
20
+ else
21
+ '/'
22
+ end
23
+ end
24
+ end
25
+
26
+ ## note: use File.expand_path to "unify" path e.g
27
+ ## C:\Users\roman becomes
28
+ ## C:/Users/roman
29
+
30
+ File.expand_path( path )
31
+ end
32
+
33
+
34
+ class Configuration
35
+ ## root directory - todo/check: find/use a better name - why? why not?
36
+ def root() @root || "#{Webcache.home}/.cache"; end
37
+ def root=(value) @root = value; end
38
+ end # class Configuration
39
+
40
+
41
+ ## lets you use
42
+ ## Webcache.configure do |config|
43
+ ## config.root = './cache'
44
+ ## end
45
+ def self.configure() yield( config ); end
46
+ def self.config() @config ||= Configuration.new; end
47
+
48
+
49
+ ## add "high level" root convenience helpers
50
+ def self.root() config.root; end
51
+ def self.root=(value) config.root = value; end
52
+
53
+
54
+ ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
+ def self.cache() @cache ||= DiskCache.new; end
56
+
57
+ def self.record( url, response, format: 'html' )
58
+ cache.record( url, response, format: format );
59
+ end
60
+ def self.cached?( url ) cache.cached?( url ); end
61
+ class << self
62
+ alias_method :exist?, :cached?
63
+ end
64
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
+ def self.read( url ) cache.read( url ); end
66
+
67
+
68
+ class DiskCache
69
+ def cached?( url )
70
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
71
+ File.exist?( body_path )
72
+ end
73
+ alias_method :exist?, :cached?
74
+
75
+
76
+ def read( url )
77
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
78
+ File.open( body_path, 'r:utf-8' ) {|f| f.read }
79
+ end
80
+
81
+
82
+ ## add more save / put / etc. aliases - why? why not?
83
+ ## rename to record_html - why? why not?
84
+ def record( url, response, format: 'html' )
85
+
86
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
87
+ meta_path = "#{body_path}.meta.txt"
88
+
89
+ ## make sure path exits
90
+ FileUtils.mkdir_p( File.dirname( body_path ) )
91
+
92
+
93
+ puts "[cache] saving #{body_path}..."
94
+
95
+ ## todo/check: verify content-type - why? why not?
96
+ if format == 'json'
97
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
98
+ else
99
+ ## note - for now always assume utf8!!!!!!!!!
100
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
101
+ end
102
+
103
+ File.open( meta_path, 'w:utf-8' ) do |f|
104
+ response.headers.each do |key, value| # iterate all response headers
105
+ f.write( "#{key}: #{value}" )
106
+ f.write( "\n" )
107
+ end
108
+ end
109
+ end
110
+
111
+
112
+
113
+ ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
114
+ ## use file:// instead of disk:// - why? why not?
115
+ def url_to_id( str ) "disk://#{url_to_path( str )}"; end
116
+
117
+
118
+ ### helpers
119
+ def url_to_path( str )
120
+ ## map url to file path
121
+ uri = URI.parse( str )
122
+
123
+ ## note: ignore scheme (e.g. http/https)
124
+ ## and port (e.g. 80, 8080, etc.) for now
125
+ ## always downcase for now (internet domain is case insensitive)
126
+ host_dir = uri.host.downcase
127
+
128
+ ## "/this/is/everything?query=params"
129
+ ## cut-off leading slash and
130
+ ## convert query ? =
131
+ req_path = uri.request_uri[1..-1]
132
+
133
+
134
+
135
+ ### special "prettify" rule for weltfussball
136
+ ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
137
+ if host_dir.index( 'weltfussball.de' ) ||
138
+ host_dir.index( 'worldfootball.net' )
139
+ if req_path.end_with?( '/' )
140
+ req_path = "#{req_path[0..-2]}.html"
141
+ else
142
+ puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
143
+ exit 1
144
+ end
145
+ elsif host_dir.index( 'football-data.org' )
146
+ req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
147
+
148
+ ## flatten - make a file path - for auto-save
149
+ ## change ? to -I-
150
+ ## change / to ~~
151
+ ## change = to ~
152
+ req_path = req_path.gsub( '?', '-I-' )
153
+ .gsub( '/', '~~' )
154
+ .gsub( '=', '~')
155
+
156
+ req_path = "#{req_path}.json"
157
+ else
158
+ ## no special rule
159
+ end
160
+
161
+ page_path = "#{host_dir}/#{req_path}"
162
+ page_path
163
+ end
164
+ end # class DiskCache
165
+
166
+
167
+ end # module Webcache
@@ -0,0 +1,85 @@
1
+
2
+ class Webclient
3
+
4
+ class Response # nested class - wrap Net::HTTPResponse
5
+ def initialize( response )
6
+ @response = response
7
+ end
8
+ def raw() @response; end
9
+
10
+
11
+ def text
12
+ # note: Net::HTTP will NOT set encoding UTF-8 etc.
13
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
14
+ # thus, set/force encoding to utf-8
15
+ text = @response.body.to_s
16
+ text = text.force_encoding( Encoding::UTF_8 )
17
+ text
18
+ end
19
+
20
+ ## convenience helper; returns parsed json data
21
+ def json() JSON.parse( text ); end
22
+
23
+
24
+
25
+ class Headers # nested (nested) class
26
+ def initialize( response )
27
+ @response = response
28
+ end
29
+ def each( &blk )
30
+ @response.each_header do |key, value| # Iterate all response headers
31
+ blk.call( key, value )
32
+ end
33
+ end
34
+ end
35
+ def headers() @headers ||= Headers.new( @response ); end
36
+
37
+ class Status # nested (nested) class
38
+ def initialize( response )
39
+ @response = response
40
+ end
41
+ def code() @response.code.to_i; end
42
+ def ok?() code == 200; end
43
+ def nok?() code != 200; end
44
+ def message() @response.message; end
45
+ end
46
+ def status() @status ||= Status.new( @response ); end
47
+ end # (nested) class Response
48
+
49
+
50
+ def self.get( url, headers: {} )
51
+
52
+ uri = URI.parse( url )
53
+ http = Net::HTTP.new( uri.host, uri.port )
54
+
55
+ if uri.instance_of? URI::HTTPS
56
+ http.use_ssl = true
57
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
58
+ end
59
+
60
+ request = Net::HTTP::Get.new( uri.request_uri )
61
+
62
+ ### add (custom) headers if any
63
+ ## check/todo: is there a more idiomatic way for Net::HTTP ???
64
+ ## use
65
+ ## request = Net::HTTP::Get.new( uri.request_uri, headers )
66
+ ## why? why not?
67
+ ## instead of e.g.
68
+ ## request['X-Auth-Token'] = 'xxxxxxx'
69
+ ## request['User-Agent'] = 'ruby'
70
+ ## request['Accept'] = '*/*'
71
+ if headers && headers.size > 0
72
+ headers.each do |key,value|
73
+ request[ key ] = value
74
+ end
75
+ end
76
+
77
+
78
+ response = http.request( request )
79
+
80
+ ## note: return "unified" wrapped response
81
+ Response.new( response )
82
+ end # method self.get
83
+
84
+ end # class Webclient
85
+
@@ -0,0 +1,64 @@
1
+
2
+ class Webget # a web (go get) crawler
3
+
4
+ class Configuration ## nested class
5
+
6
+ #######################
7
+ ## accessors
8
+ def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
9
+ def sleep=(value) @sleep = value; end
10
+
11
+ end # (nested) class Configuration
12
+
13
+ ## lets you use
14
+ ## Webget.configure do |config|
15
+ ## config.sleep = 10
16
+ ## end
17
+ def self.configure() yield( config ); end
18
+ def self.config() @config ||= Configuration.new; end
19
+
20
+
21
+
22
+ def self.call( url, headers: {} ) ## assumes json format
23
+ puts " sleep #{config.sleep} sec(s)..."
24
+ sleep( config.sleep ) ## slow down - sleep (default: 3 secs) before each http request
25
+
26
+ response = Webclient.get( url, headers: headers )
27
+
28
+ if response.status.ok? ## must be HTTP 200
29
+ puts "#{response.status.code} #{response.status.message}"
30
+ ## note: use format json for pretty printing and parse check!!!!
31
+ Webcache.record( url, response,
32
+ format: 'json' )
33
+ else
34
+ ## todo/check - log error
35
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
36
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
37
+ end
38
+
39
+ ## to be done / continued
40
+ response
41
+ end # method self.call
42
+
43
+
44
+ def self.page( url, headers: {} ) ## assumes html format
45
+ puts " sleep #{config.sleep} sec(s)..."
46
+ sleep( config.sleep ) ## slow down - sleep (default: 3 secs) before each http request
47
+
48
+ response = Webclient.get( url, headers: headers )
49
+
50
+ if response.status.ok? ## must be HTTP 200
51
+ puts "#{response.status.code} #{response.status.message}"
52
+ Webcache.record( url, response ) ## assumes format: html (default)
53
+ else
54
+ ## todo/check - log error
55
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
56
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
57
+ end
58
+
59
+ ## to be done / continued
60
+ response
61
+ end # method self.page
62
+
63
+ end # class Webget
64
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
@@ -44,7 +44,7 @@ dependencies:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
46
  version: '3.22'
47
- description: 'webget gem - yet (another) network client for world wide web (www) requests '
47
+ description: webget gem - yet (another) network client for world wide web (www) requests
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
50
50
  extensions: []
@@ -59,6 +59,9 @@ files:
59
59
  - Rakefile
60
60
  - lib/webget.rb
61
61
  - lib/webget/version.rb
62
+ - lib/webget/webcache.rb
63
+ - lib/webget/webclient.rb
64
+ - lib/webget/webget.rb
62
65
  homepage: https://github.com/rubycoco/fetcher
63
66
  licenses:
64
67
  - Public Domain