webget 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7cf1205972f496fc25b62a33bc47146e4395ca9d
4
- data.tar.gz: 292bf57c29732fecacd7dae0579cef6cdcba2852
3
+ metadata.gz: 6e0feb02f55f01692b3353f5aaf15ab2201dfb02
4
+ data.tar.gz: da3d146f2fc6db90e2a34c9c385d7e9e9d3c272c
5
5
  SHA512:
6
- metadata.gz: 91395bb6fe0cc5f15cf0ec3f58fd1c06a7f6c97c211a04985910a27dd0826f8ff3a3cf5f336f602cbb6d776169c4900be07405da8ec332f3b9b6dbcc16365e70
7
- data.tar.gz: 4c834cdc685210163ce9fadadfdda4224a888886826e6a6ec17f5d07ee3cd787fa109b8da4e952b322f5e15a91087ffd740ac8f91e35d13e1608571ba0d3a590
6
+ metadata.gz: 49fb39362398e09eac27d929f0689a604234818f8ff6fc2422217977e6312fe46bf0273059d9310ba9b40dea62b75bc437b6e90cc36d5e39456d1927e36739ee
7
+ data.tar.gz: b81245bc4a7029eae5712aa94f48784240b6666d48ddc270c360980bc6f5599508ff49c7d128bafdff8419b378ab1c6d83282090a6ec10ad1a77a228fb70232a
@@ -4,3 +4,6 @@ README.md
4
4
  Rakefile
5
5
  lib/webget.rb
6
6
  lib/webget/version.rb
7
+ lib/webget/webcache.rb
8
+ lib/webget/webclient.rb
9
+ lib/webget/webget.rb
data/README.md CHANGED
@@ -2,10 +2,10 @@
2
2
 
3
3
  webget gem - yet (another) network client for world wide web (www) requests via HTTP
4
4
 
5
- * home :: [github.com/rubylibs/fotos](https://github.com/rubylibs/fotos)
6
- * bugs :: [github.com/rubylibs/fotos/issues](https://github.com/rubylibs/fotos/issues)
7
- * gem :: [rubygems.org/gems/fotos](https://rubygems.org/gems/fotos)
8
- * rdoc :: [rubydoc.info/gems/fotos](http://rubydoc.info/gems/fotos)
5
+ * home :: [github.com/rubycoco/webget](https://github.com/rubycoco/webget)
6
+ * bugs :: [github.com/rubycoco/webget/issues](https://github.com/rubycoco/webget/issues)
7
+ * gem :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
8
+ * rdoc :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
9
9
 
10
10
 
11
11
  ## Usage
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ Hoe.spec 'webget' do
5
5
 
6
6
  self.version = Webget::VERSION
7
7
 
8
- self.summary = 'webget gem - yet (another) network client for world wide web (www) requests '
8
+ self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
9
9
  self.description = summary
10
10
 
11
11
  self.urls = { home: 'https://github.com/rubycoco/fetcher' }
@@ -1,7 +1,35 @@
1
+ require 'pp'
2
+ require 'time'
3
+ require 'date'
4
+ require 'fileutils'
1
5
 
2
- # our own code
3
- require 'webget/version' # note: let version always go first
6
+ require 'uri'
7
+ require 'net/http'
8
+ require 'net/https'
4
9
 
10
+ require 'json'
11
+ require 'yaml'
5
12
 
6
- # say hello
7
- puts Webget.banner if defined?( $RUBYLIBS_DEBUG )
13
+
14
+
15
+ ## our own code
16
+ require 'webget/version' # let version go first
17
+ require 'webget/webclient'
18
+ require 'webget/webcache'
19
+ require 'webget/webget'
20
+
21
+
22
+
23
+
24
+ ############
25
+ ## add convenience alias for camel case / alternate different spelling
26
+ WebCache = Webcache
27
+ WebClient = Webclient
28
+ WebGet = Webget
29
+
30
+ ## use Webgo as (alias) name (keep reserver for now) - why? why not?
31
+ WebGo = Webget
32
+ Webgo = Webget
33
+
34
+
35
+ puts Webget.banner # say hello
@@ -1,14 +1,16 @@
1
1
 
2
2
  class Webget
3
+
3
4
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
4
- MINOR = 0
5
- PATCH = 1
5
+ MINOR = 1
6
+ PATCH = 0
6
7
  VERSION = [MAJOR,MINOR,PATCH].join('.')
7
8
 
8
9
  def self.version
9
10
  VERSION
10
11
  end
11
12
 
13
+ # version string for generator meta tag (includes ruby version)
12
14
  def self.banner
13
15
  "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
14
16
  end
@@ -16,5 +18,6 @@ class Webget
16
18
  def self.root
17
19
  "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
18
20
  end
19
- end # module Webget
21
+
22
+ end # class Webget
20
23
 
@@ -0,0 +1,167 @@
1
+
2
+
3
+ module Webcache
4
+
5
+ #####
6
+ # copied from props gem, see Env.home
7
+ # - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
8
+ # todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
9
+ def self.home
10
+ path = if( ENV['HOME'] || ENV['USERPROFILE'] )
11
+ ENV['HOME'] || ENV['USERPROFILE']
12
+ elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
13
+ "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
14
+ else
15
+ begin
16
+ File.expand_path('~')
17
+ rescue
18
+ if File::ALT_SEPARATOR
19
+ 'C:/'
20
+ else
21
+ '/'
22
+ end
23
+ end
24
+ end
25
+
26
+ ## note: use File.expand_path to "unify" path e.g
27
+ ## C:\Users\roman becomes
28
+ ## C:/Users/roman
29
+
30
+ File.expand_path( path )
31
+ end
32
+
33
+
34
+ class Configuration
35
+ ## root directory - todo/check: find/use a better name - why? why not?
36
+ def root() @root || "#{Webcache.home}/.cache"; end
37
+ def root=(value) @root = value; end
38
+ end # class Configuration
39
+
40
+
41
+ ## lets you use
42
+ ## Webcache.configure do |config|
43
+ ## config.root = './cache'
44
+ ## end
45
+ def self.configure() yield( config ); end
46
+ def self.config() @config ||= Configuration.new; end
47
+
48
+
49
+ ## add "high level" root convenience helpers
50
+ def self.root() config.root; end
51
+ def self.root=(value) config.root = value; end
52
+
53
+
54
+ ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
55
+ def self.cache() @cache ||= DiskCache.new; end
56
+
57
+ def self.record( url, response, format: 'html' )
58
+ cache.record( url, response, format: format );
59
+ end
60
+ def self.cached?( url ) cache.cached?( url ); end
61
+ class << self
62
+ alias_method :exist?, :cached?
63
+ end
64
+ def self.url_to_id( url ) cache.url_to_id( url ); end ## todo/check: rename to just id or something - why? why not?
65
+ def self.read( url ) cache.read( url ); end
66
+
67
+
68
+ class DiskCache
69
+ def cached?( url )
70
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
71
+ File.exist?( body_path )
72
+ end
73
+ alias_method :exist?, :cached?
74
+
75
+
76
+ def read( url )
77
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
78
+ File.open( body_path, 'r:utf-8' ) {|f| f.read }
79
+ end
80
+
81
+
82
+ ## add more save / put / etc. aliases - why? why not?
83
+ ## rename to record_html - why? why not?
84
+ def record( url, response, format: 'html' )
85
+
86
+ body_path = "#{Webcache.root}/#{url_to_path( url )}"
87
+ meta_path = "#{body_path}.meta.txt"
88
+
89
+ ## make sure path exits
90
+ FileUtils.mkdir_p( File.dirname( body_path ) )
91
+
92
+
93
+ puts "[cache] saving #{body_path}..."
94
+
95
+ ## todo/check: verify content-type - why? why not?
96
+ if format == 'json'
97
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
98
+ else
99
+ ## note - for now always assume utf8!!!!!!!!!
100
+ File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
101
+ end
102
+
103
+ File.open( meta_path, 'w:utf-8' ) do |f|
104
+ response.headers.each do |key, value| # iterate all response headers
105
+ f.write( "#{key}: #{value}" )
106
+ f.write( "\n" )
107
+ end
108
+ end
109
+ end
110
+
111
+
112
+
113
+ ### note: use file path as id for DiskCache (is different for DbCache/SqlCache?)
114
+ ## use file:// instead of disk:// - why? why not?
115
+ def url_to_id( str ) "disk://#{url_to_path( str )}"; end
116
+
117
+
118
+ ### helpers
119
+ def url_to_path( str )
120
+ ## map url to file path
121
+ uri = URI.parse( str )
122
+
123
+ ## note: ignore scheme (e.g. http/https)
124
+ ## and port (e.g. 80, 8080, etc.) for now
125
+ ## always downcase for now (internet domain is case insensitive)
126
+ host_dir = uri.host.downcase
127
+
128
+ ## "/this/is/everything?query=params"
129
+ ## cut-off leading slash and
130
+ ## convert query ? =
131
+ req_path = uri.request_uri[1..-1]
132
+
133
+
134
+
135
+ ### special "prettify" rule for weltfussball
136
+ ## /eng-league-one-2019-2020/ => /eng-league-one-2019-2020.html
137
+ if host_dir.index( 'weltfussball.de' ) ||
138
+ host_dir.index( 'worldfootball.net' )
139
+ if req_path.end_with?( '/' )
140
+ req_path = "#{req_path[0..-2]}.html"
141
+ else
142
+ puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
143
+ exit 1
144
+ end
145
+ elsif host_dir.index( 'football-data.org' )
146
+ req_path = req_path.sub( 'v2/', '' ) # shorten - cut off v2/
147
+
148
+ ## flatten - make a file path - for auto-save
149
+ ## change ? to -I-
150
+ ## change / to ~~
151
+ ## change = to ~
152
+ req_path = req_path.gsub( '?', '-I-' )
153
+ .gsub( '/', '~~' )
154
+ .gsub( '=', '~')
155
+
156
+ req_path = "#{req_path}.json"
157
+ else
158
+ ## no special rule
159
+ end
160
+
161
+ page_path = "#{host_dir}/#{req_path}"
162
+ page_path
163
+ end
164
+ end # class DiskCache
165
+
166
+
167
+ end # module Webcache
@@ -0,0 +1,85 @@
1
+
2
+ class Webclient
3
+
4
+ class Response # nested class - wrap Net::HTTPResponse
5
+ def initialize( response )
6
+ @response = response
7
+ end
8
+ def raw() @response; end
9
+
10
+
11
+ def text
12
+ # note: Net::HTTP will NOT set encoding UTF-8 etc.
13
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
14
+ # thus, set/force encoding to utf-8
15
+ text = @response.body.to_s
16
+ text = text.force_encoding( Encoding::UTF_8 )
17
+ text
18
+ end
19
+
20
+ ## convenience helper; returns parsed json data
21
+ def json() JSON.parse( text ); end
22
+
23
+
24
+
25
+ class Headers # nested (nested) class
26
+ def initialize( response )
27
+ @response = response
28
+ end
29
+ def each( &blk )
30
+ @response.each_header do |key, value| # Iterate all response headers
31
+ blk.call( key, value )
32
+ end
33
+ end
34
+ end
35
+ def headers() @headers ||= Headers.new( @response ); end
36
+
37
+ class Status # nested (nested) class
38
+ def initialize( response )
39
+ @response = response
40
+ end
41
+ def code() @response.code.to_i; end
42
+ def ok?() code == 200; end
43
+ def nok?() code != 200; end
44
+ def message() @response.message; end
45
+ end
46
+ def status() @status ||= Status.new( @response ); end
47
+ end # (nested) class Response
48
+
49
+
50
+ def self.get( url, headers: {} )
51
+
52
+ uri = URI.parse( url )
53
+ http = Net::HTTP.new( uri.host, uri.port )
54
+
55
+ if uri.instance_of? URI::HTTPS
56
+ http.use_ssl = true
57
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
58
+ end
59
+
60
+ request = Net::HTTP::Get.new( uri.request_uri )
61
+
62
+ ### add (custom) headers if any
63
+ ## check/todo: is there a more idiomatic way for Net::HTTP ???
64
+ ## use
65
+ ## request = Net::HTTP::Get.new( uri.request_uri, headers )
66
+ ## why? why not?
67
+ ## instead of e.g.
68
+ ## request['X-Auth-Token'] = 'xxxxxxx'
69
+ ## request['User-Agent'] = 'ruby'
70
+ ## request['Accept'] = '*/*'
71
+ if headers && headers.size > 0
72
+ headers.each do |key,value|
73
+ request[ key ] = value
74
+ end
75
+ end
76
+
77
+
78
+ response = http.request( request )
79
+
80
+ ## note: return "unified" wrapped response
81
+ Response.new( response )
82
+ end # method self.get
83
+
84
+ end # class Webclient
85
+
@@ -0,0 +1,64 @@
1
+
2
+ class Webget # a web (go get) crawler
3
+
4
+ class Configuration ## nested class
5
+
6
+ #######################
7
+ ## accessors
8
+ def sleep() @sleep || 3; end ### todo/check: use delay / wait or such?
9
+ def sleep=(value) @sleep = value; end
10
+
11
+ end # (nested) class Configuration
12
+
13
+ ## lets you use
14
+ ## Webget.configure do |config|
15
+ ## config.sleep = 10
16
+ ## end
17
+ def self.configure() yield( config ); end
18
+ def self.config() @config ||= Configuration.new; end
19
+
20
+
21
+
22
+ def self.call( url, headers: {} ) ## assumes json format
23
+ puts " sleep #{config.sleep} sec(s)..."
24
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
25
+
26
+ response = Webclient.get( url, headers: headers )
27
+
28
+ if response.status.ok? ## must be HTTP 200
29
+ puts "#{response.status.code} #{response.status.message}"
30
+ ## note: use format json for pretty printing and parse check!!!!
31
+ Webcache.record( url, response,
32
+ format: 'json' )
33
+ else
34
+ ## todo/check - log error
35
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
36
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
37
+ end
38
+
39
+ ## to be done / continued
40
+ response
41
+ end # method self.call
42
+
43
+
44
+ def self.page( url, headers: {} ) ## assumes html format
45
+ puts " sleep #{config.sleep} sec(s)..."
46
+ sleep( config.sleep ) ## slow down - sleep 3secs before each http request
47
+
48
+ response = Webclient.get( url, headers: headers )
49
+
50
+ if response.status.ok? ## must be HTTP 200
51
+ puts "#{response.status.code} #{response.status.message}"
52
+ Webcache.record( url, response ) ## assumes format: html (default)
53
+ else
54
+ ## todo/check - log error
55
+ puts "!! ERROR - #{response.status.code} #{response.status.message}:"
56
+ pp response.raw ## note: dump inner (raw) response (NOT the wrapped)
57
+ end
58
+
59
+ ## to be done / continued
60
+ response
61
+ end # method self.page
62
+
63
+ end # class Webget
64
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
@@ -44,7 +44,7 @@ dependencies:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
46
  version: '3.22'
47
- description: 'webget gem - yet (another) network client for world wide web (www) requests '
47
+ description: webget gem - yet (another) network client for world wide web (www) requests
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
50
50
  extensions: []
@@ -59,6 +59,9 @@ files:
59
59
  - Rakefile
60
60
  - lib/webget.rb
61
61
  - lib/webget/version.rb
62
+ - lib/webget/webcache.rb
63
+ - lib/webget/webclient.rb
64
+ - lib/webget/webget.rb
62
65
  homepage: https://github.com/rubycoco/fetcher
63
66
  licenses:
64
67
  - Public Domain