webget 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Manifest.txt +3 -0
 - data/README.md +4 -4
 - data/Rakefile +1 -1
 - data/lib/webget.rb +32 -4
 - data/lib/webget/version.rb +6 -3
 - data/lib/webget/webcache.rb +167 -0
 - data/lib/webget/webclient.rb +85 -0
 - data/lib/webget/webget.rb +64 -0
 - metadata +5 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 6e0feb02f55f01692b3353f5aaf15ab2201dfb02
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: da3d146f2fc6db90e2a34c9c385d7e9e9d3c272c
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 49fb39362398e09eac27d929f0689a604234818f8ff6fc2422217977e6312fe46bf0273059d9310ba9b40dea62b75bc437b6e90cc36d5e39456d1927e36739ee
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: b81245bc4a7029eae5712aa94f48784240b6666d48ddc270c360980bc6f5599508ff49c7d128bafdff8419b378ab1c6d83282090a6ec10ad1a77a228fb70232a
         
     | 
    
        data/Manifest.txt
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | 
         @@ -2,10 +2,10 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            webget gem - yet (another) network client for world wide web (www) requests via HTTP
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
       5 
     | 
    
         
            -
            * home  :: [github.com/ 
     | 
| 
       6 
     | 
    
         
            -
            * bugs  :: [github.com/ 
     | 
| 
       7 
     | 
    
         
            -
            * gem   :: [rubygems.org/gems/ 
     | 
| 
       8 
     | 
    
         
            -
            * rdoc  :: [rubydoc.info/gems/ 
     | 
| 
      
 5 
     | 
    
         
            +
            * home  :: [github.com/rubycoco/webget](https://github.com/rubycoco/webget)
         
     | 
| 
      
 6 
     | 
    
         
            +
            * bugs  :: [github.com/rubycoco/webget/issues](https://github.com/rubycoco/webget/issues)
         
     | 
| 
      
 7 
     | 
    
         
            +
            * gem   :: [rubygems.org/gems/webget](https://rubygems.org/gems/webget)
         
     | 
| 
      
 8 
     | 
    
         
            +
            * rdoc  :: [rubydoc.info/gems/webget](http://rubydoc.info/gems/webget)
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
11 
     | 
    
         
             
            ## Usage
         
     | 
    
        data/Rakefile
    CHANGED
    
    | 
         @@ -5,7 +5,7 @@ Hoe.spec 'webget' do 
     | 
|
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
              self.version = Webget::VERSION
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
     | 
    
         
            -
              self.summary = 'webget gem - yet (another) network client for world wide web (www) requests 
     | 
| 
      
 8 
     | 
    
         
            +
              self.summary = 'webget gem - yet (another) network client for world wide web (www) requests'
         
     | 
| 
       9 
9 
     | 
    
         
             
              self.description = summary
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
11 
     | 
    
         
             
              self.urls    = { home: 'https://github.com/rubycoco/fetcher' }
         
     | 
    
        data/lib/webget.rb
    CHANGED
    
    | 
         @@ -1,7 +1,35 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'pp'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'time'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'date'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'fileutils'
         
     | 
| 
       1 
5 
     | 
    
         | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
            require ' 
     | 
| 
      
 6 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require 'net/https'
         
     | 
| 
       4 
9 
     | 
    
         | 
| 
      
 10 
     | 
    
         
            +
            require 'json'
         
     | 
| 
      
 11 
     | 
    
         
            +
            require 'yaml'
         
     | 
| 
       5 
12 
     | 
    
         | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            ## our own code
         
     | 
| 
      
 16 
     | 
    
         
            +
            require 'webget/version'   # let version go first
         
     | 
| 
      
 17 
     | 
    
         
            +
            require 'webget/webclient'
         
     | 
| 
      
 18 
     | 
    
         
            +
            require 'webget/webcache'
         
     | 
| 
      
 19 
     | 
    
         
            +
            require 'webget/webget'
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            ############
         
     | 
| 
      
 25 
     | 
    
         
            +
            ## add convenience alias for camel case / alternate different spelling
         
     | 
| 
      
 26 
     | 
    
         
            +
            WebCache  = Webcache
         
     | 
| 
      
 27 
     | 
    
         
            +
            WebClient = Webclient
         
     | 
| 
      
 28 
     | 
    
         
            +
            WebGet    = Webget
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            ## use Webgo as (alias) name (keep reserved for now) - why? why not?
         
     | 
| 
      
 31 
     | 
    
         
            +
            WebGo    = Webget
         
     | 
| 
      
 32 
     | 
    
         
            +
            Webgo    = Webget
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            puts Webget.banner   # say hello
         
     | 
    
        data/lib/webget/version.rb
    CHANGED
    
    | 
         @@ -1,14 +1,16 @@ 
     | 
|
| 
       1 
1 
     | 
    
         | 
| 
       2 
2 
     | 
    
         
             
            class Webget
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
       3 
4 
     | 
    
         
             
              MAJOR = 0    ## todo: namespace inside version or something - why? why not??
         
     | 
| 
       4 
     | 
    
         
            -
              MINOR =  
     | 
| 
       5 
     | 
    
         
            -
              PATCH =  
     | 
| 
      
 5 
     | 
    
         
            +
              MINOR = 1
         
     | 
| 
      
 6 
     | 
    
         
            +
              PATCH = 0
         
     | 
| 
       6 
7 
     | 
    
         
             
              VERSION = [MAJOR,MINOR,PATCH].join('.')
         
     | 
| 
       7 
8 
     | 
    
         | 
| 
       8 
9 
     | 
    
         
             
              def self.version
         
     | 
| 
       9 
10 
     | 
    
         
             
                VERSION
         
     | 
| 
       10 
11 
     | 
    
         
             
              end
         
     | 
| 
       11 
12 
     | 
    
         | 
| 
      
 13 
     | 
    
         
            +
              # version string for generator meta tag (includes ruby version)
         
     | 
| 
       12 
14 
     | 
    
         
             
              def self.banner
         
     | 
| 
       13 
15 
     | 
    
         
             
                "webget/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
         
     | 
| 
       14 
16 
     | 
    
         
             
              end
         
     | 
| 
         @@ -16,5 +18,6 @@ class Webget 
     | 
|
| 
       16 
18 
     | 
    
         
             
              def self.root
         
     | 
| 
       17 
19 
     | 
    
         
             
                "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
         
     | 
| 
       18 
20 
     | 
    
         
             
              end
         
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            end  # class Webget
         
     | 
| 
       20 
23 
     | 
    
         | 
| 
         @@ -0,0 +1,167 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Webcache
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
              #####
         
     | 
| 
      
 6 
     | 
    
         
            +
              # copied from props gem, see Env.home
         
     | 
| 
      
 7 
     | 
    
         
            +
              #    - https://github.com/rubycoco/props/blob/master/props/lib/props/env.rb
         
     | 
| 
      
 8 
     | 
    
         
            +
              #   todo/fix: use original - and do NOT copy-n-paste!!! - why? why not?
         
     | 
| 
      
 9 
     | 
    
         
            +
              def self.home
         
     | 
| 
      
 10 
     | 
    
         
            +
                path = if( ENV['HOME'] || ENV['USERPROFILE'] )
         
     | 
| 
      
 11 
     | 
    
         
            +
                         ENV['HOME'] || ENV['USERPROFILE']
         
     | 
| 
      
 12 
     | 
    
         
            +
                       elsif( ENV['HOMEDRIVE'] && ENV['HOMEPATH'] )
         
     | 
| 
      
 13 
     | 
    
         
            +
                         "#{ENV['HOMEDRIVE']}#{ENV['HOMEPATH']}"
         
     | 
| 
      
 14 
     | 
    
         
            +
                       else
         
     | 
| 
      
 15 
     | 
    
         
            +
                         begin
         
     | 
| 
      
 16 
     | 
    
         
            +
                            File.expand_path('~')
         
     | 
| 
      
 17 
     | 
    
         
            +
                         rescue
         
     | 
| 
      
 18 
     | 
    
         
            +
                            if File::ALT_SEPARATOR
         
     | 
| 
      
 19 
     | 
    
         
            +
                               'C:/'
         
     | 
| 
      
 20 
     | 
    
         
            +
                            else
         
     | 
| 
      
 21 
     | 
    
         
            +
                               '/'
         
     | 
| 
      
 22 
     | 
    
         
            +
                            end
         
     | 
| 
      
 23 
     | 
    
         
            +
                         end
         
     | 
| 
      
 24 
     | 
    
         
            +
                       end
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                ## note: use File.expand_path to "unify" path e.g
         
     | 
| 
      
 27 
     | 
    
         
            +
                ##  C:\Users\roman  becomes
         
     | 
| 
      
 28 
     | 
    
         
            +
                ##  C:/Users/roman
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                File.expand_path( path )
         
     | 
| 
      
 31 
     | 
    
         
            +
             end
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
             class Configuration
         
     | 
| 
      
 35 
     | 
    
         
            +
                ## root directory - todo/check: find/use a better name - why? why not?
         
     | 
| 
      
 36 
     | 
    
         
            +
                def root()       @root || "#{Webcache.home}/.cache"; end
         
     | 
| 
      
 37 
     | 
    
         
            +
                def root=(value) @root = value; end
         
     | 
| 
      
 38 
     | 
    
         
            +
             end # class Configuration
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
             ## lets you use
         
     | 
| 
      
 42 
     | 
    
         
            +
             ##   Webcache.configure do |config|
         
     | 
| 
      
 43 
     | 
    
         
            +
             ##      config.root = './cache'
         
     | 
| 
      
 44 
     | 
    
         
            +
             ##   end
         
     | 
| 
      
 45 
     | 
    
         
            +
             def self.configure() yield( config ); end
         
     | 
| 
      
 46 
     | 
    
         
            +
             def self.config()    @config ||= Configuration.new;  end
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
             ## add "high level" root convenience helpers
         
     | 
| 
      
 50 
     | 
    
         
            +
             def self.root()       config.root; end
         
     | 
| 
      
 51 
     | 
    
         
            +
             def self.root=(value) config.root = value; end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
             ### "interface" for "generic" cache storage (might be sqlite database or filesystem)
         
     | 
| 
      
 55 
     | 
    
         
            +
             def self.cache() @cache ||= DiskCache.new; end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
             def self.record( url, response, format: 'html' )
         
     | 
| 
      
 58 
     | 
    
         
            +
               cache.record( url, response, format: format );
         
     | 
| 
      
 59 
     | 
    
         
            +
             end
         
     | 
| 
      
 60 
     | 
    
         
            +
             def self.cached?( url ) cache.cached?( url ); end
         
     | 
| 
      
 61 
     | 
    
         
            +
             class << self
         
     | 
| 
      
 62 
     | 
    
         
            +
               alias_method :exist?, :cached?
         
     | 
| 
      
 63 
     | 
    
         
            +
             end
         
     | 
| 
      
 64 
     | 
    
         
            +
             def self.url_to_id( url ) cache.url_to_id( url ); end  ## todo/check: rename to just id or something - why? why not?
         
     | 
| 
      
 65 
     | 
    
         
            +
             def self.read( url ) cache.read( url ); end
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
            class DiskCache
         
     | 
| 
      
 69 
     | 
    
         
            +
              def cached?( url )
         
     | 
| 
      
 70 
     | 
    
         
            +
                body_path = "#{Webcache.root}/#{url_to_path( url )}"
         
     | 
| 
      
 71 
     | 
    
         
            +
                File.exist?( body_path )
         
     | 
| 
      
 72 
     | 
    
         
            +
              end
         
     | 
| 
      
 73 
     | 
    
         
            +
              alias_method :exist?, :cached?
         
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
              def read( url )
         
     | 
| 
      
 77 
     | 
    
         
            +
                body_path = "#{Webcache.root}/#{url_to_path( url )}"
         
     | 
| 
      
 78 
     | 
    
         
            +
                File.open( body_path, 'r:utf-8' ) {|f| f.read }
         
     | 
| 
      
 79 
     | 
    
         
            +
              end
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
             
     | 
| 
      
 82 
     | 
    
         
            +
              ## add more save / put / etc. aliases - why? why not?
         
     | 
| 
      
 83 
     | 
    
         
            +
              ##  rename to record_html - why? why not?
         
     | 
| 
      
 84 
     | 
    
         
            +
              def record( url, response, format: 'html' )
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
                body_path = "#{Webcache.root}/#{url_to_path( url )}"
         
     | 
| 
      
 87 
     | 
    
         
            +
                meta_path = "#{body_path}.meta.txt"
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
      
 89 
     | 
    
         
            +
                ## make sure path exists
         
     | 
| 
      
 90 
     | 
    
         
            +
                FileUtils.mkdir_p( File.dirname( body_path ) )
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
             
     | 
| 
      
 93 
     | 
    
         
            +
                puts "[cache] saving #{body_path}..."
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
                ## todo/check: verify content-type - why? why not?
         
     | 
| 
      
 96 
     | 
    
         
            +
                if format == 'json'
         
     | 
| 
      
 97 
     | 
    
         
            +
                  File.open( body_path, 'w:utf-8' ) {|f| f.write( JSON.pretty_generate( response.json )) }
         
     | 
| 
      
 98 
     | 
    
         
            +
                else
         
     | 
| 
      
 99 
     | 
    
         
            +
                  ## note - for now always assume utf8!!!!!!!!!
         
     | 
| 
      
 100 
     | 
    
         
            +
                  File.open( body_path, 'w:utf-8' ) {|f| f.write( response.text ) }
         
     | 
| 
      
 101 
     | 
    
         
            +
                end
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
                File.open( meta_path, 'w:utf-8' ) do |f|
         
     | 
| 
      
 104 
     | 
    
         
            +
                  response.headers.each do |key, value|  # iterate all response headers
         
     | 
| 
      
 105 
     | 
    
         
            +
                    f.write( "#{key}: #{value}" )
         
     | 
| 
      
 106 
     | 
    
         
            +
                    f.write( "\n" )
         
     | 
| 
      
 107 
     | 
    
         
            +
                  end
         
     | 
| 
      
 108 
     | 
    
         
            +
                end
         
     | 
| 
      
 109 
     | 
    
         
            +
              end
         
     | 
| 
      
 110 
     | 
    
         
            +
             
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
              ### note: use file path as id for DiskCache  (is different for DbCache/SqlCache?)
         
     | 
| 
      
 114 
     | 
    
         
            +
              ##    use file:// instead of disk:// - why? why not?
         
     | 
| 
      
 115 
     | 
    
         
            +
              def url_to_id( str ) "disk://#{url_to_path( str )}"; end
         
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
             
     | 
| 
      
 118 
     | 
    
         
            +
              ### helpers
         
     | 
| 
      
 119 
     | 
    
         
            +
              def url_to_path( str )
         
     | 
| 
      
 120 
     | 
    
         
            +
                ## map url to file path
         
     | 
| 
      
 121 
     | 
    
         
            +
                uri = URI.parse( str )
         
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
                ## note: ignore scheme (e.g. http/https)
         
     | 
| 
      
 124 
     | 
    
         
            +
                ##         and  post  (e.g. 80, 8080, etc.) for now
         
     | 
| 
      
 125 
     | 
    
         
            +
                ##    always downcase for now (internet domain is case insensitive)
         
     | 
| 
      
 126 
     | 
    
         
            +
                host_dir = uri.host.downcase
         
     | 
| 
      
 127 
     | 
    
         
            +
             
     | 
| 
      
 128 
     | 
    
         
            +
                ## "/this/is/everything?query=params"
         
     | 
| 
      
 129 
     | 
    
         
            +
                ##   cut-off leading slash and
         
     | 
| 
      
 130 
     | 
    
         
            +
                ##    convert query ? =
         
     | 
| 
      
 131 
     | 
    
         
            +
                req_path = uri.request_uri[1..-1]
         
     | 
| 
      
 132 
     | 
    
         
            +
             
     | 
| 
      
 133 
     | 
    
         
            +
             
     | 
| 
      
 134 
     | 
    
         
            +
             
     | 
| 
      
 135 
     | 
    
         
            +
                ### special "prettify" rule for weltfussball
         
     | 
| 
      
 136 
     | 
    
         
            +
                ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
         
     | 
| 
      
 137 
     | 
    
         
            +
                if host_dir.index( 'weltfussball.de' ) ||
         
     | 
| 
      
 138 
     | 
    
         
            +
                   host_dir.index( 'worldfootball.net' )
         
     | 
| 
      
 139 
     | 
    
         
            +
                      if req_path.end_with?( '/' )
         
     | 
| 
      
 140 
     | 
    
         
            +
                         req_path = "#{req_path[0..-2]}.html"
         
     | 
| 
      
 141 
     | 
    
         
            +
                      else
         
     | 
| 
      
 142 
     | 
    
         
            +
                        puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
         
     | 
| 
      
 143 
     | 
    
         
            +
                        exit 1
         
     | 
| 
      
 144 
     | 
    
         
            +
                      end
         
     | 
| 
      
 145 
     | 
    
         
            +
                elsif host_dir.index( 'football-data.org' )
         
     | 
| 
      
 146 
     | 
    
         
            +
                  req_path = req_path.sub( 'v2/', '' )  # shorten - cut off v2/
         
     | 
| 
      
 147 
     | 
    
         
            +
             
     | 
| 
      
 148 
     | 
    
         
            +
                  ## flattern - make a file path - for auto-save
         
     | 
| 
      
 149 
     | 
    
         
            +
                  ##   change ? to -I-
         
     | 
| 
      
 150 
     | 
    
         
            +
                  ##   change / to ~~
         
     | 
| 
      
 151 
     | 
    
         
            +
                  ##   change = to ~
         
     | 
| 
      
 152 
     | 
    
         
            +
                  req_path = req_path.gsub( '?', '-I-' )
         
     | 
| 
      
 153 
     | 
    
         
            +
                                     .gsub( '/', '~~' )
         
     | 
| 
      
 154 
     | 
    
         
            +
                                     .gsub( '=', '~')
         
     | 
| 
      
 155 
     | 
    
         
            +
             
     | 
| 
      
 156 
     | 
    
         
            +
                  req_path = "#{req_path}.json"
         
     | 
| 
      
 157 
     | 
    
         
            +
                else
         
     | 
| 
      
 158 
     | 
    
         
            +
                  ## no special rule
         
     | 
| 
      
 159 
     | 
    
         
            +
                end
         
     | 
| 
      
 160 
     | 
    
         
            +
             
     | 
| 
      
 161 
     | 
    
         
            +
                page_path = "#{host_dir}/#{req_path}"
         
     | 
| 
      
 162 
     | 
    
         
            +
                page_path
         
     | 
| 
      
 163 
     | 
    
         
            +
              end
         
     | 
| 
      
 164 
     | 
    
         
            +
            end # class DiskCache
         
     | 
| 
      
 165 
     | 
    
         
            +
             
     | 
| 
      
 166 
     | 
    
         
            +
             
     | 
| 
      
 167 
     | 
    
         
            +
            end  # module Webcache
         
     | 
| 
         @@ -0,0 +1,85 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            class Webclient
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
              class Response   # nested class - wrap Net::HTTP::Response
         
     | 
| 
      
 5 
     | 
    
         
            +
                def initialize( response )
         
     | 
| 
      
 6 
     | 
    
         
            +
                  @response = response
         
     | 
| 
      
 7 
     | 
    
         
            +
                end
         
     | 
| 
      
 8 
     | 
    
         
            +
                def raw() @response; end
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                def text
         
     | 
| 
      
 12 
     | 
    
         
            +
                  # note: Net::HTTP will NOT set encoding UTF-8 etc.
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
         
     | 
| 
      
 14 
     | 
    
         
            +
                  # thus, set/force encoding to utf-8
         
     | 
| 
      
 15 
     | 
    
         
            +
                  text = @response.body.to_s
         
     | 
| 
      
 16 
     | 
    
         
            +
                  text = text.force_encoding( Encoding::UTF_8 )
         
     | 
| 
      
 17 
     | 
    
         
            +
                  text
         
     | 
| 
      
 18 
     | 
    
         
            +
                end
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
                ## convenience helper; returns parsed json data
         
     | 
| 
      
 21 
     | 
    
         
            +
                def json() JSON.parse( text ); end
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
                # Thin wrapper around the header fields of the wrapped response.
                # Mixes in Enumerable, so map / to_a / select etc. work on
                # the (key, value) header pairs.
                class Headers # nested (nested) class
                  include Enumerable

                  # response - the object to read headers from; it must
                  #            respond to each_header and yield (key, value).
                  def initialize( response )
                    @response = response
                  end

                  # Iterates all response headers, yielding key and value.
                  # Returns an Enumerator when called without a block
                  # (instead of failing with a NoMethodError on nil).
                  def each
                    return to_enum( :each ) unless block_given?

                    @response.each_header do |key, value|
                      yield key, value
                    end
                  end
                end
         
     | 
| 
      
 35 
     | 
    
         
            +
                # Returns the Headers wrapper for this response;
                # it is created on first use and reused afterwards.
                def headers
                  return @headers if @headers

                  @headers = Headers.new( @response )
                end
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
                # Wrapper around the status line of the wrapped response.
                class Status  # nested (nested) class
                  # response - object responding to code and message
                  def initialize( response )
                    @response = response
                  end

                  # Status code converted to an integer (via to_i).
                  def code
                    @response.code.to_i
                  end

                  # Status message as reported by the wrapped response.
                  def message
                    @response.message
                  end

                  # True only for status code 200.
                  def ok?
                    code == 200
                  end

                  # True for any status code other than 200.
                  def nok?
                    !ok?
                  end
                end
         
     | 
| 
      
 46 
     | 
    
         
            +
                # Returns the Status wrapper for this response;
                # it is created on first use and reused afterwards.
                def status
                  return @status if @status

                  @status = Status.new( @response )
                end
         
     | 
| 
      
 47 
     | 
    
         
            +
              end # (nested) class Response
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            # Performs a GET request for url and returns the result wrapped
            # in a ("unified") Response.
            #
            # url     - absolute http or https url (string)
            # headers - optional hash of (custom) request headers, e.g.
            #             { 'X-Auth-Token' => 'xxxxxxx',
            #               'User-Agent'   => 'ruby',
            #               'Accept'       => '*/*' }
            #           nil or an empty hash adds no headers.
            #
            # note: for https the server certificate is verified (VERIFY_PEER);
            #   verification must NOT be switched off (VERIFY_NONE) because
            #   that lets anyone on the network path impersonate the server.
            #   A certificate that fails verification raises an OpenSSL error.
            def self.get( url, headers: {} )
              uri  = URI.parse( url )
              http = Net::HTTP.new( uri.host, uri.port )

              if uri.is_a?( URI::HTTPS )
                http.use_ssl     = true
                http.verify_mode = OpenSSL::SSL::VERIFY_PEER
              end

              request = Net::HTTP::Get.new( uri.request_uri )

              ## add (custom) headers if any
              ##  check/todo: is there a more idiomatic way for Net::HTTP ???
              ##    e.g. Net::HTTP::Get.new( uri.request_uri, headers ) - why? why not?
              (headers || {}).each do |key, value|
                request[ key ] = value
              end

              ## note: return "unified" wrapped response
              Response.new( http.request( request ) )
            end  # method self.get
         
     | 
| 
      
 83 
     | 
    
         
            +
             
     | 
| 
      
 84 
     | 
    
         
            +
            end  # class Webclient
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,64 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            # A web (go get) crawler: fetches a url via Webclient after a
            # configurable pause and records successful (HTTP 200) responses
            # via Webcache.
            class Webget   # a web (go get) crawler

              class Configuration  ## nested class

                #######################
                ## accessors

                ## seconds to pause before each http request; defaults to 3
                ### todo/check: use delay / wait or such?
                attr_writer :sleep

                def sleep
                  @sleep || 3
                end

              end # (nested) class Configuration

              ## lets you use
              ##   Webget.configure do |config|
              ##      config.sleep = 10
              ##   end
              def self.configure
                yield( config )
              end

              ## the shared Configuration instance (created on first use)
              def self.config
                @config ||= Configuration.new
              end


              ## Fetches url and, on HTTP 200, records the response in the
              ## json format; returns the (wrapped) response in any case.
              def self.call( url, headers: {} )  ## assumes json format
                fetch( url, headers ) do |response|
                  ## note: use format json for pretty printing and parse check!!!!
                  Webcache.record( url, response,
                                   format: 'json' )
                end
              end  # method self.call


              ## Fetches url and, on HTTP 200, records the response in the
              ## default format; returns the (wrapped) response in any case.
              def self.page( url, headers: {} )  ## assumes html format
                fetch( url, headers ) do |response|
                  Webcache.record( url, response )   ## assumes format: html (default)
                end
              end  # method self.page


              ## Shared request flow for call and page:
              ##   pause, get, print the status line and then either
              ##   yield the response (HTTP 200) for recording or
              ##   dump the raw response (any other status).
              ## Always returns the (wrapped) response.
              def self.fetch( url, headers )
                puts "  sleep #{config.sleep} sec(s)..."
                sleep( config.sleep )   ## slow down - sleep 3secs (default) before each http request

                response = Webclient.get( url, headers: headers )

                if response.status.ok?  ## must be HTTP 200
                  puts "#{response.status.code} #{response.status.message}"
                  yield( response )
                else
                  ## todo/check - log error
                  puts "!! ERROR - #{response.status.code} #{response.status.message}:"
                  pp response.raw  ## note: dump inner (raw) response (NOT the wrapped)
                end

                ## to be done / continued
                response
              end
              private_class_method :fetch

            end  # class Webget
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: webget
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Gerald Bauer
         
     | 
| 
         @@ -44,7 +44,7 @@ dependencies: 
     | 
|
| 
       44 
44 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       45 
45 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       46 
46 
     | 
    
         
             
                    version: '3.22'
         
     | 
| 
       47 
     | 
    
         
            -
            description:  
     | 
| 
      
 47 
     | 
    
         
            +
            description: webget gem - yet (another) network client for world wide web (www) requests
         
     | 
| 
       48 
48 
     | 
    
         
             
            email: ruby-talk@ruby-lang.org
         
     | 
| 
       49 
49 
     | 
    
         
             
            executables: []
         
     | 
| 
       50 
50 
     | 
    
         
             
            extensions: []
         
     | 
| 
         @@ -59,6 +59,9 @@ files: 
     | 
|
| 
       59 
59 
     | 
    
         
             
            - Rakefile
         
     | 
| 
       60 
60 
     | 
    
         
             
            - lib/webget.rb
         
     | 
| 
       61 
61 
     | 
    
         
             
            - lib/webget/version.rb
         
     | 
| 
      
 62 
     | 
    
         
            +
            - lib/webget/webcache.rb
         
     | 
| 
      
 63 
     | 
    
         
            +
            - lib/webget/webclient.rb
         
     | 
| 
      
 64 
     | 
    
         
            +
            - lib/webget/webget.rb
         
     | 
| 
       62 
65 
     | 
    
         
             
            homepage: https://github.com/rubycoco/fetcher
         
     | 
| 
       63 
66 
     | 
    
         
             
            licenses:
         
     | 
| 
       64 
67 
     | 
    
         
             
            - Public Domain
         
     |