datapackage 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/bin/console +0 -0
- data/bin/datapackage +0 -0
- data/lib/datapackage.rb +1 -0
- data/lib/datapackage/interpreter.rb +66 -0
- data/lib/datapackage/package.rb +18 -0
- data/lib/datapackage/resource.rb +28 -0
- data/lib/datapackage/version.rb +1 -1
- metadata +4 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 | 
            -
             | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: c75abeabc9ffe3f12589976c2790dec6c2e4a748b97a99d196408fd02689d432
         | 
| 4 | 
            +
              data.tar.gz: 94deaf207fd46c81e154b0617cbfcd2074f10dce2243eff888bb414618f756b8
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 2a54d0a226c49d9a38db69f0a48f5e56220a8636ff5c27601a56d5d4325f3a0efbcb5999347631c4e0238bc4429dcf532391160decc3a4179703c05afb6c343b
         | 
| 7 | 
            +
              data.tar.gz: 6390e3c8018ef3c6b8bf2c654b32e51dee41bca12762c6bc45ee5002547aa4080786c4235a331e0794458b855b884bd991522eb0621cd6fe6f1fc6e8d55dd5f5
         | 
    
        data/bin/console
    CHANGED
    
    | 
            File without changes
         | 
    
        data/bin/datapackage
    CHANGED
    
    | 
            File without changes
         | 
    
        data/lib/datapackage.rb
    CHANGED
    
    
        data/lib/datapackage/interpreter.rb
    ADDED
    
| @@ -0,0 +1,66 @@ | |
| 1 | 
            +
            module DataPackage
         | 
| 2 | 
            +
              class Interpreter
         | 
| 3 | 
            +
                INFER_THRESHOLD = 10
         | 
| 4 | 
            +
                INFER_CONFIDENCE = 0.75
         | 
| 5 | 
            +
                YEAR_PATTERN = /[12]\d{3}/
         | 
| 6 | 
            +
                DATE_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
         | 
| 7 | 
            +
                DATETIME_PATTERN = /(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
         | 
| 8 | 
            +
                TIME_PATTERN = /^\d{1,2}((:\d{1,2})|(am|pm|AM|PM))$/
         | 
| 9 | 
            +
                INTEGER_PATTERN = /^\d+$/
         | 
| 10 | 
            +
                DEFAULT_TYPE_FORMAT = {'type' => 'any', 'format' => 'default'}
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                attr_reader :csv, :threshold
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                def initialize(csv)
         | 
| 15 | 
            +
                  @csv = csv
         | 
| 16 | 
            +
                  @threshold = [csv.length, INFER_THRESHOLD].min
         | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                def type_and_format_at(header)
         | 
| 20 | 
            +
                  values = csv.values_at(header).flatten
         | 
| 21 | 
            +
                  counter = {}
         | 
| 22 | 
            +
                  type_and_format = DEFAULT_TYPE_FORMAT
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                  values.each_with_index do |value, i|
         | 
| 25 | 
            +
                    inspection_count = i + 1
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    inspection = inspect_value(value)
         | 
| 28 | 
            +
                    counter[inspection] = (counter[inspection] || 0) + 1
         | 
| 29 | 
            +
                    if inspection_count >= threshold
         | 
| 30 | 
            +
                      if counter[inspection] / inspection_count >= INFER_CONFIDENCE
         | 
| 31 | 
            +
                        type_and_format = inspection
         | 
| 32 | 
            +
                        break
         | 
| 33 | 
            +
                      end
         | 
| 34 | 
            +
                    end
         | 
| 35 | 
            +
                  end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                  type_and_format
         | 
| 38 | 
            +
                end
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                def inspect_value(value)
         | 
| 41 | 
            +
                  return DEFAULT_TYPE_FORMAT unless value.is_a?(String)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                  if value.length == 4 && value.match(YEAR_PATTERN)
         | 
| 44 | 
            +
                    return { 'type' => 'year', 'format' => 'default' }
         | 
| 45 | 
            +
                  end
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                  if value.match(DATETIME_PATTERN)
         | 
| 48 | 
            +
                    return { 'type' => 'datetime', 'format' => 'default' }
         | 
| 49 | 
            +
                  end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                  if value.match(DATE_PATTERN)
         | 
| 52 | 
            +
                    return { 'type' => 'date', 'format' => 'default' }
         | 
| 53 | 
            +
                  end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                  if value.match(TIME_PATTERN)
         | 
| 56 | 
            +
                    return { 'type' => 'time', 'format' => 'default' }
         | 
| 57 | 
            +
                  end
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  if value.match(INTEGER_PATTERN)
         | 
| 60 | 
            +
                    return { 'type' => 'integer', 'format' => 'default' }
         | 
| 61 | 
            +
                  end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  DEFAULT_TYPE_FORMAT
         | 
| 64 | 
            +
                end
         | 
| 65 | 
            +
              end
         | 
| 66 | 
            +
            end
         | 
    
        data/lib/datapackage/package.rb
    CHANGED
    
    | @@ -119,6 +119,24 @@ module DataPackage | |
| 119 119 | 
             
                  self[property] || default
         | 
| 120 120 | 
             
                end
         | 
| 121 121 |  | 
| 122 | 
            +
                def infer(base_path: nil, directory: nil)
         | 
| 123 | 
            +
                  raise PackageException.new('Base path is required for infer') unless base_path
         | 
| 124 | 
            +
                  raise PackageException.new('Directory is required for infer') unless directory
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                  dir_path = File.join(base_path, directory)
         | 
| 127 | 
            +
                  Dir.glob("#{dir_path}/*.csv") do |filename|
         | 
| 128 | 
            +
                    resource = Resource.infer(filename)
         | 
| 129 | 
            +
                    add_resource(resource)
         | 
| 130 | 
            +
                  end
         | 
| 131 | 
            +
             | 
| 132 | 
            +
                  # If there were CSVs, this is a tabular data package
         | 
| 133 | 
            +
                  if resources.count > 0
         | 
| 134 | 
            +
                    self['profile'] = 'tabular-data-package'
         | 
| 135 | 
            +
                  end
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                  descriptor
         | 
| 138 | 
            +
                end
         | 
| 139 | 
            +
             | 
| 122 140 | 
             
                # Private
         | 
| 123 141 |  | 
| 124 142 | 
             
                private
         | 
    
        data/lib/datapackage/resource.rb
    CHANGED
    
    | @@ -6,6 +6,33 @@ module DataPackage | |
| 6 6 |  | 
| 7 7 | 
             
                attr_reader :errors, :profile, :name, :source
         | 
| 8 8 |  | 
| 9 | 
            +
                def self.infer(filepath)
         | 
| 10 | 
            +
                  name = File.basename(filepath)
         | 
| 11 | 
            +
                  if name[-4..-1] != '.csv'
         | 
| 12 | 
            +
                    raise ResourceException.new('Inferrable resource must have .csv extension')
         | 
| 13 | 
            +
                  end
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  descr = {
         | 
| 16 | 
            +
                    'format' => 'csv',
         | 
| 17 | 
            +
                    'mediatype' => 'text/csv',
         | 
| 18 | 
            +
                    'name' => name[0...-4],
         | 
| 19 | 
            +
                    'path' => filepath,
         | 
| 20 | 
            +
                    'schema' => {
         | 
| 21 | 
            +
                      'fields' => []
         | 
| 22 | 
            +
                    },
         | 
| 23 | 
            +
                  }
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  csv = CSV.read(filepath, headers: true)
         | 
| 26 | 
            +
                  interpreter = DataPackage::Interpreter.new(csv)
         | 
| 27 | 
            +
                  csv.headers.each do |header|
         | 
| 28 | 
            +
                    field = { 'name' => header, 'type' => 'string'}
         | 
| 29 | 
            +
                    field.merge! interpreter.type_and_format_at(header)
         | 
| 30 | 
            +
                    descr['schema']['fields'] << field
         | 
| 31 | 
            +
                  end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                  new(descr)
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
             | 
| 9 36 | 
             
                def initialize(resource, base_path = '')
         | 
| 10 37 | 
             
                  self.merge! dereference_descriptor(resource, base_path: base_path,
         | 
| 11 38 | 
             
                    reference_fields: ['schema', 'dialect'])
         | 
| @@ -132,6 +159,7 @@ module DataPackage | |
| 132 159 | 
             
                end
         | 
| 133 160 |  | 
| 134 161 | 
             
                def apply_table_defaults!
         | 
| 162 | 
            +
                  self['profile'] = DataPackage::DEFAULTS[:resource][:tabular_profile]
         | 
| 135 163 | 
             
                  if self.fetch('schema', nil)
         | 
| 136 164 | 
             
                    self['schema']['missingValues'] = DataPackage::DEFAULTS[:schema][:missing_values]
         | 
| 137 165 | 
             
                    self['schema'].fetch('fields', []).each do |field_descriptor|
         | 
    
        data/lib/datapackage/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: datapackage
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.0.0
         | 
| 4 | 
            +
              version: 1.1.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Leigh Dodds
         | 
| @@ -10,7 +10,7 @@ authors: | |
| 10 10 | 
             
            autorequire: 
         | 
| 11 11 | 
             
            bindir: bin
         | 
| 12 12 | 
             
            cert_chain: []
         | 
| 13 | 
            -
            date:  | 
| 13 | 
            +
            date: 2019-11-21 00:00:00.000000000 Z
         | 
| 14 14 | 
             
            dependencies:
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 16 16 | 
             
              name: json-schema
         | 
| @@ -183,6 +183,7 @@ files: | |
| 183 183 | 
             
            - lib/datapackage/defaults.rb
         | 
| 184 184 | 
             
            - lib/datapackage/exceptions.rb
         | 
| 185 185 | 
             
            - lib/datapackage/helpers.rb
         | 
| 186 | 
            +
            - lib/datapackage/interpreter.rb
         | 
| 186 187 | 
             
            - lib/datapackage/package.rb
         | 
| 187 188 | 
             
            - lib/datapackage/profile.rb
         | 
| 188 189 | 
             
            - lib/datapackage/registry.rb
         | 
| @@ -215,7 +216,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 215 216 | 
             
                  version: '0'
         | 
| 216 217 | 
             
            requirements: []
         | 
| 217 218 | 
             
            rubyforge_project: 
         | 
| 218 | 
            -
            rubygems_version: 2. | 
| 219 | 
            +
            rubygems_version: 2.7.7
         | 
| 219 220 | 
             
            signing_key: 
         | 
| 220 221 | 
             
            specification_version: 4
         | 
| 221 222 | 
             
            summary: Library for working with data packages
         |