RubyGems - simple-data - Versions diffs - 0.1.0 - Mend

simple-data 0.1.0

Files changed (7) hide show

checksums.yaml +7 -0
data/README.md +31 -0
data/lib/simple-data/compression.rb +107 -0
data/lib/simple-data/version.rb +4 -0
data/lib/simple-data.rb +308 -0
data/simple-data.gemspec +29 -0
metadata +96 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: ac192701653932fa65dfad509a17ff7b1e97ff530cf0d7adc7fced1c8a97474f
+  data.tar.gz: 28deb77f46642a96c789ed0b0e83ac9ecc069416bf57c4ecf452ca714ce57940
+SHA512:
+  metadata.gz: 62e1ea2a685d816c61be2c4c9d0724a9bf7f9f34aa23341f94009c135704d453af9a7c00f8ff6c4b225f069e47682bc1557290540f622051fc7771bea6a30053
+  data.tar.gz: cb3f6b31f8dfeafe924eec3fb89f6672501363accfae6d8150471c111f53654a9c9e97b54f349ed15d43254634a7ce0f933fae9c8aec07434d931e89a183d404

data/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+This is the Ruby implementation of the [simple-data spec][sda].
+# Example
+~~~ruby
+require 'simple-data'
+SimpleData.generate('spacecraft.sda',
+                    [ [ :cstr, 'name',   'spacecraft'     ],
+                      [ :cstr, 'origin', 'series or movie' ],
+                      [ :f64,  'x'                        ],
+                      [ :f64,  'y'                        ],
+                      [ :f64,  'z'                        ] ],
+                    tags: { author:  "Stephane D'Alu",
+                            url:     "http://www.sdalu.com/",
+                            license: "MIT" }) do |sda|
+    sda.put("Enterprise", "Star Trek",    1.0, 2.0, 3.0);
+    sda.put("Rocinante",  "The Expanse",  1e6, 2e6, 3e6);
+    sda.put("Serenity",   "Firefly",      0.0, 0.0, 0.0);
+    sda.put("Bebop",      "Cowboy Bebop", 4.0, 6.8, 8.0);
+end
+SimpleData.open('spacecraft.sda') do |sda|
+	while row = sda.get do
+	    puts row.inspect
+    end
+end
+~~~
+[sda]: https://gitlab.inria.fr/dalu/simple-data/

data/lib/simple-data/compression.rb ADDED Viewed

@@ -0,0 +1,107 @@
+# Loading supported compressor
+#
+# Compression support is optional: when the zstd-ruby gem cannot be
+# loaded, the top-level `return` below stops evaluating this file, so
+# the IOCompressedWrite / IOCompressedRead classes are never defined.
+# SimpleData.generate and SimpleData.open probe for them with
+# `const_defined?` before using compression.
+begin
+    require 'zstd-ruby'
+    require 'stringio'
+rescue LoadError
+    return
+end
+class SimpleData
+    # Magic numbers
+    # Maps the leading bytes of a compressed stream to the compressor
+    # that produced it. The only entry is the 4-byte sequence
+    # 28 B5 2F FD, which is the Zstandard frame magic number.
+    MAGIC         = { "(\xB5/\xFD".force_encoding('BINARY') => :zstd
+                    }
+    # Compressed I/O wrapper
+    class IOCompressedWrite
+        def initialize(io, write: 16 * 1024)
+            @io         = io
+            @write_size = write
+            @written    = 0
+            @zstd       = Zstd::StreamingCompress.new
+        end
+        def close
+            @io.write(@zstd.finish)
+            @io.close
+        end
+        def write(data)
+            @written += data.size
+            if (@written.size > @write_size)
+                @io.write(@zstd.flush)
+            else
+                @zstd << data
+            end
+        end
+    end
+    class IOCompressedRead
+        def initialize(io, read: 16 * 1024)
+            @io         = io
+            @sio        = StringIO.new
+            @read_size  = read
+            # Peek at data for magic number
+            pos = io.tell
+            max = MAGIC.keys.map(&:size).max
+            str = io.read(max)
+            io.seek(pos)
+            # Magic number lookup
+            type = MAGIC.find {|k, v| str.start_with?(k) }&.last
+            # Sanity check
+            rasie "data is not Zstd compressed" if type != :zstd
+            # Decoder
+            zstd = Zstd::StreamingDecompress.new
+            @decoder = ->(str) { zstd.decompress(str) }
+        end
+        def eof?
+            @io.eof? && @sio.eof?
+        end
+        def close
+            @io.close
+        end
+        def read(size)
+            data = @sio.read(size)
+            # End of buffer
+            if data.nil?
+                if cstr = @io.read(@read_size)
+                    # Refill buffer
+                    @sio.string = @decoder.call(cstr)
+                    read(size)
+                else
+                    # End of stream
+                    nil
+                end
+            # Partial buffer
+            elsif data.size < size
+                # Force a new read (will trigger a refill)
+                odata = read(size - data.size)
+                odata.nil? ? data : (data + odata)
+            # Full data
+            else
+                data
+            end
+        end
+        def each_byte(&block)
+            return to_enum(:each_byte) if block.nil?
+            loop do
+                @sio.each_byte(&block)
+                break unless cstr = @io.read(@read_size)
+                @sio.string = @decoder.call(cstr)
+            end
+        end
+    end
+end

data/lib/simple-data/version.rb ADDED Viewed

@@ -0,0 +1,4 @@
+class SimpleData
+    VERSION = '0.1.0'
+end

data/lib/simple-data.rb ADDED Viewed

@@ -0,0 +1,308 @@
+# Reader / writer for simple-data files: a textual header made of
+# `#`-prefixed lines (magic line, tags, field specification, custom
+# sections) followed by a data section holding packed binary rows.
+class SimpleData
+    # Current file version
+    # (default value of the +version+ attribute)
+    FILE_VERSION  = 1
+    # Various regex
+    # REGEX_MAGIC   : first line of a file,   "# simple-data:<version>"
+    # REGEX_SECTION : section marker,         "# --<name>--" or
+    #                                         "# --<name:extra>--"
+    # REGEX_FIELD   : field definition,       "# <type> : <name> (<desc>)"
+    #                 (description is optional, type is case-insensitive)
+    # REGEX_TAG     : tag line once the leading "#" has been stripped,
+    #                                         "@<tag> <value>"
+    # REGEX_EMPTY   : header line holding only "#"
+    REGEX_MAGIC   = /\A# simple-data:(?<version>\d+)\s*\z/
+    REGEX_SECTION = /\A# --<(?<section>[^>:]+)(?::(?<extra>[^>]+))?>--+\s*\z/
+    REGEX_FIELD   = /\A\#\s*    (?<type>\w+     )       \s*:\s*
+                                            (?<name>[\w\-.]+)
+                                   \s* (?:\((?<desc>.*      )\))?   \s*\z
+                                /ix
+    REGEX_TAG     = /\A@(?<tag>\w+)\s+(?<value>.*?)\s*\z/
+    REGEX_EMPTY   = /\A#\s*\z/
+    # Supported tags / sections / types
+    # (:blob is listed as a type but put/get raise 'not implemented' for it)
+    TAGS          = %i(title summary author license url doi keywords)
+    SECTIONS      = %i(spec description data)
+    TYPES         = %i(i8 i16 i32 i64 u8 u16 u32 u64 f32 f64
+                       cstr blob char bool)
+    # Error classes
+    # Error is the base class; ParserError is raised while decoding a file
+    class Error < StandardError
+    end
+    class ParserError < Error
+    end
+    # Attributes
+    # version  : FILE_VERSION by default, or the digits captured from the
+    #            magic line (a String) when the file was parsed
+    # fields   : list of [ type, name ] or [ type, name, description ]
+    # tags     : tags given to generate or parsed from the header
+    # sections : custom sections given to generate or parsed from the header
+    attr_reader :version
+    attr_reader :fields
+    attr_reader :tags
+    attr_reader :sections
+    def initialize(io, fields, mode, version: FILE_VERSION,
+                   tags: {}, sections: {})
+        @io         = io
+        @mode       = mode
+        @fields     = fields
+        @fields_key = fields.map {|(_, name)| name }
+        @tags       = tags
+        @sections   = sections
+        @version    = version
+        @read_ok, @write_ok =
+                  case mode
+                  when :create, :append then [ false, true  ]
+                  when :read            then [ true,  false ]
+                  else raise Error,
+                             'mode must be one of (:create, :append, or :read)'
+                  end
+    end
+    def put(*data)
+        # Checking mode
+        raise Error, "write is not allowed in #{@mode} mode" unless @write_ok
+        if data.one? && (data.first.is_a?(Array) || data.first.is_a?(Hash))
+            data = data.first
+        end
+        if data.size != @fields.size
+            raise Error, 'dataset size doesn\'t match definition'
+        end
+        if data.is_a?(Hash)
+            if ! (data.keys - @fields_key).empty?
+                raise Error, 'dataset key mismatch'
+            end
+            data = @fields.map {|k| data[k] }
+        end
+        s = @fields.each.with_index.map {|(type,name), i|
+            d = data.fetch(i) { raise "missing data (#{name})" }
+            case type
+            when :i8   then [ d ].pack('c' )
+            when :i16  then [ d ].pack('s<')
+            when :i32  then [ d ].pack('l<')
+            when :i64  then [ d ].pack('q<')
+            when :u8   then [ d ].pack('C' )
+            when :u16  then [ d ].pack('S<')
+            when :u32  then [ d ].pack('L<')
+            when :u64  then [ d ].pack('q<')
+            when :f32  then [ d ].pack('e' )
+            when :f64  then [ d ].pack('E' )
+            when :cstr then [ d ].pack('Z*')
+            when :blob then raise ParserError, 'not implemented'
+            when :char then [ d ].pack('c' )
+            when :bool then [ d ? 'T' : 'F' ].pack('c')
+            end
+        }.join
+        @io.write(s)
+    end
+    # Read the next row from the data section.
+    #
+    # @return [Array, nil] one value per field, in definition order,
+    #                      or nil at end of file
+    # @raise [Error]       if reading is not allowed in the current mode
+    # @raise [ParserError] if the data ends in the middle of a fixed-size
+    #                      value, or for :blob fields (not implemented)
+    def get
+        # Checking mode
+        raise Error, "read is not allowed in #{@mode} mode" unless @read_ok
+        # No-op if end of file
+        return if @io.eof?
+        # Read exactly n bytes: a short or nil read means the file was
+        # truncated, which must not surface as nil values
+        fetch = ->(n) {
+            bytes = @io.read(n)
+            if bytes.nil? || bytes.bytesize < n
+                raise ParserError, 'truncated data'
+            end
+            bytes
+        }
+        # Retrieve data
+        @fields.map {|(type)|
+            case type
+            when :i8   then fetch.call(1).unpack1('c' )
+            when :i16  then fetch.call(2).unpack1('s<')
+            when :i32  then fetch.call(4).unpack1('l<')
+            when :i64  then fetch.call(8).unpack1('q<')
+            when :u8   then fetch.call(1).unpack1('C' )
+            when :u16  then fetch.call(2).unpack1('S<')
+            when :u32  then fetch.call(4).unpack1('L<')
+            # Unsigned 64-bit directive (was the signed one, which
+            # returned negative numbers for values >= 2**63)
+            when :u64  then fetch.call(8).unpack1('Q<')
+            when :f32  then fetch.call(4).unpack1('e' )
+            when :f64  then fetch.call(8).unpack1('E' )
+            # Bytes up to (and consuming) the terminating NUL
+            when :cstr then @io.each_byte.lazy.take_while {|b| !b.zero? }
+                                              .map {|b| b.chr }.to_a.join
+            when :blob then raise ParserError, 'not implemented'
+            when :char then fetch.call(1)
+            when :bool then fetch.call(1) == 'T'
+            end
+        }
+    end
+    # Close the underlying IO (for a compressed writer this is the
+    # wrapper's #close, which also writes the end of the stream).
+    def close
+        @io.close
+    end
+    # Generating file
+    def self.generate(file, fields, compress = false,
+                      tags: nil, sections: nil, &block)
+        # Sanity check
+        if compress && !const_defined?(:IOCompressedWrite)
+            raise Error, 'compression not supported (add zstd-ruby gem)'
+        end
+        # Open file
+        io = File.open(file, 'w')
+        # Magic string
+        io.puts "# simple-data:1"
+        # Tags
+        if tags && !tags.empty?
+            io.puts "#"
+            tags.each do |name, value|
+                io.puts "# @%-8s %s" % [ name, value ]
+            end
+        end
+        # Spec
+        io.puts "#"
+        io.puts "# --<spec>--"
+        maxlen = fields.map {|(_, name)| name.size }.max
+        fields.each do |(type, name, desc)|
+            if desc
+                io.puts "# %-4s : %-*s (%s)" % [ type, maxlen, name, desc ]
+            else
+                io.puts "# %-4s : %s" % [ type, name ]
+            end
+        end
+        # Custom sections
+        if sections && !sections.empty?
+            io.puts "#"
+            sections.each do |name, value|
+                io.puts "# --<#{name}>--"
+                value.split(/\r?\n/).each do |line|
+                    io.puts "# #{line}"
+                end
+            end
+        end
+        # Data
+        io.puts "#"
+        io.puts "# --<%s>--" % [ compress ? 'data:compressed' : 'data' ]
+        # Deal with compression
+        io = IOCompressedWrite.new(io) if compress
+        # Instantiate SimpleData
+        sda = self.new(io, fields, :create, tags: tags, sections: sections)
+        block ? block.call(sda) : sda
+    ensure
+        sda.close if sda && block
+    end
+    # Open file for reading
+    # Open an existing simple-data file for reading or appending.
+    #
+    # @param file [String] path of the file
+    # @param mode [Symbol] :read or :append
+    # @yield [sda] the reader/writer; it is closed when the block returns
+    # @return the block's value, or the reader/writer itself (left open
+    #         for the caller to close) when no block is given
+    # @raise [ArgumentError] if +mode+ is not supported
+    # @raise [ParserError]   if the header cannot be parsed
+    # @raise [Error]         if the data is compressed but compression
+    #                        is not available
+    def self.open(file, mode = :read, &block)
+        # Open file
+        io = case mode
+             when :read   then File.open(file, 'r:BINARY')
+             when :append then File.open(file, 'r+:BINARY')
+             else raise ArgumentError,
+                        "mode must be one of :read, :append"
+             end
+        # Read textual information
+        # (the header is at the start of the file, so it must be parsed
+        #  before moving to the end of the file in append mode)
+        version                         = self.get_magic(io)
+        fields, tags, sections, dataopt = self.get_metadata(io)
+        # A data section without options may be reported as nil
+        compressed = Array(dataopt).include?(:compressed)
+        # Reading needs the decoder, appending needs the encoder
+        wrapper    = mode == :append ? :IOCompressedWrite : :IOCompressedRead
+        if compressed && !const_defined?(wrapper)
+            raise Error, 'compression not supported (add zstd-ruby gem)'
+        end
+        # New rows go after the existing ones
+        io.seek(0, :END) if mode == :append
+        # Deal with compression
+        # NOTE(review): in append mode this starts a new Zstandard frame
+        # after the existing one; the format allows concatenated frames,
+        # but confirm the reader decodes them in sequence.
+        io = const_get(wrapper).new(io) if compressed
+        # Instantiate SimpleData
+        sda               = self.new(io, fields, mode, version: version,
+                                     tags: tags, sections: sections)
+        block ? block.call(sda) : sda
+    ensure
+        if sda
+            sda.close if block
+        elsif io
+            # Failed before the reader existed: don't leak the file handle
+            io.close
+        end
+    end
+    private
+    def self.get_magic(io)
+        unless m = REGEX_MAGIC.match(io.readline.chomp)
+            raise ParserError, 'not a simple-data file'
+        end
+        m[:version]
+    end
+    def self.get_metadata(io)
+        tags     = []
+        fields   = []
+        sections = {}
+        dataopts  = nil
+        # Retrieve meta data
+        meta = io.each_line.lazy.map {|l| l.chomp }.take_while {|l|
+            ! ((m = REGEX_SECTION.match(l)) &&
+               (m[:section] == 'data'     )).tap {|is_data|
+                dataopts = m[:extra]&.split(',')&.map(&:to_sym) if is_data
+            }
+        }.drop_while {|l| l =~ REGEX_EMPTY }
+        # Parse
+        meta.slice_before {|l| REGEX_SECTION =~ l }.each do |grp|
+            if m = REGEX_SECTION.match(grp.first)
+                grp.shift
+                case s = m[:section].to_sym
+                # Extract spec
+                when :spec
+                    fields = grp.reject {|l| l =~ REGEX_EMPTY }.map {|l|
+                        field = REGEX_FIELD.match(l)&.captures
+                        raise ParserError, "wrong spec" if field.nil?
+                        # Normalize
+                        t, n, d = field
+                        t = t.downcase.to_sym
+                        n .force_encoding('UTF-8')
+                        d&.force_encoding('UTF-8')
+                        # Sanity check
+                        if !TYPES.include?(t)
+                            raise ParserError, "unknown type (#{t})"
+                        end
+                        # Cleaned-up field description
+                        [ t, n, d ].compact
+                    }
+                # Extract description
+                else
+                    sections[s] = grp.join("\n")
+                end
+            else
+                # Extract tags
+                tags = grp.map          {|l| l.sub(/\A\#\s*/, '') }
+                          .slice_before {|l| l =~  /\A@\w+/       }
+                          .map          {|g| g.join(' ')          }
+                          .map          {|tagline|
+                    REGEX_TAG.match(tagline).captures
+                }.reduce({}) { |obj, (t,v)|
+                    obj.merge(t.to_sym => v) {|k, o, n| Array(o) + [ n ] }
+                }
+                # Normalize tags
+                tags[:keywords] = Array(tags[:keywords]).flat_map {|e|
+                    e.split(/\s*,\s*|\s+/).map(&:strip).uniq
+                }
+                # Tags sanityzing
+                tags.each do |k, v|
+                    if !TAGS.include?(k)
+                        raise ParserError, "unknown tag (#{k})"
+                    end
+                end
+                tags.reject! {|k, v| v.nil? || v.empty? }
+            end
+        end
+        [ fields, tags, sections, dataopts ]
+    end
+end
+require_relative 'simple-data/version'
+require_relative 'simple-data/compression'

data/simple-data.gemspec ADDED Viewed

@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+# Gem specification for simple-data.
+# The version number is read from lib/simple-data/version.rb.
+require_relative 'lib/simple-data/version'
+Gem::Specification.new do |s|
+    s.name        = 'simple-data'
+    s.version     = SimpleData::VERSION
+    s.summary     = "Simple Data (CSV alternative)"
+    s.description =  <<~EOF
+      An alternative to CSV format for storing data.
+      Provides metadata and field description support, and has a
+      reasonable hunger for disk space.
+      EOF
+    s.homepage    = 'https://gitlab.inria.fr/dalu/simple-data'
+    s.license     = 'MIT'
+    s.authors     = [ "Stéphane D'Alu" ]
+    s.email       = [ 'stephane.dalu@insa-lyon.fr' ]
+    # Packaged files: README, this gemspec and every Ruby file under lib/
+    s.files       = %w[ README.md simple-data.gemspec ] +
+                    Dir['lib/**/*.rb']
+    # NOTE(review): zstd-ruby is declared as a runtime dependency, while
+    # lib/simple-data/compression.rb tolerates its absence (LoadError is
+    # rescued) -- confirm whether it is meant to be optional.
+    s.add_dependency 'zstd-ruby'
+    s.add_development_dependency 'yard', '~>0'
+    s.add_development_dependency 'rake', '~>13'
+end

metadata ADDED Viewed

@@ -0,0 +1,96 @@
+--- !ruby/object:Gem::Specification
+name: simple-data
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Stéphane D'Alu
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2022-10-07 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: zstd-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13'
+description: |2+
+  An alternative to CSV format for storing data.
+  Provides metadata and field description support, and has a
+  reasonable hunger for disk space.
+email:
+- stephane.dalu@insa-lyon.fr
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- README.md
+- lib/simple-data.rb
+- lib/simple-data/compression.rb
+- lib/simple-data/version.rb
+- simple-data.gemspec
+homepage: https://gitlab.inria.fr/dalu/simple-data
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.3.21
+signing_key:
+specification_version: 4
+summary: Simple Data (CSV alternative)
+test_files: []
+...