RubyGems - hlsv - Versions diffs - 1.0.0 - Mend

hlsv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/lib/hlsv/xpt/reader.rb ADDED Viewed

@@ -0,0 +1,367 @@
+class SAS
+module XPT
+class Reader
+  # The first header record consists of the following character string, in ASCII
+  TOP_HEADER = "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000  ".b
+  # The first real header record uses the following layout
+  # vvvvvvvv = version, oooooooo = OS, date = create date in datetime16.
+  # FIRST_HEADER = "SAS     SAS     SASLIB  vvvvvvvvoooooooo                        ddMMMyy:hh:mm:ss"
+  # Second real header record
+  # Pad with ASCII blanks to 80 bytes
+  # date = modify date in datetime16.
+  # "ddMMMyy:hh:mm:ss"
+  # Member header records
+  # Notice the doc gives on page 4:
+  # "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!00000000000000000160000 0000140"
+  # But look at the dump on page 9, it is:
+  # "HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000140"
+  MEMBER_HEADER1     = "HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000140  ".b
+  MEMBER_HEADER1_VMS = "HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000136  ".b
+  # > Note the 0140 that appears in the member header record above. This value
+  # > specifies the size of the variable descriptor (NAMESTR) record that is described
+  # > later in this document. On the VAX/VMS operating system, the value will be 0136
+  # > instead of 0140. This means that the descriptor will be only 136 bytes instead
+  # > of 140.
+  MEMBER_HEADER2 = "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000  ".b
+  # "HEADER RECORD*******NAMESTR HEADER RECORD!!!!!!!000000xxxx00000000000000000000  "
+  NAMESTR_RECORD_START = "HEADER RECORD*******NAMESTR HEADER RECORD!!!!!!!000000".b
+  NAMESTR_RECORD_VARS_RE = /^\d{4}$/
+  NAMESTR_RECORD_END = "00000000000000000000  ".b
+  OBSERVATION_HEADER = "HEADER RECORD*******OBS     HEADER RECORD!!!!!!!000000000000000000000000000000  ".b
+  SPECIAL_MISSING_VALUES = '_ABCDEFGHIJKLMNOPQRSTUVWXYZ'.b
+  # input file path
+  attr_reader :xpt_path
+  # input encoding of string values
+  attr_reader :input_encoding
+  # output encoding for string values (default = input_encoding)
+  attr_reader :output_encoding
+  # output Library instance
+  attr_reader :library
+  def initialize(xpt_path, input_encoding: 'binary', output_encoding: nil)
+    @xpt_path = xpt_path
+    @input_encoding = input_encoding
+    @output_encoding = output_encoding || input_encoding
+    read_file
+  end
+  def read_file
+    File.open(xpt_path, 'rb') do |io|
+      # All transport data set records are 80 bytes in length.
+      # If there is not sufficient data to reach 80 bytes,
+      # then a record is padded with ASCII blanks to 80 bytes.
+      # top header
+      top_header = io.read(TOP_HEADER.length)
+      top_header == TOP_HEADER or issue "invalid top header"
+      # first_header
+      _sas_sas_saslib = io.read(24)   # "SAS     SAS     SASLIB  "
+      file_sas_version = io.read(8)   # "vvvvvvvv"
+      file_sas_os = io.read(8)        # "oooooooo"
+      _blanks = io.read(24)
+      file_create_date = io.read(16)  # "ddMMMyy:hh:mm:ss"
+      # second header
+      file_modify_date = io.read(16)  # "ddMMMyy:hh:mm:ss"
+      _blanks = io.read(64)
+      @library = Library.new(xpt_path, file_create_date, file_modify_date, file_sas_version, file_sas_os)
+      until io.eof?
+        # member header record 1
+        member_header1 = io.read(80)
+        case member_header1
+        when MEMBER_HEADER1
+          namestr_record_length = 140
+        when MEMBER_HEADER1_VMS
+          namestr_record_length = 136
+        else
+          issue "invalid header record 1"
+          namestr_record_length = 140
+        end
+        # member header record 2, constant
+        member_header2 = io.read(80)
+        member_header2 == MEMBER_HEADER2 or issue "invalid member header record 2"
+        # member header data
+        # "SAS     dsname  SASDATA version>os>>>>>> (24 blanks) ddMMMyy:hh:mm:ss"
+        _sas            = io.read(8)    # "SAS     "
+        ds_name         = io.read(8)    # "DM      "
+        _sasdata        = io.read(8)    # "SASDATA "
+        ds_sas_version  = io.read(8)    # "9.4     "
+        ds_sas_os       = io.read(8)    # "X64_SRV1"
+        _blanks         = io.read(24)   # "                        "
+        ds_create_date  = io.read(16)   # "26MAY21:15:57:45"
+        # second member header data
+        # "ddMMMyy:hh:mm:ss (16 padding) (40 label) (8 dstype)"
+        ds_modify_date = io.read(16)  # "26MAY21:15:57:45"
+        _blanks        = io.read(16)  # "                "
+        ds_label       = io.read(40)  # "Demographics                            "
+        ds_type        = io.read(8)   # "        "
+        input_encoding != 'binary' and change_encoding ds_label
+        output_encoding != input_encoding and ds_label.encode!(output_encoding)
+        dataset = Dataset.new(ds_name.rstrip, ds_label.rstrip, ds_type.rstrip, ds_create_date, ds_modify_date, ds_sas_version.rstrip, ds_sas_os.rstrip)
+        @library.datasets << dataset
+        # Namestr header record
+        # In this header record, xxxx is the number of variables in the data set,
+        # displayed with blank-padded numeric characters. For example, for 2 variables, xxxx=0002.
+        # xxxx occurs at offset 54 (base 0 as in C language use).
+        namestr_header = io.read(80)
+        namestr_record_start = namestr_header[...NAMESTR_RECORD_START.length]
+        variable_count = namestr_header[NAMESTR_RECORD_START.length, 4]
+        namestr_record_end = namestr_header[-NAMESTR_RECORD_END.length..]
+        namestr_record_start == NAMESTR_RECORD_START or issue "invalid namestr record start", namestr_record_start, NAMESTR_RECORD_START
+        namestr_record_end == NAMESTR_RECORD_END or issue "invalid namestr record end", namestr_record_end, NAMESTR_RECORD_END
+        variable_count =~ NAMESTR_RECORD_VARS_RE or error "invalid namestr variable count #{variable_count.inspect}"
+        variable_count = variable_count.to_i
+        # Namestr records
+        # Each namestr field is 140 bytes long, but the fields are streamed together and broken in 80-byte pieces.
+        # If the last byte of the last namestr field does not fall in the last byte of the 80-byte record, the record is padded with ASCII blanks to 80 bytes.
+        # Here is the C structure definition for the namestr record:
+        #
+        # struct NAMESTR {
+        #   short ntype; /* VARIABLE TYPE: 1=NUMERIC, 2=CHAR */
+        #   short nhfun; /* HASH OF NNAME (always 0) */
+        #   short nlng; /* LENGTH OF VARIABLE IN OBSERVATION */
+        #   short nvar0; /* VARNUM */
+        #   char8 nname; /* NAME OF VARIABLE */
+        #   char40 nlabel; /* LABEL OF VARIABLE */
+        #   char8 nform; /* NAME OF FORMAT */
+        #   short nfl; /* FORMAT FIELD LENGTH OR 0 */
+        #   short nfd; /* FORMAT NUMBER OF DECIMALS */
+        #   short nfj; /* 0=LEFT JUSTIFICATION, 1=RIGHT JUST */
+        #   char nfill[2]; /* (UNUSED, FOR ALIGNMENT AND FUTURE) */
+        #   char8 niform; /* NAME OF INPUT FORMAT */
+        #   short nifl; /* INFORMAT LENGTH ATTRIBUTE */
+        #   short nifd; /* INFORMAT NUMBER OF DECIMALS */
+        #   long npos; /* POSITION OF VALUE IN OBSERVATION */
+        #   char rest[52]; /* remaining fields are irrelevant */
+        # };
+        #
+        # Note that the length given in the last 4 bytes of the member header record
+        # indicates the actual number of bytes for the NAMESTR structure. The size of
+        # the structure listed above is 140 bytes. Under VAX/VMS, the size will be 136
+        # bytes, meaning that the 'rest' variable may be truncated.
+        variable_count.times do
+          dataset.variables << Variable.new(io.read(namestr_record_length), input_encoding: input_encoding, output_encoding: output_encoding)
+        end
+        padding = 80 - (namestr_record_length * variable_count) % 80
+        if padding < 80
+          _padding = io.read(padding)
+        end
+        # Observation header
+        observation_header = io.read(80)
+        observation_header == OBSERVATION_HEADER or issue "invalid observation header"
+        # Data records
+        # Data records are streamed in the same way that namestrs are.
+        # There is ASCII blank padding at the end of the last record if necessary.
+        # There is no special trailing record.
+        # Missing Values
+        # Missing values are written out with the first byte (the exponent) indicating the proper missing values.
+        # All subsequent bytes are 0x00.
+        # The first byte is:
+        # type  byte
+        # ._    0x5f
+        # .     0x2e
+        # .A    0x41
+        # .B    0x42
+        # ...
+        # .Z    0x5a
+        obs_record_length = dataset.obs_record_length
+        record_count = 0
+        # by construction, the 1st obs record starts on a new 80-byte record
+        obs_start_pos = io.pos
+        # puts "obs_record_length = #{obs_record_length} obs_start_pos = #{obs_start_pos}"
+        blank_record = ' '.b * obs_record_length
+        loop do
+          break if io.eof?
+          # read_pos = io.pos
+          buffer = io.read(obs_record_length)
+          # puts "reading at #{read_pos} -> #{buffer.length} bytes #{to_hex(buffer)}"
+          if buffer.length != obs_record_length
+            # puts "#{buffer.length} != #{obs_record_length} -> #{record_count} records"
+            break
+          end
+          if buffer == blank_record
+            read_so_far = io.pos - obs_start_pos
+            _records_read, current_record_position = read_so_far.divmod(80)
+            until_end_of_record = 80 - current_record_position
+            if until_end_of_record > 0
+              buffer = io.read(until_end_of_record)
+              if !buffer.strip.empty?
+                error "non-blank bytes at end of records: #{buffer.inspect}"
+              end
+              break
+            end
+          end
+          record_count += 1
+          obs = []
+          dataset.variables.each do |var|
+            value = buffer[var.position, var.length]
+            if var.type == :char
+              # puts "char: #{value.inspect}"
+              value = value.rstrip
+              input_encoding != 'binary' and change_encoding value
+              output_encoding != input_encoding and value.encode!(output_encoding)
+              obs << value
+            else
+              # display = value.bytes.map { |b| "%02x" % b }.join
+              # puts "num: #{display}"
+              obs << ibm_to_ieee(value)
+              # TODO? option: ._, .A -> .Z to :_, :A, :Z or nil
+              # TODO? option: to_i == to_f => to_i
+            end
+          end
+          dataset.observations << obs
+          # exit
+        end
+        # puts "final pos: #{io.pos} eof: #{io.eof?}"
+      end
+    end
+  end
+  # Convert IBM-format floating point (bytes) to IEEE 754 64-bit (float).
+  def ibm_to_ieee(ibm_bytes)
+    # IBM mainframe:    sign * 0.mantissa * 16 ** (exponent - 64)
+    # Python uses IEEE: sign * 1.mantissa * 2 ** (exponent - 1023)
+    # Pad-out to 8 bytes if necessary. We expect 2 to 8 bytes, but
+    # there's no need to check; bizarre sizes will cause a struct
+    # module unpack error.
+    if ibm_bytes.length < 8
+     ibm_bytes = ibm_bytes.append_as_bytes("\x00\x00\x00\x00\x00\x00\x00\x00")[...8]
+    end
+    # parse the 64 bits of IBM float as one 8-byte unsigned long long
+    ulong = ibm_bytes.unpack1('Q>')
+    # puts "ulong = #{ulong}"
+    # IBM: 1-bit sign, 7-bits exponent, 56-bits mantissa
+    sign = ulong & 0x8000000000000000
+    exponent = (ulong & 0x7f00000000000000) >> 56
+    mantissa = ulong & 0x00ffffffffffffff
+    # puts "sign = #{sign}"
+    # puts "exponent = #{exponent}"
+    # puts "mantissa = #{mantissa}"
+    if mantissa == 0
+      if ibm_bytes[0] == "\x00".b
+        return 0.0
+      elsif ibm_bytes[0] == "\x80".b
+        return -0.0
+      elsif ibm_bytes[0] == '.'.b
+        return nil
+      elsif SPECIAL_MISSING_VALUES.include?(ibm_bytes[0])
+        return :"#{ibm_bytes[0]}"
+      else
+        raise "Neither 'true' zero nor NaN: #{ibm_bytes.inspect}"
+      end
+    end
+    # IBM-format exponent is base 16, so the mantissa can have up to 3
+    # leading zero-bits in the binary mantissa. IEEE format exponent
+    # is base 2, so we don't need any leading zero-bits and will shift
+    # accordingly. This is one of the criticisms of IBM-format, its
+    # wobbling precision.
+    if (ulong & 0x0080000000000000) != 0
+      shift = 3
+    elsif (ulong & 0x0040000000000000) != 0
+      shift = 2
+    elsif (ulong & 0x0020000000000000) != 0
+      shift = 1
+    else
+      shift = 0
+    end
+    mantissa >>= shift
+    # puts "shift = #{shift}"
+    # puts "mantissa = #{mantissa}"
+    # clear the 1 bit to the left of the binary point
+    # this is implicit in IEEE specification
+    mantissa &= 0xffefffffffffffff
+    # puts "mantissa = #{mantissa}"
+    # IBM exponent is excess 64, but we subtract 65, because of the
+    # implicit 1 left of the radix point for the IEEE mantissa
+    exponent -= 65
+    # puts "exponent = #{exponent}"
+    # IBM exponent is base 16, IEEE is base 2, so we multiply by 4
+    exponent <<= 2
+    # puts "exponent = #{exponent}"
+    # IEEE exponent is excess 1023, but we also increment for each
+    # right-shift when aligning the mantissa's first 1-bit
+    exponent += shift + 1023
+    # puts "exponent = #{exponent}"
+    # IEEE: 1-bit sign, 11-bits exponent, 52-bits mantissa
+    # We didn't shift the sign bit, so it's already in the right spot
+    ieee = sign | (exponent << 52) | mantissa
+    # puts "ieee = #{ieee}"
+    result = [ieee].pack('Q>').unpack1('G')
+    # puts "result = #{result}"
+    result
+  end
+  def change_encoding(string)
+    string.force_encoding(input_encoding)
+    unless string.valid_encoding?
+      issue "invalid input encoding #{input_encoding} for #{string.inspect}"
+    end
+  end
+  def issue(message, actual = nil, expected = nil)
+    warn message
+    warn "- expected: #{expected.inspect}" if expected
+    warn "- actual:   #{actual.inspect}" if actual
+  end
+  def error(message)
+    warn "ERROR: #{message}"
+    raise "cannot continue"
+  end
+  def to_hex(byte_string)
+    byte_string.bytes.map { |b| "%02x" % b }.join(' ')
+  end
+end
+end
+end

data/lib/hlsv/xpt/variable.rb ADDED Viewed

@@ -0,0 +1,130 @@
+class SAS
+module XPT
+class Variable
+  attr_reader :name
+  attr_reader :label
+  # :numeric or :char
+  attr_reader :type
+  attr_reader :length
+  attr_reader :varnum
+  # string or nil
+  attr_reader :format
+  # :left or :right
+  attr_reader :format_justification
+  # string or nil
+  attr_reader :informat
+  # position in the observation byte record
+  attr_reader :position
+  NAMESTR_RECORD_TEMPLATE_140 = 's>x2s>2a8a40a8s>3x2a8s>2Nx52'
+  NAMESTR_RECORD_TEMPLATE_136 = 's>x2s>2a8a40a8s>3x2a8s>2Nx48'
+  def initialize(namestr_record, input_encoding:, output_encoding:)
+    template = namestr_record.length == 140 ? NAMESTR_RECORD_TEMPLATE_140 : NAMESTR_RECORD_TEMPLATE_136
+    array = namestr_record.unpack(template)
+    # values as read: [2, 0, 21, 1, "STUDYID ", "Study Identifier                        ", "        ", 0, 0, 0, "        ", 0, 0, 0]
+    ntype,
+    nlng,
+    nvar0,
+    nname,
+    nlabel,
+    nform,
+    nfl,
+    nfd,
+    nfj,
+    niform,
+    nifl,
+    nifd,
+    npos = array
+    # p io.read(2).unpack('s>')   # short   ntype;    /* VARIABLE TYPE: 1=NUMERIC, 2=CHAR */
+    # p io.read(2).unpack('x2')   # short   nhfun;    /* HASH OF NNAME (always 0) */
+    # p io.read(2).unpack('s>')   # short   nlng;     /* LENGTH OF VARIABLE IN OBSERVATION */
+    # p io.read(2).unpack('s>')   # short   nvar0;    /* VARNUM */
+    # p io.read(8).unpack('a8')   # char8   nname;    /* NAME OF VARIABLE */
+    # p io.read(40).unpack('a40') # char40  nlabel;   /* LABEL OF VARIABLE */
+    # p io.read(8).unpack('a8')   # char8   nform;    /* NAME OF FORMAT */
+    # p io.read(2).unpack('s>')   # short   nfl;      /* FORMAT FIELD LENGTH OR 0 */
+    # p io.read(2).unpack('s>')   # short   nfd;      /* FORMAT NUMBER OF DECIMALS */
+    # p io.read(2).unpack('s>')   # short   nfj;      /* 0=LEFT JUSTIFICATION, 1=RIGHT JUST */
+    # p io.read(2).unpack('x2')   # char    nfill[2]; /* (UNUSED, FOR ALIGNMENT AND FUTURE) */
+    # p io.read(8).unpack('a8')   # char8   niform;   /* NAME OF INPUT FORMAT */
+    # p io.read(2).unpack('s>')   # short   nifl;     /* INFORMAT LENGTH ATTRIBUTE */
+    # p io.read(2).unpack('s>')   # short   nifd;     /* INFORMAT NUMBER OF DECIMALS */
+    # p io.read(8).unpack('N')    # long    npos;     /* POSITION OF VALUE IN OBSERVATION */
+    # p io.read(52).unpack('x52') # char    rest[52]; /* remaining fields are irrelevant */
+    # puts "ntype  = #{ntype.inspect}   VARIABLE TYPE: 1=NUMERIC, 2=CHAR"
+    # puts "nlng   = #{nlng.inspect}    LENGTH OF VARIABLE IN OBSERVATION"
+    # puts "nvar0  = #{nvar0.inspect}   VARNUM"
+    # puts "nname  = #{nname.inspect}   NAME OF VARIABLE"
+    # puts "nlabel = #{nlabel.inspect}  LABEL OF VARIABLE"
+    # puts "nform  = #{nform.inspect}   NAME OF FORMAT"
+    # puts "nfl    = #{nfl.inspect}     FORMAT FIELD LENGTH OR 0"
+    # puts "nfd    = #{nfd.inspect}     FORMAT NUMBER OF DECIMALS"
+    # puts "nfj    = #{nfj.inspect}     0=LEFT JUSTIFICATION, 1=RIGHT JUST"
+    # puts "niform = #{niform.inspect}  NAME OF INPUT FORMAT"
+    # puts "nifl   = #{nifl.inspect}    INFORMAT LENGTH ATTRIBUTE"
+    # puts "nifd   = #{nifd.inspect}    INFORMAT NUMBER OF DECIMALS"
+    # puts "npos   = #{npos.inspect}    POSITION OF VALUE IN OBSERVATION"
+    errors = []
+    @name = nname.rstrip
+    @label = nlabel.rstrip
+    unless input_encoding != 'binary'
+      @label.force_encoding(input_encoding)
+      unless @label.valid_encoding?
+        warn "invalid encoding #{input_encoding} for #{@label.inspect}"
+      end
+    end
+    input_encoding == output_encoding or
+      @label.encode!(output_encoding)
+    case ntype
+    when 1 then @type = :numeric
+    when 2 then @type = :char
+    else errors << "invalid type #{ntype.inspect}"
+    end
+    @length = nlng
+    @varnum = nvar0
+    if nform.strip.empty?
+      @format = nil
+      @format_justification = nil
+    else
+      @format = "#{nform.strip}#{nfl}."
+      @format << nfd.to_s if nfd > 0
+      case nfj
+      when 0 then @format_justification = :left
+      when 1 then @format_justification = :right
+      else errors << "invalid justification #{nfj.inspect}"
+      end
+    end
+    if niform.strip.empty?
+      @informat = nil
+    else
+      @informat = "#{niform.strip}#{nifl}."
+      @informat << nifd.to_s if nifd > 0
+    end
+    @position = npos
+  end
+end
+end
+end

data/lib/hlsv/xpt.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# Copyright (c) 2026 AdClin
+# Licensed under the GNU Affero General Public License v3.0 or later.
+# See the LICENSE file for details.
+require_relative 'xpt/library'
+require_relative 'xpt/dataset'
+require_relative 'xpt/variable'
+require_relative 'xpt/reader'
+# Declared empty here: this file only loads the xpt/* files listed above and
+# adds nothing to the Hlsv namespace itself.
+module Hlsv
+end

data/lib/hlsv.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+module Hlsv
+  class Error < StandardError; end
+  INSTALL_ROOT = File.expand_path("#{File.dirname(__FILE__)}/..")
+  def self.start_server(host: '127.0.0.1', port: 4567)
+    ensure_config_file
+    WebApp.run! host: host, port: port
+  end
+  def self.ensure_config_file
+    unless File.exist?(config_path)
+      if File.exist?(default_config_path)
+        config_default = YAML.load_file(default_config_path)
+        File.write(config_path, config_default.to_yaml)
+        puts "✓ config.yaml created from config.default.yaml"
+      else
+        puts "⚠ WARNING: Neither config.yaml nor config.default.yaml exists!"
+      end
+    end
+  end
+  def self.config_path
+    "#{Dir.pwd}/config.yaml"
+  end
+  def self.default_config_path
+    "#{INSTALL_ROOT}/config.default.yaml"
+  end
+  def self.license_path
+    "#{INSTALL_ROOT}/LICENSE"
+  end
+end
+require 'sinatra/base'
+require_relative "hlsv/version"
+require_relative "hlsv/find_keys"
+require_relative "hlsv/html2word"
+require_relative "hlsv/mon_script"
+require_relative "hlsv/version"
+require_relative "hlsv/web_app"
+require_relative "hlsv/xpt"
+require_relative "hlsv/cli"

data/public/Contact-LOGO.png ADDED Viewed

Binary file