RubyGems - ndr_import - Versions diffs - 3.0.0 - Mend

ndr_import 3.0.0

Files changed (103) hide show

checksums.yaml +15 -0
data/.gitignore +14 -0
data/.rubocop.yml +27 -0
data/.ruby-version +1 -0
data/.travis.yml +22 -0
data/CODE_OF_CONDUCT.md +13 -0
data/Gemfile +4 -0
data/Guardfile +16 -0
data/LICENSE.txt +21 -0
data/README.md +69 -0
data/Rakefile +13 -0
data/code_safety.yml +374 -0
data/gemfiles/Gemfile.rails32 +5 -0
data/gemfiles/Gemfile.rails32.lock +142 -0
data/gemfiles/Gemfile.rails41 +5 -0
data/gemfiles/Gemfile.rails41.lock +145 -0
data/gemfiles/Gemfile.rails42 +5 -0
data/gemfiles/Gemfile.rails42.lock +145 -0
data/lib/ndr_import.rb +13 -0
data/lib/ndr_import/csv_library.rb +40 -0
data/lib/ndr_import/file/all.rb +8 -0
data/lib/ndr_import/file/base.rb +76 -0
data/lib/ndr_import/file/delimited.rb +86 -0
data/lib/ndr_import/file/excel.rb +131 -0
data/lib/ndr_import/file/pdf.rb +38 -0
data/lib/ndr_import/file/registry.rb +50 -0
data/lib/ndr_import/file/text.rb +52 -0
data/lib/ndr_import/file/word.rb +30 -0
data/lib/ndr_import/file/zip.rb +67 -0
data/lib/ndr_import/helpers/file/delimited.rb +105 -0
data/lib/ndr_import/helpers/file/excel.rb +181 -0
data/lib/ndr_import/helpers/file/pdf.rb +29 -0
data/lib/ndr_import/helpers/file/word.rb +27 -0
data/lib/ndr_import/helpers/file/xml.rb +45 -0
data/lib/ndr_import/helpers/file/zip.rb +44 -0
data/lib/ndr_import/mapper.rb +220 -0
data/lib/ndr_import/mapping_error.rb +5 -0
data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
data/lib/ndr_import/non_tabular/line.rb +46 -0
data/lib/ndr_import/non_tabular/mapping.rb +35 -0
data/lib/ndr_import/non_tabular/record.rb +99 -0
data/lib/ndr_import/non_tabular/table.rb +193 -0
data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
data/lib/ndr_import/standard_mappings.rb +23 -0
data/lib/ndr_import/table.rb +179 -0
data/lib/ndr_import/version.rb +4 -0
data/ndr_import.gemspec +44 -0
data/test/file/base_test.rb +54 -0
data/test/file/delimited_test.rb +143 -0
data/test/file/excel_test.rb +85 -0
data/test/file/pdf_test.rb +35 -0
data/test/file/registry_test.rb +60 -0
data/test/file/text_test.rb +92 -0
data/test/file/word_test.rb +35 -0
data/test/file/zip_test.rb +47 -0
data/test/helpers/file/delimited_test.rb +113 -0
data/test/helpers/file/excel_test.rb +97 -0
data/test/helpers/file/pdf_test.rb +26 -0
data/test/helpers/file/word_test.rb +26 -0
data/test/helpers/file/xml_test.rb +131 -0
data/test/helpers/file/zip_test.rb +75 -0
data/test/mapper_test.rb +551 -0
data/test/non_tabular/mapping_test.rb +36 -0
data/test/non_tabular/table_test.rb +510 -0
data/test/non_tabular_file_helper_test.rb +501 -0
data/test/readme_test.rb +53 -0
data/test/resources/bomd.csv +3 -0
data/test/resources/broken.csv +3 -0
data/test/resources/filesystem_paths.yml +26 -0
data/test/resources/flat_file.pdf +0 -0
data/test/resources/flat_file.txt +27 -0
data/test/resources/flat_file.yml +20 -0
data/test/resources/hello_utf16be.txt +0 -0
data/test/resources/hello_utf16le.txt +0 -0
data/test/resources/hello_utf8.txt +2 -0
data/test/resources/hello_windows.txt +2 -0
data/test/resources/hello_world.doc +0 -0
data/test/resources/hello_world.pdf +0 -0
data/test/resources/hello_world.txt +2 -0
data/test/resources/high_ascii_delimited.txt +2 -0
data/test/resources/malformed.xml +6 -0
data/test/resources/normal.csv +3 -0
data/test/resources/normal.csv.zip +0 -0
data/test/resources/normal_pipe.csv +3 -0
data/test/resources/normal_thorn.csv +3 -0
data/test/resources/not_a_pdf.pdf +0 -0
data/test/resources/not_a_word_file.doc +0 -0
data/test/resources/sample_xls.xls +0 -0
data/test/resources/sample_xlsx.xlsx +0 -0
data/test/resources/standard_mappings.yml +39 -0
data/test/resources/txt_file_xls_extension.xls +1 -0
data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
data/test/resources/utf-16be_xml.xml +0 -0
data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
data/test/resources/utf-16le_xml.xml +0 -0
data/test/resources/utf-8_xml.xml +9 -0
data/test/resources/windows-1252_xml.xml +9 -0
data/test/resources/windows.csv +5 -0
data/test/resources/xlsx_file_xls_extension.xls +0 -0
data/test/standard_mappings_test.rb +22 -0
data/test/table_test.rb +288 -0
data/test/test_helper.rb +13 -0
metadata +443 -0

data/lib/ndr_import/non_tabular/mapping.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# encoding: UTF-8
+require 'ndr_import/non_tabular/table'
+module NdrImport
+  module NonTabular
+    # This class stores the mapping used to break an incoming file into multiple rows/records
+    class Mapping < Table
+      def self.all_valid_options
+        super + %w(non_tabular_row)
+      end
+      def initialize(options)
+        non_tabular_mappings = options['non_tabular_row']
+        if non_tabular_mappings
+          initialize_non_tabular_mappings(non_tabular_mappings)
+        else
+          # validate presence of non_tabular_row
+          fail NdrImport::MappingError,
+               I18n.t('mapping.errors.missing_non_tabular_row')
+        end
+        super(options)
+      end
+      private
+      def initialize_non_tabular_mappings(non_tabular_mappings)
+        NON_TABULAR_OPTIONS.each do |key|
+          next unless non_tabular_mappings[key]
+          instance_variable_set("@#{key}", non_tabular_mappings[key])
+        end
+      end
+    end
+  end
+end

data/lib/ndr_import/non_tabular/record.rb ADDED Viewed

@@ -0,0 +1,99 @@
+# encoding: UTF-8
+module NdrImport
+  module NonTabular
+    # This class behaves like an array of NdrImport::NonTabular::Line elements
+    # that contains all the source lines of text that relate to a single record of data.
+    # It also encapsulates the logic that tabulates the data.
+    class Record
+      attr_reader :lines
+      def initialize
+        @lines = []
+      end
+      def <<(line)
+        return if line.removed
+        line.in_a_record = true
+        line.record_line_number = @lines.length
+        @lines << line
+      end
+      def empty?
+        @lines.empty?
+      end
+      # Call this if it turns out that this is not a record.
+      # All lines will be flagged accordingly.
+      def not_a_record!
+        @lines.each { |line| line.in_a_record = false }
+      end
+      # Returns an array of "cells" for a given array of lines of a file that represent
+      # a single "row" of data. Allowing the output to be mapped by the standard mapper.
+      #
+      # ==== Signature
+      #
+      #   tabulate(mappings)
+      #
+      # ==== Examples
+      #
+      #   If the YAML mapping is
+      #   ---
+      #   - standard_mapping: nhsnumber
+      #     non_tabular_cell:
+      #       lines: 0
+      #       capture:
+      #       - !ruby/regexp /^D\|([^|]*).*/
+      #   - column: fulltextreport
+      #     non_tabular_cell:
+      #       lines: !ruby/range
+      #         begin: 1
+      #         end: -1
+      #         excl: false
+      #       capture: !ruby/regexp /^(?:R|\d+)\|(.*)$/i
+      #       join: \n
+      #
+      #   lines = [
+      #     "D|1111111111|...",
+      #     "R|This is a",
+      #     "1|multiline report"
+      #   ]
+      #
+      #   tabulate(mappings)
+      #
+      #   # =>
+      #   [
+      #     "1111111111",
+      #     "This is a\nmultiline report"
+      #   ]
+      #
+      def tabulate(mappings)
+        cells = []
+        mappings.each do |column_mapping|
+          begin
+            matches = get_matches(column_mapping)
+            # Join the non-blank lines together and add to the array of cells
+            cells << matches.select { |value| !value.blank? }.join(column_mapping.join || '')
+          rescue RegexpRange::PatternMatchError
+            cells << nil
+          end
+        end
+        cells
+      end
+      # returns an array of matches from within the captured lines
+      def get_matches(column_mapping)
+        matching_lines = column_mapping.matching_lines(@lines)
+        # loop through the specified line (or lines)
+        matches = Array(@lines[matching_lines]).map do |line|
+          line.captured_for(column_mapping.name)
+          value = column_mapping.capture_value(line)
+          line.matches_for(column_mapping.name, value)
+          value
+        end
+        matches
+      end
+    end
+  end
+end

data/lib/ndr_import/non_tabular/table.rb ADDED Viewed

@@ -0,0 +1,193 @@
+require 'ndr_import/table'
+module NdrImport
+  module NonTabular
+    # This class maintains the state of a non tabular table mapping and encapsulates
+    # the logic required to transform a table of data into "records". Particular
+    # attention has been made to use enumerables throughout to help with the
+    # transformation of large quantities of data.
+    class Table < ::NdrImport::Table
+      require 'i18n'
+      require 'ndr_support/regexp_range' # TODO: unneeded?
+      require 'ndr_support/utf8_encoding'
+      require 'ndr_import/non_tabular/column_mapping'
+      require 'ndr_import/non_tabular/record'
+      require 'ndr_import/non_tabular/line'
+      include UTF8Encoding
+      NON_TABULAR_OPTIONS = %w(capture_start_line start_line_pattern end_line_pattern remove_lines
+                               start_in_a_record end_in_a_record)
+      def self.all_valid_options
+        super - %w(tablename_pattern header_lines footer_lines) + NON_TABULAR_OPTIONS
+      end
+      attr_reader(*NON_TABULAR_OPTIONS)
+      attr_reader :non_tabular_lines
+      def header_lines
+        0
+      end
+      def footer_lines
+        0
+      end
+      def initialize(options = {})
+        super(options)
+        validate_presence_of_start_line_pattern
+      end
+      def tablename_pattern=(_value)
+        fail NdrImport::MappingError, 'Should not define tablename_pattern'
+      end
+      def validate_presence_of_start_line_pattern
+        return if @start_line_pattern
+        fail NdrImport::MappingError,
+             I18n.t('mapping.errors.missing_start_line_pattern')
+      end
+      # This method transforms a table of data, given a line array/enumerator and yields
+      # klass, fields and index (input row number) for each record that it would create
+      # as a result of the transformation process.
+      def transform(lines, &block)
+        return enum_for(:transform, lines) unless block
+        self.non_tabular_lines = ensure_utf8_enum!(lines)
+        remove_unwanted_lines
+        super(read_non_tabular_array, &block)
+      end
+      def validate_header(_line, _column_mappings)
+        @header_valid = true
+      end
+      protected
+      def ensure_utf8_enum!(lines)
+        return enum_for(:ensure_utf8_enum!, lines) unless block_given?
+        lines.each do |line|
+          # puts 'ensure_utf8_object!'
+          yield ensure_utf8_object!(line)
+        end
+      end
+      # This method flages unwanted lines, typically page headers and footers as removed
+      # preventing them from being captured in the non tabular record. Especially useful
+      # when there page headers and footers that are out of step with the start and end
+      # of each record and could therefore appear anywhere in an individual record if kept.
+      def remove_unwanted_lines
+        return unless @remove_lines.is_a?(Hash)
+        @non_tabular_lines.each_with_index do |_line, i|
+          @remove_lines.each do |_key, lines_to_remove|
+            comparable_lines = @non_tabular_lines[i, lines_to_remove.length]
+            next unless lines_equal(comparable_lines, lines_to_remove)
+            # All lines are equal, so flag them as removed
+            comparable_lines.each { |line| line.removed = true }
+          end
+        end
+      end
+      def read_non_tabular_array
+        @tabular_array = []
+        @in_a_record = @start_in_a_record
+        @non_tabular_record = NdrImport::NonTabular::Record.new
+        partition_and_process_non_tabular_lines
+        process_end_of_record
+        @tabular_array
+      end
+      # Reads the array of lines, looking to see if a line matches the start_line_pattern,
+      # identifying the start of a record. It then collects all the lines until a line
+      # matches the end_line_pattern (if defined, otherwise when it matches the next
+      # start_line_pattern) and sends these line to NdrImport::NonTabular::Record#tabulate.
+      #
+      # NOTE: Currently the end line is consumed and does not form part of the
+      # collected array.
+      def partition_and_process_non_tabular_lines
+        non_tabular_lines.each do |line|
+          if line =~ @start_line_pattern
+            # This is a start line
+            start_record(line)
+          elsif line =~ @end_line_pattern
+            # This is an end line
+            end_record
+          else
+            @non_tabular_record << line if @in_a_record
+          end
+        end
+      end
+      # Checks to see if we get the start of a new record before getting the end of the previous
+      # one and fails if so. Otherwise it tabulates the previous record
+      def start_record(line)
+        if @end_line_pattern
+          fail NdrImport::MappingError,
+               I18n.t('mapping.errors.start_pattern_before_end') if @in_a_record
+        else
+          # No endline mapping
+          @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+        end
+        @non_tabular_record = NdrImport::NonTabular::Record.new
+        @non_tabular_record << line if @capture_start_line
+        @in_a_record = true
+      end
+      # Tabulate the record (if in one), flagged it as no longer being in a record
+      # and set the record to be a new one.
+      def end_record
+        @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+        @in_a_record = false
+        @non_tabular_record = NdrImport::NonTabular::Record.new
+      end
+      # If the non-tabular data ends in a record (i.e. the last record is terminated by the EOF)
+      # then we need to process the last record manually or flag those lines as not being part
+      # of a record
+      def process_end_of_record
+        return if @non_tabular_record.empty?
+        if @end_in_a_record
+          @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+        else
+          @non_tabular_record.not_a_record!
+        end
+      end
+      # Store the source lines as instances of NdrImport::NonTabular::Line
+      def non_tabular_lines=(lines)
+        @non_tabular_lines = lines.map.with_index do |line, i|
+          NdrImport::NonTabular::Line.new(line, i)
+        end
+      end
+      # Create and memoize the column mappings
+      def column_mappings
+        @column_mappings ||= raw_column_mappings.map do |column_mapping|
+          NdrImport::NonTabular::ColumnMapping.new(column_mapping)
+        end
+      end
+      def raw_column_mappings
+        @columns || []
+      end
+      # This method compares two arrays, where the first must be an array of
+      # NdrImport::NonTabular::Line or string elements
+      # and the second can be a mix of strings and/or regular expressions
+      def lines_equal(lines, other_lines)
+        return false unless lines.length == other_lines.length
+        lines.each_with_index.map do |line, i|
+          other_line = other_lines[i]
+          other_line.is_a?(Regexp) ? line.to_s =~ other_line : line.to_s == other_line
+        end.all?
+      end
+    end
+  end
+end

data/lib/ndr_import/non_tabular_file_helper.rb ADDED Viewed

@@ -0,0 +1,160 @@
+# encoding: UTF-8
+module NdrImport
+  # This mixin adds (multiline) non-tabular file functionality to unified importers.
+  # It provides a file reader method and method to capture the rawtext value
+  # appropriately. These methods can be overridden or aliased as required.
+  #
+  # The YAML mapping must define the start_line_pattern which identifies the start
+  # of a multiline record (or "row") and can optionally define an end_line_pattern.
+  module NonTabularFileHelper
+    require 'i18n'
+    require 'ndr_support/regexp_range' # TODO: unneeded?
+    require 'ndr_support/utf8_encoding'
+    require 'ndr_import/non_tabular/column_mapping'
+    require 'ndr_import/non_tabular/record'
+    require 'ndr_import/non_tabular/line'
+    require 'ndr_import/non_tabular/mapping'
+    include UTF8Encoding
+    attr_reader :non_tabular_lines
+    protected
+    # Reads a non-tabular text file and returns an array of tabulated rows of data,
+    # where each row is an array of cells.
+    def read_non_tabular_file
+      self.non_tabular_lines = ensure_utf8_object! SafeFile.readlines(filename)
+      remove_unwanted_lines
+      read_non_tabular_array
+    end
+    # Reads a string and returns an array of tabulated data. Use only for prototyping.
+    def read_non_tabular_string(text)
+      self.non_tabular_lines = ensure_utf8_object!(text).split("\n")
+      remove_unwanted_lines
+      read_non_tabular_array
+    end
+    # This method flages unwanted lines, typically page headers and footers as removed
+    # preventing them from being captured in the non tabular record. Especially useful
+    # when there page headers and footers that are out of step with the start and end
+    # of each record and could therefore appear anywhere in an individual record if kept.
+    def remove_unwanted_lines
+      return unless row_mapping.remove_lines.is_a?(Hash)
+      @non_tabular_lines.each_with_index do |_line, i|
+        row_mapping.remove_lines.each do |_key, lines_to_remove|
+          comparable_lines = @non_tabular_lines[i, lines_to_remove.length]
+          next unless lines_equal(comparable_lines, lines_to_remove)
+          # All lines are equal, so flag them as removed
+          comparable_lines.each { |line| line.removed = true }
+        end
+      end
+    end
+    def read_non_tabular_array
+      @tabular_array = []
+      @in_a_record = row_mapping.start_in_a_record
+      @non_tabular_record = NdrImport::NonTabular::Record.new
+      partition_and_process_non_tabular_lines
+      process_end_of_record
+      # We change the mapping instance variable to only contain the column mappings.
+      # This enables the standard mapper to work unaltered.
+      @mappings = raw_column_mappings
+      @tabular_array
+    end
+    # Reads the array of lines, looking to see if a line matches the start_line_pattern,
+    # identifying the start of a record. It then collects all the lines until a line
+    # matches the end_line_pattern (if defined, otherwise when it matches the next
+    # start_line_pattern) and sends these line to NdrImport::NonTabular::Record#tabulate.
+    #
+    # NOTE: Currently the end line is consumed and does not form part of the
+    # collected array.
+    def partition_and_process_non_tabular_lines
+      non_tabular_lines.each do |line|
+        if line =~ row_mapping.start_line_pattern
+          # This is a start line
+          start_record(line)
+        elsif line =~ row_mapping.end_line_pattern
+          # This is an end line
+          end_record
+        else
+          @non_tabular_record << line if @in_a_record
+        end
+      end
+    end
+    # Checks to see if we get the start of a new record before getting the end of the previous
+    # one and fails if so. Otherwise it tabulates the previous record
+    def start_record(line)
+      if row_mapping.end_line_pattern
+        fail NdrImport::MappingError,
+             I18n.t('mapping.errors.start_pattern_before_end') if @in_a_record
+      else
+        # No endline mapping
+        @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+      end
+      @non_tabular_record = NdrImport::NonTabular::Record.new
+      @non_tabular_record << line if row_mapping.capture_start_line
+      @in_a_record = true
+    end
+    # Tabulate the record (if in one), flagged it as no longer being in a record
+    # and set the record to be a new one.
+    def end_record
+      @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+      @in_a_record = false
+      @non_tabular_record = NdrImport::NonTabular::Record.new
+    end
+    # If the non-tabular data ends in a record (i.e. the last record is terminated by the EOF)
+    # then we need to process the last record manually or flag those lines as not being part
+    # of a record
+    def process_end_of_record
+      return if @non_tabular_record.empty?
+      if row_mapping.end_in_a_record
+        @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
+      else
+        @non_tabular_record.not_a_record!
+      end
+    end
+    # Store the source lines as instances of NdrImport::NonTabular::Line
+    def non_tabular_lines=(lines)
+      @non_tabular_lines = lines.map.with_index do |line, i|
+        NdrImport::NonTabular::Line.new(line, i)
+      end
+    end
+    # Create and memoize the row mappings
+    def row_mapping
+      @row_mapping ||= NdrImport::NonTabular::Mapping.new(@mappings)
+    end
+    # Create and memoize the column mappings
+    def column_mappings
+      @column_mappings ||= raw_column_mappings.map do |column_mapping|
+        NdrImport::NonTabular::ColumnMapping.new(column_mapping)
+      end
+    end
+    def raw_column_mappings
+      @mappings['columns'] || []
+    end
+    # This method compares two arrays, where the first must be an array of
+    # NdrImport::NonTabular::Line or string elements
+    # and the second can be a mix of strings and/or regular expressions
+    def lines_equal(lines, other_lines)
+      return false unless lines.length == other_lines.length
+      lines.each_with_index.map do |line, i|
+        other_line = other_lines[i]
+        other_line.is_a?(Regexp) ? line.to_s =~ other_line : line.to_s == other_line
+      end.all?
+    end
+  end
+end