RubyGems - nebulous - Versions diffs - 0.0.2 - Mend

nebulous 0.0.2

Files changed (31) hide show

checksums.yaml +7 -0
data/.gitignore +34 -0
data/.rspec +1 -0
data/.rubocop.yml +265 -0
data/.travis.yml +5 -0
data/Gemfile +3 -0
data/Gemfile.lock +54 -0
data/LICENSE +22 -0
data/README.md +29 -0
data/lib/nebulous.rb +15 -0
data/lib/nebulous/chunk.rb +14 -0
data/lib/nebulous/delimiter_detector.rb +68 -0
data/lib/nebulous/parser.rb +129 -0
data/lib/nebulous/row.rb +47 -0
data/lib/nebulous/version.rb +3 -0
data/nebulous.gemspec +27 -0
data/spec/nebulous/chunk_spec.rb +6 -0
data/spec/nebulous/delimiter_detector_spec.rb +161 -0
data/spec/nebulous/parser_spec.rb +287 -0
data/spec/nebulous/row_spec.rb +6 -0
data/spec/spec_helper.rb +7 -0
data/spec/support/assets/clrf-batches.csv +1001 -0
data/spec/support/assets/cr-comma-delimited.csv +1 -0
data/spec/support/assets/cr-lf-comma-delimited.csv +1 -0
data/spec/support/assets/crlf-comma-delimited.csv +21 -0
data/spec/support/assets/crlf-dolla-delimited.csv +21 -0
data/spec/support/assets/crlf-pipe-delimited.csv +21 -0
data/spec/support/assets/crlf-semicolon-delimited.csv +21 -0
data/spec/support/assets/crlf-tab-delimited.tsv +21 -0
data/spec/support/assets/no-headers.csv +20 -0
metadata +171 -0

@@ -0,0 +1,14 @@
module Nebulous
  # An Array that also carries an options hash, so a batch of rows can know
  # its own target size.
  class Chunk < Array
    # @return [Hash] the options split off the constructor arguments
    attr_reader :options

    # Separates a trailing options hash from +args+ and forwards whatever is
    # left to Array#initialize.
    #
    # NOTE(review): +extract_options!+ is not core Ruby; presumably it is the
    # ActiveSupport Array extension - confirm it is loaded before this class
    # is instantiated.
    def initialize(*args)
      @options = args.extract_options!
      super(*args)
    end

    # @return [Boolean] true only when a :size option was supplied and the
    #   chunk currently holds exactly that many elements
    def full?
      return false unless options.has_key?(:size)

      options[:size] == size
    end
  end
end

data/lib/nebulous/delimiter_detector.rb ADDED

@@ -0,0 +1,68 @@
module Nebulous
  # Guesses the column and row separators of a delimited text file.
  #
  # The column separator is chosen by counting candidate delimiters on the
  # first line of the file; the row separator is derived from the output of
  # the system +file+ command.
  class DelimiterDetector
    # Pairs of [pattern matched against the +file+ command output, row
    # separator to report]. The first entry doubles as the fallback.
    LINE_DELIMITERS = [
      [/CRLF/, "\n"],
      [/CR, LF/, "\r"],
      [/CR(?!,)/, "\r"]
    ].freeze

    # Candidate column separators, tried in this order.
    COLUMN_DELIMITERS = [',', ';', "\t", '|'].freeze

    # @return [String] path of the file being inspected
    attr_reader :path

    # @param path [String] path to an existing file
    # @param args [Array] optionally ends with an options hash; the keys read
    #   by this class are :line_delimiters, :column_delimiters and :encoding
    # @raise [ArgumentError] when +path+ does not exist
    def initialize(path, *args)
      @path = path
      @options = args.extract_options!
      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      raise ArgumentError, "file not found: #{@path}" unless File.exist?(@path)
    end

    # @return [Hash] +{ col_sep: ..., row_sep: ... }+
    def detect
      { col_sep: detect_column_delimiter,
        row_sep: detect_line_delimiter }
    end

    # Counts how often each candidate splits the first line and returns the
    # candidate with the highest count. Ties go to the candidate that appears
    # later in the list, because the [count, index] pairs are compared as
    # arrays.
    #
    # @return [String]
    def detect_column_delimiter
      first_line = readline
      column_delimiters.each_with_index do |delimiter, index|
        counts[index] = first_line.split(delimiter).length - 1
      end
      winner = counts.each_with_index.max[1]
      column_delimiters[winner]
    end

    # Runs +file <path>+ and returns the separator of the first pattern that
    # matches its output, or the first configured separator when none match.
    #
    # @return [String]
    def detect_line_delimiter
      res = Cocaine::CommandLine.new('file', ':path').run(path: path).chomp
      matches = line_delimiters.map do |sep|
        sep[1] if res =~ sep[0]
      end.compact
      matches.first || line_delimiters[0][1]
    end

    private

    def line_delimiters
      @options.fetch(:line_delimiters, LINE_DELIMITERS)
    end

    def column_delimiters
      @options.fetch(:column_delimiters, COLUMN_DELIMITERS)
    end

    def encoding
      @options.fetch(:encoding, Encoding::UTF_8.to_s)
    end

    # One counter per candidate column delimiter, all starting at zero.
    def counts
      @counts ||= column_delimiters.map { 0 }
    end

    # First line of the file (terminator included), transcoded to the
    # configured encoding.
    def readline
      File.open(path, &:readline).encode(encoding, invalid: :replace)
    end
  end
end

data/lib/nebulous/parser.rb ADDED

@@ -0,0 +1,129 @@
module Nebulous
  # Reads delimited text line by line from a path or an IO-like object and
  # collects the parsed rows, optionally handing them to a block in batches.
  class Parser
    # Defaults merged under the caller's options.
    #
    # col_sep / row_sep - separators; detected from the file when nil
    # quote_char        - used to decide whether a record spans several lines
    # comment_exp       - not read here; travels to Row inside the options
    # chunk             - batch size for block-based processing, or false
    # headers           - whether the first line holds column names
    # mapping           - Hash of { header => replacement key }; rows keep
    #                     only the mapped keys
    # limit             - stop after this many rows, or false for no limit
    # remove_empty_values - not read anywhere in this class
    # encoding          - encoding used to open and transcode the input
    DEFAULT_OPTIONS = {
      col_sep: nil,
      row_sep: nil,
      quote_char: '"',
      comment_exp: /^#/,
      chunk: false,
      headers: true,
      mapping: nil,
      limit: false,
      remove_empty_values: true,
      encoding: Encoding::UTF_8.to_s
    }.freeze

    attr_reader :file
    attr_reader :options

    # @param file [String, #readline] a path to open, or an object that
    #   already responds to +readline+
    # @param args [Array] optionally ends with an options hash that is merged
    #   over DEFAULT_OPTIONS
    def initialize(file, *args)
      opts = args.extract_options!
      @options = OpenStruct.new(DEFAULT_OPTIONS.merge(opts))
      @file = read_input(file)
      merge_delimiters
    end

    # Parses the input from the current position. When a block is given and
    # the :chunk option is set, each full batch (and the final partial batch
    # at end of input) is yielded as an array of rows converted with +to_a+.
    # The input is rewound and internal state cleared afterwards, even when
    # parsing raises.
    #
    # @return [Array] the rows still held in the current chunk when the loop
    #   ends (every row when nothing was yielded)
    def process(&block)
      @index = 0
      read_headers
      iterate(&block)
    ensure
      reset
      file.rewind
    end

    # @return [Hash] separators detected from the file on disk (memoized)
    def delimiters
      @delimiters ||= DelimiterDetector.new(file.path).detect
    end

    private

    def reset
      @index = 0
      @headers = nil
      @chunk = nil
    end

    def chunk
      @chunk ||= Chunk.new chunk_options
    end

    def read_headers
      @headers ||= Row.headers(readline, options) if options[:headers]
    end

    def iterate(&block)
      until file.eof?
        break if limit?

        chunk << replace_keys(parse_row.merge(@headers))
        yield_chunk(chunk, &block) if block_given? && options.chunk
      end
      # @chunk is nil when nothing was read or the last batch was just
      # yielded; nil.to_a is [].
      @chunk.to_a
    end

    # Advances and returns the count of rows parsed so far.
    def sequence
      @index += 1
    end

    def limit?
      options.limit && options.limit == @index
    end

    def parse_row
      sequence
      Row.parse(read_complete_line, options)
    end

    def yield_chunk(chunk, &_block)
      return unless chunk.full? || file.eof?

      yield chunk.map(&:to_a)
      @chunk = nil
    end

    def read_input(input)
      input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
    end

    # Reads one logical record: physical lines are appended while the number
    # of quote characters seen so far is odd. Reading stops at end of input
    # so that a stray quote near the end no longer raises EOFError; the
    # partial record is returned for the row parser to deal with.
    #
    # NOTE(review): each physical line is chomped before being appended, so
    # a line break inside a quoted field is dropped from the value - confirm
    # whether that is intended.
    def read_complete_line
      ln = readline
      ln << readline while unbalanced_quotes?(ln) && !file.eof?
      ln
    end

    def unbalanced_quotes?(str)
      str.count(options.quote_char).odd?
    end

    def readline
      file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
    end

    def encoding
      options.encoding
    end

    # Fills in whichever separators the caller left unset; detection only
    # runs when at least one of them is missing.
    def merge_delimiters
      options.row_sep ||= delimiters[:row_sep]
      options.col_sep ||= delimiters[:col_sep]
    end

    def line_terminator
      options.row_sep
    end

    def chunk_options
      options.chunk ? { size: options.chunk.to_i } : {}
    end

    # Renames keys according to the :mapping option and drops every key the
    # mapping does not mention. Rows pass through untouched without a mapping.
    def replace_keys(row)
      mapping = options.mapping
      return row unless mapping

      row.map do |key, value|
        [mapping[key], value] if mapping.key?(key)
      end.compact.to_h
    end
  end
end

data/lib/nebulous/row.rb ADDED

@@ -0,0 +1,47 @@
module Nebulous
  # One parsed line of delimited text, held as an Array of field values.
  class Row < Array
    # Parses a header line into a Hash whose keys and values are both the
    # normalized header symbols.
    #
    # Fields are stringified first because +parse+ converts numeric-looking
    # text to numbers, which do not respond to +parameterize+.
    #
    # NOTE(review): +parameterize+ and +underscore+ are not core Ruby;
    # presumably they come from ActiveSupport - confirm it is loaded.
    #
    # @param str [String] raw header line
    # @param opts [#comment_exp, #col_sep, #to_h] parser options
    # @return [Hash{Symbol => Symbol}]
    def self.headers(str, opts)
      keys = parse(str, opts).map do |name|
        name.to_s.parameterize.underscore.to_sym
      end
      keys.zip(keys).to_h
    end

    # Splits one line into fields, strips surrounding whitespace from each
    # and converts numeric-looking text to numbers. The input string is left
    # untouched, so frozen strings are accepted.
    #
    # @param str [String] raw line; text matching +opts.comment_exp+ is
    #   removed (only the matched text, not the rest of the line)
    # @param opts [#comment_exp, #col_sep, #to_h] parser options; :col_sep,
    #   :row_sep and :quote_char are forwarded to CSV
    # @return [Row]
    def self.parse(str, opts)
      line = str.gsub(opts.comment_exp, '').chomp
      begin
        args = opts.to_h.slice(:col_sep, :row_sep, :quote_char)
        # CSV takes its options as keywords; a positional hash is rejected
        # on Ruby 3.
        data = CSV.parse_line(line, **args)
      rescue CSV::MalformedCSVError
        # Fallback for lines CSV refuses: split on separators that are not
        # inside a double-quoted section. The separator is escaped so that
        # characters such as "|" or "$" are matched literally.
        exp = /(#{Regexp.escape(opts.col_sep.to_s)})(?=(?:[^"]|"[^"]*")*$)/
        data = line.gsub(exp, "\0").split(/\0/)
      end
      # CSV returns nil for an empty line and nil for each empty field.
      data = Array(data).map { |value| value && value.strip }
      new(data).to_numeric
    end

    # @return [Row] a copy in which strings that look like decimals become
    #   Floats and strings that look like integers become Integers; every
    #   other value is kept as is
    def to_numeric
      converted = map do |val|
        case val
        when /^[+-]?\d+\.\d+$/
          val.to_f
        when /^[+-]?\d+$/
          val.to_i
        else
          val
        end
      end
      self.class.new(converted)
    end

    # Pairs the values of +keys+ with this row's fields, by position.
    #
    # @param keys [Hash, nil] header hash as produced by +headers+
    # @return [Hash, Row] a Hash keyed by the header values, or the row
    #   itself when +keys+ is nil
    def merge(keys)
      return self unless keys

      keys.values.zip(self).to_h
    end
  end
end

data/lib/nebulous/version.rb ADDED

@@ -0,0 +1,3 @@
# Top-level namespace of the nebulous gem.
module Nebulous
  # Release number of the gem; frozen so it cannot be modified in place.
  VERSION = '0.0.2'.freeze
end

data/nebulous.gemspec ADDED

@@ -0,0 +1,27 @@
# Make lib/ loadable so the version constant can be read without the gem
# being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'nebulous/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'nebulous'
  spec.version     = Nebulous::VERSION
  spec.authors     = ['Zach Graves']
  spec.email       = ['zagraves@gmail.com']
  spec.summary     = 'Read CSV files with substantially less murderous rage!'
  spec.description = spec.summary
  spec.homepage    = 'https://github.com/zachgraves/nebulous'
  spec.license     = 'MIT'

  # Packaged files: everything tracked by git, NUL-separated so unusual
  # file names survive.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(spec)/})
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'cocaine', '~> 0.5'
  spec.add_dependency 'activesupport'

  # Development-only dependencies
  spec.add_development_dependency 'bundler', '~> 1.6'
  %w[rake rspec byebug].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end

data/spec/nebulous/chunk_spec.rb ADDED

@@ -0,0 +1,6 @@
require 'spec_helper'

# Placeholder spec: the example group exists but contains no examples yet.
describe Nebulous::Chunk do
  context 'around batches of csv data' do
    # no examples written yet
  end
end

data/spec/nebulous/delimiter_detector_spec.rb ADDED

@@ -0,0 +1,161 @@
require 'spec_helper'

# Exercises Nebulous::DelimiterDetector against the fixture files under
# spec/support/assets. Nested contexts swap the fixture by overriding +path+.
describe Nebulous::DelimiterDetector do
  context 'around detecting csv delimiters' do
    subject { Nebulous::DelimiterDetector }

    # Default fixture: comma separated, CRLF terminated.
    let(:path) { './spec/support/assets/crlf-comma-delimited.csv' }
    let(:detector) { subject.new(path) }

    context '#initialize' do
      it 'can be initialized' do
        expect(detector).to be_instance_of subject
      end

      it 'assigns specified file path' do
        expect(detector.path).to eq path
      end
    end

    context '#detect' do
      context 'with CRLF and comma delimiters' do
        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ',', row_sep: "\n")
        end
      end

      context 'with CRLF and tab delimiters' do
        let(:path) { './spec/support/assets/crlf-tab-delimited.tsv' }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: "\t", row_sep: "\n")
        end
      end

      context 'with CR and comma delimiters' do
        let(:path) { './spec/support/assets/cr-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ',', row_sep: "\r")
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { './spec/support/assets/crlf-semicolon-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ';', row_sep: "\n")
        end
      end

      context 'with pipe delimiters' do
        let(:path) { './spec/support/assets/crlf-pipe-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: '|', row_sep: "\n")
        end
      end

      context 'with custom delimiters' do
        # The candidate list is replaced through the options hash.
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) do
          { column_delimiters: ["\n", '$', "\t"] }
        end
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: '$', row_sep: "\n")
        end
      end
    end

    context '#detect_column_delimiter' do
      context 'with comma delimiters' do
        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq ','
        end
      end

      context 'with tab delimiters' do
        let(:path) { './spec/support/assets/crlf-tab-delimited.tsv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq "\t"
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { './spec/support/assets/crlf-semicolon-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq ';'
        end
      end

      context 'with pipe delimiters' do
        let(:path) { './spec/support/assets/crlf-pipe-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '|'
        end
      end

      context 'with custom delimiters' do
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) do
          { column_delimiters: ["\n", '$', "\t"] }
        end
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '$'
        end
      end
    end

    context '#detect_line_delimiter' do
      context 'with CRLF terminators' do
        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\n"
        end
      end

      context 'with CR terminators' do
        let(:path) { './spec/support/assets/cr-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\r"
        end
      end

      context 'with CR, LF terminators' do
        let(:path) { './spec/support/assets/cr-lf-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\r"
        end
      end
    end

    # The remaining groups reach private helpers through +send+.
    context '#encoding' do
      it 'defaults to UTF-8' do
        expect(detector.send(:encoding)).to eq 'UTF-8'
      end
    end

    context '#counts' do
      it 'returns an array initialized at 0 for each column delimiter' do
        expect(detector.send(:counts)).to eq [0, 0, 0, 0]
      end
    end

    context '#readline' do
      it 'returns first line from provided file' do
        first_line = detector.send(:readline)
        expect(first_line).to eq "First name,Last name,From,Access,Qty\n"
      end
    end
  end
end