RubyGems - table_importer - Versions diffs - 0.0.1 - Mend

table_importer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +7 -0
data/Gemfile +4 -0
data/LICENSE +21 -0
data/LICENSE.txt +22 -0
data/README.md +9 -0
data/Rakefile +2 -0
data/lib/table_importer/copy_and_paste.rb +118 -0
data/lib/table_importer/csv.rb +146 -0
data/lib/table_importer/excel.rb +92 -0
data/lib/table_importer/exceptions.rb +29 -0
data/lib/table_importer/source.rb +149 -0
data/lib/table_importer/version.rb +3 -0
data/lib/table_importer.rb +3 -0
data/spec/copy_and_paste_spec.rb +134 -0
data/spec/csv_spec.rb +135 -0
data/spec/excel_spec.rb +139 -0
data/spec/files/csv/10-1359651839-google_(1).csv +0 -0
data/spec/files/csv/11-1359651879-contacts_(1) (1).csv +158 -0
data/spec/files/csv/11-1359651879-contacts_(1).csv +158 -0
data/spec/files/csv/6-1359649307-contacts (1).csv +157 -0
data/spec/files/csv/6-1359649307-contacts (2).csv +158 -0
data/spec/files/csv/6-1359649307-contacts (3).csv +158 -0
data/spec/files/csv/6-1359649307-contacts.csv +158 -0
data/spec/files/csv/7-1359650836-6-1359649307-contacts.csv +158 -0
data/spec/files/csv/8-1359651745-contacts.csv +158 -0
data/spec/files/csv/9-1359651826-google_(1).csv +0 -0
data/spec/files/csv/bad_headers_2.csv +45 -0
data/spec/files/csv/csv_headers.csv +55 -0
data/spec/files/csv/csv_no_headers.csv +5 -0
data/spec/files/csv/edge_cases.csv +16 -0
data/spec/files/csv/hong_kong.csv +1150 -0
data/spec/files/csv/hong_kong_no_headers.csv +9 -0
data/spec/files/csv/hong_kong_small.csv +10 -0
data/spec/files/csv/mexico2013_pressdoc.csv +3248 -0
data/spec/files/csv/no_content.csv +22 -0
data/spec/files/csv/semicolon.csv +214 -0
data/spec/files/csv/with_headers.csv +10 -0
data/spec/files/csv/with_headers_large.csv +10760 -0
data/spec/files/csv/without_headers.csv +9 -0
data/spec/files/excel/edge_cases.xls +0 -0
data/spec/files/excel/no_content.xls +0 -0
data/spec/files/excel/no_content.xlsx +0 -0
data/spec/files/excel/with_headers.xls +0 -0
data/spec/files/excel/with_headers_large.xls +0 -0
data/spec/files/excel/with_headers_large.xlsx +0 -0
data/spec/files/excel/without_headers.xls +0 -0
data/spec/spec_helper.rb +20 -0
data/table_importer.gemspec +32 -0
data/tasks/rspec.rake +4 -0
metadata +254 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: a17aaa63e9aa58fb38435c0b2a205bbbfa63af1e
+  data.tar.gz: bd75fd7ac865182feb9d0ff6ab7c0d4e18af35a2
+SHA512:
+  metadata.gz: 19aca54d1033e6bb488482f5a8506b902ef6f73bfd80f1539102710b87ef506fb319ab9252daa271e37635ccf26127c0a4a3c1336ec8f5b211f32357c7a5e6fc
+  data.tar.gz: 27b5e4f763ee14972c3fddb2f69ea84d111a097069ab5605bf50df40f55120480bef0ecbcc42147880ffd073470e4d9fbec06d9d4129b87dbb76b9e06efbe918

data/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/.travis.yml ADDED Viewed

@@ -0,0 +1,7 @@
+language: ruby
+rvm:
+  - "1.9.3"
+  - "2.0.0"
+  - "2.1.1"
+  - "2.1.2"
+  - jruby-19mode # JRuby in 1.9 mode

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in table_importer.gemspec
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2014 pr.co
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2014 TODO: Write your name
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,9 @@
+[![Build Status](https://travis-ci.org/pressdoc/table_importer.svg?branch=master)](https://travis-ci.org/pressdoc/table_importer)[![Coverage Status](https://coveralls.io/repos/pressdoc/table_importer/badge.png?branch=master)](https://coveralls.io/r/pressdoc/table_importer?branch=master)
+==============
+Table Importer
+==============
+Given a file (or a string) containing a table, along with options, it will return the table's values as hashes. Great for importing poorly formatted CSV files.
+Only works for Ruby versions >= 1.9.3.

data/Rakefile ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require "bundler/gem_tasks"
2	+ Dir.glob('tasks/**/*.rake').each(&method(:import)) # import every .rake file found under tasks/

data/lib/table_importer/copy_and_paste.rb ADDED Viewed

@@ -0,0 +1,118 @@
+module TableImporter
+  class CopyAndPaste < Source
+    def initialize(data)
+      @data = assign_data(data[:content])
+      @column_separator, @record_separator = assign_separators(data[:col_sep], data[:rec_sep])
+      @headers, @headers_present = assign_headers(data[:headers], data[:headers_present])
+      @compulsory_headers = data[:compulsory_headers]
+      @delete_empty_columns = @data.length < 50000
+    end
+    def assign_data(content)
+      raise Exceptions::EmptyFileImportError.new if content.blank? || content[0..100].gsub(/[^A-Za-z0-9]/, '').blank?
+      content.gsub!(/\r\n|\r/, "\n")
+      return content
+    end
+    def assign_separators(col_sep, rec_sep)
+      col_sep = SEPARATORS[col_sep.to_sym] if !col_sep.nil?
+      rec_sep = SEPARATORS[rec_sep.to_sym] if !rec_sep.nil?
+      col_sep, rec_sep = data_conforms_pattern(col_sep, rec_sep)
+    end
+    def data_conforms_pattern(col_sep, rec_sep)
+      # Check to see if data is of bcc style
+      first_item = @data.split(",").first
+      if first_item.present? && first_item.match(/\S@\S/)
+        if first_item.match(/<(\S+@\S+)/)
+          rec_sep ||= ">, "
+          col_sep ||= " <"
+        end
+      end
+      return col_sep, rec_sep
+    end
+    def assign_headers(headers, headers_present)
+      headers = headers_present ? get_first_line : get_headers if headers.blank?
+      return headers, headers_present
+    end
+    def get_first_line
+      @data.split(get_record_separator).first.split(get_column_separator).map(&:to_sym)
+    end
+    def get_type
+      "copy_and_paste"
+    end
+    def get_headers
+      return @headers if @headers.present?
+      default_headers(100)
+    end
+    def get_preview_lines(start_point = @headers_present ? 1 : 0, end_point = 10)
+      begin
+        lines = clean_chunks([get_lines(start_point, end_point)], @compulsory_headers, @delete_empty_columns)[0][:lines]
+        if lines.first.nil?
+          get_preview_lines(start_point+10, end_point+10)
+        else
+          lines[0..7]
+        end
+      rescue StandardError
+        raise Exceptions::EmptyStringImportError.new
+      end
+    end
+    def get_lines(start_point, number_of_lines)
+      number_of_lines = number_of_lines - 1 if number_of_lines != -1 # -1 means return all lines.
+      mapped_lines = []
+      get_column_separator
+      @data.split(get_record_separator).each do |line|
+        split_line = line.split(@column_separator)
+        split_line = remove_whitespace(split_line)
+        mapped_lines << Hash[@headers.zip split_line]
+      end
+      mapped_lines[start_point..(start_point+number_of_lines)]
+    end
+    def remove_whitespace(column)
+      column.each do |column_item|
+        column_item.strip!
+      end
+      column
+    end
+    def get_chunks(chunk_size)
+      @headers = convert_headers(get_first_line, @headers, @headers_present)
+      lines = get_lines(0, -1).in_groups_of(chunk_size, false)
+      clean_chunks(lines, @compulsory_headers)
+    end
+    def convert_headers(provided_headers, mapped_headers, headers_present)
+      new_headers = headers_present ? provided_headers : default_headers
+      new_headers = default_headers(new_headers.count)
+      mapped_headers.each do |key, value|
+        if value.to_i.to_s == value
+          new_headers[value.to_i] = key.to_sym
+        end
+      end
+      new_headers
+    end
+    def get_column_separator(first_line = @data)
+      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
+      separators = get_sep_count(first_line)
+      separators.reject!{ |sep| sep.keys[0] == @record_separator} if @record_separator != nil
+      @column_separator = sort_separators(separators)
+    end
+    def get_record_separator(first_line = @data)
+      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
+      separators = get_sep_count(first_line)
+      separators.reject!{ |sep| sep.keys[0] == get_column_separator}
+      @record_separator = sort_separators(separators)
+    end
+  end
+end

data/lib/table_importer/csv.rb ADDED Viewed

@@ -0,0 +1,146 @@
+# encoding: UTF-8
+module TableImporter
+  class CSV < Source
+    def initialize(data)
+      @headers_present = data[:headers_present] # user has indicated headers are provided
+      @headers = data[:headers]
+      @column_separator = SEPARATORS[data[:column_separator].to_sym] if !data[:column_separator].nil?
+      @record_separator = !data[:record_separator].nil? && data[:record_separator].length > 0 ? SEPARATORS[data[:record_separator].to_sym] : "\n"
+      @compulsory_headers = data[:compulsory_headers]
+      @file = data[:content]
+      @delete_empty_columns = File.size(@file) < 100000
+      begin
+        first_line = get_first_line
+        if first_line == 0
+          raise ArgumentError
+        end
+        get_column_separator(first_line)
+        @preview_lines = file_has_no_content
+        @headers = @headers_present ? first_line.split(@column_separator) : default_headers(100) if @headers.blank?
+      rescue ArgumentError
+        @file = clean_file(@file)
+        retry
+      end
+    end
+    def get_first_line
+      begin
+        SmarterCSV.process(@file.path, default_options({:col_sep => @column_separator.present? ? @column_separator : "\n", :row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 8})) do |chunk|
+          if @headers_present
+            return chunk.first.keys[0].to_s
+          else
+            return chunk.first.values[0].to_s
+          end
+        end
+      rescue EOFError
+        raise Exceptions::EmptyFileImportError.new
+      end
+    end
+    def file_has_no_content
+      begin
+        lines = get_preview_lines
+        if lines.blank? || lines == 0
+          raise Exceptions::EmptyFileImportError.new
+        else
+          return lines
+        end
+      rescue NoMethodError
+        raise Exceptions::EmptyFileImportError.new
+      end
+    end
+    def get_type
+      "csv"
+    end
+    def get_headers
+      @headers
+    end
+    def get_column_separator(first_line = get_first_line)
+      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
+      separators = get_sep_count(first_line)
+      separators.reject!{ |sep| sep.keys[0] == @record_separator} if @record_separator != nil
+      @column_separator = sort_separators(separators)
+    end
+    def get_record_separator(first_line = get_first_line)
+      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
+      separators = get_sep_count(first_line)
+      separators.reject!{ |sep| sep.keys[0] == get_column_separator}
+      @record_separator = sort_separators(separators)
+    end
+    def get_preview_lines
+      begin
+        return clean_chunks([@preview_lines], @compulsory_headers, @delete_empty_columns)[0].symbolize_keys[:lines] if !@preview_lines.blank?
+        if @delete_empty_columns
+          chunks = SmarterCSV.process(@file.path, default_options({:row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 50}))
+          return clean_chunks(chunks, @compulsory_headers, true)[0].symbolize_keys[:lines][0..7]
+        end
+        SmarterCSV.process(@file.path, default_options({:row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 8})) do |chunk|
+          return clean_chunks([chunk], @compulsory_headers)[0].symbolize_keys[:lines][0..7]
+        end
+      rescue SmarterCSV::HeaderSizeMismatch
+        raise Exceptions::HeaderMismatchError.new
+      end
+    end
+    # this is horrendously slow
+    def get_lines(start, number_of_lines)
+      get_chunks(50)[start..(start + number_of_lines)]
+    end
+    def get_chunks(chunk_size)
+      begin
+        chunks = []
+        if @headers_present
+          key_mapping = convert_headers(SmarterCSV.process(@file.path, default_options).first.keys, @headers, @headers_present).delete_if{ |key, value| value.blank?}
+          chunks = SmarterCSV.process(@file.path, default_options({:chunk_size => chunk_size, :key_mapping => key_mapping, :remove_unmapped_keys => true, :user_provided_headers => nil}))
+        else
+          user_provided_headers = convert_headers(SmarterCSV.process(@file.path, default_options).first.keys, @headers, @headers_present).values
+          chunks = SmarterCSV.process(@file.path, default_options({:chunk_size => chunk_size, :user_provided_headers => user_provided_headers, :remove_empty_values => true}))
+        end
+        clean_chunks(chunks, @compulsory_headers, @delete_empty_columns)
+      rescue ArgumentError
+        @file = clean_file(@file)
+        retry
+      end
+    end
+    def convert_headers(provided_headers, mapped_headers, headers_present)
+      new_headers = []
+      old_headers = headers_present ? provided_headers : default_headers
+      old_headers.each_with_index do |key, index|
+        key_to_add = "column_#{index}".to_sym
+        mapped_headers.each do |new_key, value|
+          if value.to_s == index.to_s
+            key_to_add = new_key
+          end
+        end
+        new_headers << key_to_add
+      end
+      Hash[old_headers.zip(new_headers)]
+    end
+    # fix quote_char
+    # bit of a hack here to provide the correct number of default headers to the user (rather than just 100)
+    def default_options(options = {})
+      {:col_sep => @column_separator, :row_sep => @record_separator, :quote_char => "‱", :remove_empty_values => false,
+        :verbose => false, :headers_in_file => @headers_present, :convert_values_to_numeric => false,
+        :user_provided_headers => @headers_present ? (@headers == nil || @headers == {} ? nil : @headers) : default_headers(100)}.merge(options)
+    end
+    def clean_file(file)
+      contents = file.read
+      import = Tempfile.new(["import", ".xls"], :encoding => "UTF-8")
+      import.write(contents.force_encoding('UTF-8').encode('UTF-16', :invalid => :replace, :replace => '?').encode('UTF-8').gsub!(/\r\n|\r/, "\n"))
+      import.close
+      return import
+    end
+  end
+end

data/lib/table_importer/excel.rb ADDED Viewed

@@ -0,0 +1,92 @@
+module TableImporter
+  class Excel < Source
+    def initialize(data)
+      begin
+        @type = File.extname(data[:content]) == ".xls" ? "xls" : "xlsx"
+        @file_path = data[:content].path
+        @headers_present = data[:headers_present]
+        @file = get_file
+        @compulsory_headers = data[:compulsory_headers]
+        @delete_empty_columns = (File.size(@file_path) < 100000)
+        @mapping = !data[:user_headers].blank? ? data[:user_headers] : data[:headers]
+        raise Exceptions::EmptyFileImportError.new if !@file.first_row
+        if !data[:headers].nil?
+          @headers = data[:headers]
+        else
+          @headers = @headers_present ? @file.row(1).map { |header| header.to_sym } : default_headers
+        end
+      rescue NoMethodError
+        raise Exceptions::HeaderMismatchError.new
+      end
+    end
+    def get_headers
+      @headers
+    end
+    def get_file
+      begin
+        if @type == "xls"
+          Roo::Excel.new(@file_path).sheet(0)
+        elsif @type == "xlsx"
+          Roo::Excelx.new(@file_path).sheet(0)
+        end
+      rescue TypeError
+        raise Exceptions::IncorrectFileError.new
+      end
+    end
+    def get_type
+      "xls"
+    end
+    def get_preview_lines(start_point = 0, end_point = 10)
+      begin
+        if clean_chunks([get_lines(start_point, end_point)], @compulsory_headers)[0][:lines].first.nil?
+          get_preview_lines(start_point+10, end_point+10)
+        else
+          @headers = @mapping.present? ? convert_headers : @headers
+          clean_chunks([get_lines(start_point+1, end_point+1)], @compulsory_headers, @delete_empty_columns)[0][:lines][0..7]
+        end
+      rescue SystemStackError
+        raise Exceptions::EmptyFileImportError.new
+      end
+    end
+    def get_lines(start, number_of_lines)
+      @last_row ||= @file.last_row
+      finish = [@last_row, start + number_of_lines].min
+      mapped_lines = []
+      (start...finish).each do |row_number|
+        mapped_lines << Hash[@headers.zip(@file.row(row_number))]
+      end
+      mapped_lines
+    end
+    def convert_headers
+      new_headers = @headers_present ? @file.row(1) : default_headers
+      new_headers = default_headers(new_headers.count)
+      @mapping.each do |key, value|
+        if value.to_i.to_s == value
+          new_headers[value.to_i] = key.to_sym
+        end
+      end
+      new_headers
+    end
+    def get_chunks(chunk_size)
+      @headers = convert_headers
+      @last_row ||= @file.last_row
+      chunks = []
+      start_point = @headers_present ? 2 : 1
+      while chunks.count <= @last_row/chunk_size
+        chunks << get_lines(start_point, chunk_size)
+        start_point += chunk_size
+      end
+      chunks.last << Hash[@headers.zip(@file.row(@last_row))]
+      clean_chunks(chunks, @compulsory_headers)
+    end
+  end
+end