RubyGems - data_loader - Versions diffs - 0.2.0 → 0.2.4 - Mend

data_loader 0.2.0 → 0.2.4

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (8)

data/.gitignore +2 -1
data/README.markdown +5 -5
data/data_loader.gemspec +3 -2
data/lib/data_loader/inspector.rb +53 -11
data/lib/data_loader/loader.rb +18 -9
data/lib/data_loader/migrator.rb +22 -7
data/lib/data_loader/version.rb +1 -1
metadata +32 -17

data/.gitignore CHANGED

@@ -4,4 +4,5 @@ Gemfile.lock
 pkg/*
 .rvmrc
 bin/*
-*.tmproj
+*.tmproj
+*.bbprojectd

data/README.markdown CHANGED

@@ -7,8 +7,9 @@ Features:
 * Uses MySQL LOAD DATA to efficiently load very large files
 * Fastercsv is used to inspect the first few rows and choose datatypes
+* Datatypes can be overridden (types are :text, :string, :datetime, :integer)
 * Converts header row in to nice ruby-esque column names
-* Builds a schema using ActiveRecord 2.x
+* Builds a schema using ActiveRecord
 * If table names are unspecified, they will be derived from the file name
 * Will prefix table names to avoid collisions (it overwrites existing tables)
 * Can run under a different connection, as defined in your database.yml
@@ -24,20 +25,19 @@ Features:
       config.connection = :development
       config.separator = ','
       config.default_ext = 'csv'
+      config.use_local = true
     end
     # Load data
-    loader.load 'my_csv_file', :my_table
+    loader.load 'my_csv_file', :my_table, :cancel_at => :datetime
 ### TODO
 * A task to clean up all these temporary tables when we're done.
-* Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
 * Broader support for Rubies, Databases, and ORM/tools for building the schema.
 * More options for the log file (txt vs textile, filename).
-* Better tests!
+* Better tests!

data/data_loader.gemspec CHANGED

@@ -12,10 +12,11 @@ Gem::Specification.new do |s|
   s.homepage    = "https://github.com/nathany/data_loader"
   s.summary     = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
   s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
   s.add_dependency('fastercsv', '~> 1.5.4')
-  s.add_dependency('activerecord', '~> 2.3')
+  s.add_dependency('activerecord', '>= 2.0.0')
   s.add_development_dependency('rspec', '~> 1.3')
+  s.add_development_dependency('rake', '~> 0.9.2')
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")

data/lib/data_loader/inspector.rb CHANGED

@@ -1,11 +1,20 @@
 require 'fastercsv'
 require 'active_support'
+# FasterCSV will auto-detect the line separator, which we'd like to pass to MySQL
+class FasterCSV
+  attr_reader :row_sep
+end
 module DataLoader
   class Inspector
+    class << self
+      attr_reader :row_sep      # set after inspect_file
+    end
     # read a csv and return the columns and types in an ordered array
-    def self.inspect_file(file, separator = ',', inspect_rows = 10)
+    def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
       fields = nil
       FasterCSV.open(file,
         :col_sep => separator,
@@ -13,31 +22,64 @@ module DataLoader
         :headers => true,
         :header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
         :skip_blanks => true) do |csv|
-          fields = scan_rows(csv, inspect_rows)
+          @row_sep = csv.row_sep
+          fields = scan_rows(csv, inspect_rows, hints)
       end
       fields
     end
     # scan a few rows to determine data types
-    def self.scan_rows(csv, inspect_rows)
+    def self.scan_rows(csv, inspect_rows, hints = {})
       first_row = nil
-      columns = {}  # unordered hash containing date types for each header
+      columns = {}  # unordered hash containing data types for each header
       1.upto(inspect_rows) do
-        row = csv.gets
-        break unless row
-        row.each do |header, value|
-          columns[header] = promote_type(columns[header], dbtype(value))
+        begin
+          row = csv.gets
+          break unless row
+          row.each do |header, value|
+            columns[header] = promote_type(columns[header], dbtype(value))
+          end
+          first_row ||= row # save for later
+        rescue FasterCSV::MalformedCSVError => boom
+          # Don't care about the error but let's retry, since fastercsv will skip this line
+          retry
         end
-        first_row ||= row # save for later
       end
       # form an ordered array based on the first row read:
       fields = []
       first_row.each do |header, value|
-        data_type = columns[header] || :string  # default to :string if everything was nil
+        data_type = columns[header]
         fields << {:name => header, :type => data_type}
       end
+      # validate hints
+      hints.stringify_keys!
+      invalid_columns = hints.keys - fields.map {|f| f[:name]}
+      puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
+      invalid_types = hints.values - [:text, :string, :datetime, :integer]
+      abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?
+      fields.each do |field|
+        name, field_type = field[:name], field[:type]
+        # override columns with hints
+        if hints.has_key?(name)
+          hint_type = hints[name].to_sym
+          if field_type.nil?
+            puts "Note: undertermined type for #{name} hinted as #{hint_type}."
+          elsif hint_type != field_type
+            puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
+          end
+          field[:type] = hint_type
+        end
+        # default to :string if everything was nil (and no hint)
+        if field[:type].nil?
+          puts "Warning: type could not be determined for #{name}, defaulting to string."
+          field[:type] = :string
+        end
+      end
       fields
     end
@@ -79,4 +121,4 @@ module DataLoader
     end
   end
-end
+end

data/lib/data_loader/loader.rb CHANGED

@@ -13,6 +13,9 @@
 #     how many rows to scan the CSV file to determine the data types
 #   connection
 #     a connection name from database.yml to run it under (e.g. :production)
+#   use_local
+#     when true, use LOAD DATA LOCAL INFILE with MySQL if server can't access file
+#     requires MySQL to be compiled with --enable-local-infile
 #   default_ext
 #     extension to append if no file extension is specified
 #   separator
@@ -23,38 +26,44 @@
 module DataLoader
   class Loader
-    attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log
+    attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log, :use_local
     def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
       @folder, @separator = folder, separator
       @table_prefix, @connection = table_prefix, connection
       @default_ext = 'csv'
       @inspect_rows = 10
+      @use_local = false  # with MySQL INFILE
       @log = true
       yield(self) if block_given?
       @logfile = File.expand_path(File.join(@folder, 'data_loader.textile'))
       puts @logfile
     end
-    def load(filename, table = nil)
+    # load
+    # - filename - name of file to load (in folder and default_ext)
+    # - table - table to load file into (with table_prefix), derives from filename by default
+    # - hints - hash of column name => data type (one of :text, :string, :datetime, :integer)
+    def load(filename, table = nil, hints = {})
       filename = [filename, default_ext].join('.') if File.extname(filename).empty?
       full_file = File.expand_path(File.join(@folder, filename))
       table = Migrator.derive_table_name(filename) if table.nil?
       table = [@table_prefix, table].join('_') unless @table_prefix.blank?
-      columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
+      columns = Inspector.inspect_file(full_file, @separator, @inspect_rows, hints)
+      row_sep = Inspector.row_sep
       log_columns(table, columns)
-      Migrator.migrate(full_file, columns, table, @separator, @connection)
+      Migrator.migrate(full_file, columns, table, @separator, @connection, @use_local, row_sep)
       table
     end
     def log(text)
       return unless @log
       File.open(@logfile, 'a') do |file|
         file << text
       end
     end
     def clear_log
       FileUtils.remove(@logfile) if File.exist?(@logfile)
     end
@@ -63,7 +72,7 @@ module DataLoader
     def log_columns(table, columns)
       return unless @log
       File.open(@logfile, 'a') do |file|
         file << "\ntable{width:80%}.\n|_\\2. #{table} |\n"   # table header (textile)
         columns.each_with_index do |column, index|
@@ -75,7 +84,7 @@ module DataLoader
         end
       end
     end
   end
 end

data/lib/data_loader/migrator.rb CHANGED

@@ -1,11 +1,12 @@
 module DataLoader
   class Migrator
-    def self.migrate(file, columns, table, separator = ',', conn = :root)
+    def self.migrate(file, columns, table, separator = ',', conn = :root, local = false, row_sep = "\r\n")
       with_connection(conn) do
         create_schema(table, columns)
         puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
-        load_data(file, table, separator)
+        load_data(file, table, local, separator, row_sep)
+        nullify_dates(table, columns)
       end
     end
@@ -20,12 +21,26 @@ module DataLoader
       end
     end
+    # empty strings import as 0000-00-00 00:00:00, convert to nil
+    def self.nullify_dates(table_name, data_struct)
+      date_columns = data_struct.map {|column| column[:name] if column[:type] == :datetime }.compact!
+      date_columns.each do |column|
+        sql = <<-SQL
+          UPDATE #{table_name}
+          SET #{column} = NULL
+          WHERE #{column} = 0
+        SQL
+        ActiveRecord::Base.connection.execute(sql)
+      end
+    end
     # uses MySQL LOAD DATA to import the whole file, ignoring the header line
-    def self.load_data(file, table_name, separator = ',')
+    def self.load_data(file, table_name, local, separator = ',', row_sep = "\r\n")
+      local_txt = local ? "LOCAL" : ''
       sql = <<-SQL
-        LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name.to_s}
+        LOAD DATA #{local_txt} INFILE '#{file}' INTO TABLE #{table_name.to_s}
           FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
-          LINES TERMINATED BY '\r\n'
+          LINES TERMINATED BY '#{row_sep}'
           IGNORE 1 LINES;
       SQL
       ActiveRecord::Base.connection.execute(sql)
@@ -36,7 +51,7 @@ module DataLoader
       if Rails.env.development?
         yield
       else
-        ActiveRecord::Base.establish_connection(conn)
+        ActiveRecord::Base.establish_connection(conn)
         yield
         ActiveRecord::Base.establish_connection(RAILS_ENV)
       end
@@ -48,4 +63,4 @@ module DataLoader
       name.underscore.sub(/[0-9_]*$/, '')      # remove trailing numbers
     end
   end
-end
+end

data/lib/data_loader/version.rb CHANGED

@@ -1,3 +1,3 @@
 module DataLoader
-  VERSION = "0.2.0"
+  VERSION = "0.2.4"
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: data_loader
 version: !ruby/object:Gem::Version
-  hash: 23
+  hash: 31
   prerelease:
   segments:
   - 0
   - 2
-  - 0
-  version: 0.2.0
+  - 4
+  version: 0.2.4
 platform: ruby
 authors:
 - Nathan Youngman
@@ -15,11 +15,10 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-05 00:00:00 -06:00
-default_executable:
+date: 2011-10-22 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: fastercsv
+  type: :runtime
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
@@ -32,25 +31,26 @@ dependencies:
         - 5
         - 4
         version: 1.5.4
-  type: :runtime
   version_requirements: *id001
+  name: fastercsv
 - !ruby/object:Gem::Dependency
-  name: activerecord
+  type: :runtime
   prerelease: false
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - ~>
+    - - ">="
       - !ruby/object:Gem::Version
-        hash: 5
+        hash: 15
         segments:
         - 2
-        - 3
-        version: "2.3"
-  type: :runtime
+        - 0
+        - 0
+        version: 2.0.0
   version_requirements: *id002
+  name: activerecord
 - !ruby/object:Gem::Dependency
-  name: rspec
+  type: :development
   prerelease: false
   requirement: &id003 !ruby/object:Gem::Requirement
     none: false
@@ -62,8 +62,24 @@ dependencies:
         - 1
         - 3
         version: "1.3"
-  type: :development
   version_requirements: *id003
+  name: rspec
+- !ruby/object:Gem::Dependency
+  type: :development
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 63
+        segments:
+        - 0
+        - 9
+        - 2
+        version: 0.9.2
+  version_requirements: *id004
+  name: rake
 description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
 email:
 - git@nathany.com
@@ -87,7 +103,6 @@ files:
 - lib/data_loader/version.rb
 - spec/lib/data_loader/inspector_spec.rb
 - spec/spec_helper.rb
-has_rdoc: true
 homepage: https://github.com/nathany/data_loader
 licenses: []
@@ -119,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.6.2
+rubygems_version: 1.8.11
 signing_key:
 specification_version: 3
 summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.