RubyGems - data_loader - Versions diffs - 0.1.0 - Mend

data_loader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/.gitignore +6 -0
data/Gemfile +4 -0
data/LICENSE +19 -0
data/README.markdown +42 -0
data/Rakefile +2 -0
data/data_loader.gemspec +24 -0
data/lib/data_loader.rb +3 -0
data/lib/data_loader/inspector.rb +82 -0
data/lib/data_loader/loader.rb +45 -0
data/lib/data_loader/migrator.rb +51 -0
data/lib/data_loader/version.rb +3 -0
data/spec/lib/data_loader/inspector_spec.rb +82 -0
data/spec/spec_helper.rb +1 -0
metadata +128 -0

data/.gitignore ADDED

@@ -0,0 +1,6 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*
+.rvmrc
+bin/*

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in data_loader.gemspec
+gemspec

data/LICENSE ADDED

@@ -0,0 +1,19 @@
+Copyright (c) 2011 Nathan Youngman
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.markdown ADDED

@@ -0,0 +1,42 @@
+## Data Loader
+Data Loader is a tool to load CSV files into a MySQL database. It was designed
+to import raw data into tables that could then be manipulated with SQL.
+Features:
+* Uses MySQL LOAD DATA to efficiently load very large files
+* FasterCSV is used to inspect the first few rows and choose datatypes
+* Converts the header row into nice Ruby-esque column names
+* Builds a schema using ActiveRecord 2.x
+* If table names are unspecified, they will be derived from the file name
+* Will prefix table names to avoid collisions (it overwrites existing tables)
+* Can run under a different connection, as defined in your database.yml
+### Usage
+    # Configure (everything has defaults, see loader.rb)
+    loader = DataLoader::Loader.new do |config|
+      config.table_prefix = :import
+      config.folder = 'path/to/csv/files/'
+      config.inspect_rows = 10
+      config.connection = :development
+      config.separator = ','
+      config.default_ext = 'csv'
+    end
+    # Load data
+    loader.load 'my_csv_file', :my_table
+### TODO
+* Write the column structure for each table to a text file. This file can be stored in Git, so that if the CSV files change, the diff will make it obvious what changed.
+* A task to clean up all these temporary tables when we're done.
+* A post-load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads in empty strings (integers would remain 0).
+* Broader support for Rubies, Databases, and ORM/tools for building the schema.
+* Better tests!

data/Rakefile ADDED

	@@ -0,0 +1,2 @@
1	+ require 'bundler'
2	+ Bundler::GemHelper.install_tasks

data/data_loader.gemspec ADDED

@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "data_loader/version"
+Gem::Specification.new do |s|
+  s.name        = "data_loader"
+  s.version     = DataLoader::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.required_ruby_version = '>= 1.8.7'
+  s.authors     = ["Nathan Youngman"]
+  s.email       = ["git@nathany.com"]
+  s.homepage    = "https://github.com/nathany/data_loader"
+  s.summary     = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
+  s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
+  s.add_dependency('fastercsv', '~> 1.5.4')
+  s.add_dependency('activerecord', '~> 2.3')
+  s.add_development_dependency('rspec', '~> 1.3')
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+end

data/lib/data_loader.rb ADDED

@@ -0,0 +1,3 @@
+require 'data_loader/inspector'
+require 'data_loader/migrator'
+require 'data_loader/loader'  # <= start here

data/lib/data_loader/inspector.rb ADDED

@@ -0,0 +1,82 @@
+require 'fastercsv'
+require 'active_support'
+module DataLoader
+  class Inspector
+    # reads the CSV at `file` and returns an ordered array of {:name, :type} hashes, one per column
+    def self.inspect_file(file, separator = ',', inspect_rows = 10)
+      fields = nil
+      FasterCSV.open(file,
+        :col_sep => separator,
+        :converters => [:date_time, :integer],    # :integer, :float, :date, :date_time
+        :headers => true,
+        :header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
+        :skip_blanks => true) do |csv|
+          fields = scan_rows(csv, inspect_rows)
+      end
+      fields
+    end
+    # scans up to inspect_rows rows to determine data types; NOTE(review): if no row is read, first_row stays nil and first_row.each below raises
+    def self.scan_rows(csv, inspect_rows)
+      first_row = nil
+      columns = {}  # unordered hash containing data types for each header
+      1.upto(inspect_rows) do
+        row = csv.gets
+        break unless row
+        row.each do |header, value|
+          columns[header] = promote_type(columns[header], dbtype(value))
+        end
+        first_row ||= row # keep the first row; its header order defines the column order below
+      end
+      # form an ordered array based on the first row read:
+      fields = []
+      first_row.each do |header, value|
+        data_type = columns[header] || :string  # default to :string if everything was nil
+        fields << {:name => header, :type => data_type}
+      end
+      fields
+    end
+    # maps a parsed value to :integer, :datetime, :string or :text (strings over 255 chars); nil for nil/blank; raises 'Unknown type' otherwise
+    def self.dbtype(value)
+      if value.is_a?(Fixnum)
+        :integer
+      elsif value.is_a?(DateTime)
+        :datetime
+      elsif value.is_a?(String)
+        if value.blank?
+          nil
+        elsif value.length <= 255
+          :string
+        else
+          :text
+        end
+      elsif value.nil?
+        nil
+      else
+        raise 'Unknown type'
+      end
+    end
+    # given any number of datatypes (nils are ignored), choose one that fits them all; returns nil if none remain
+    def self.promote_type(*types)
+      types.compact!
+      if types.empty?
+        nil
+      elsif (types - [:text, :string, :datetime, :integer]).length > 0 # unknown types
+        raise 'Unknown type'
+      elsif Set.new(types).length == 1  # one distinct type; NOTE(review): no require 'set' in this file - presumably loaded by active_support, confirm
+        types.first
+      elsif types.include?(:text)
+        :text
+      else
+        :string
+      end
+    end
+  end
+end

data/lib/data_loader/loader.rb ADDED

@@ -0,0 +1,45 @@
+# DataLoader::Loader
+#
+# Loads CSV files into MySQL
+#
+# Config:
+#
+#   folder
+#     base folder that filenames passed to load() are joined onto
+#   table_prefix
+#     prefix joined to every table name with an underscore (default 'load')
+#     very important because an existing table will be overwritten!!!
+#   inspect_rows
+#     how many rows of the CSV file to scan to determine the data types (default 10)
+#   connection
+#     a connection name from database.yml to run it under (e.g. :production)
+#   default_ext
+#     extension to append if no file extension is specified (default 'csv')
+#   separator
+#     column separator handed to both the Inspector and the Migrator (default: a comma)
+module DataLoader
+  class Loader
+    attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator
+    def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
+      @folder, @separator = folder, separator
+      @table_prefix, @connection = table_prefix, connection
+      @default_ext = 'csv'
+      @inspect_rows = 10
+      yield(self) if block_given?
+    end
+    def load(filename, table = nil)
+      filename = [filename, default_ext].join('.') if File.extname(filename).empty?
+      full_file = File.expand_path(File.join(@folder, filename))
+      table = Migrator.derive_table_name(filename) if table.nil?
+      table = [@table_prefix, table].join('_') unless @table_prefix.blank?
+      columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
+      Migrator.migrate(full_file, columns, table, @separator, @connection)
+      table
+    end
+  end
+end

data/lib/data_loader/migrator.rb ADDED

@@ -0,0 +1,51 @@
+module DataLoader
+  class Migrator
+    def self.migrate(file, columns, table, separator = ',', conn = :root)
+      with_connection(conn) do
+        create_schema(table, columns)
+        puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
+        load_data(file, table, separator)
+      end
+    end
+    # takes an array of {:name, :type} column hashes and creates the table with :force => true and no id column
+    def self.create_schema(table_name, data_struct)
+      ActiveRecord::Schema.define do
+        create_table table_name, :force => true, :id => false do |t|
+          data_struct.each do |column|
+            t.column(column[:name], column[:type])
+          end
+        end
+      end
+    end
+    # uses MySQL LOAD DATA to import the whole file, ignoring the header line; NOTE(review): file, table_name and separator are interpolated unescaped, and lines must end in \r\n
+    def self.load_data(file, table_name, separator = ',')
+      sql = <<-SQL
+        LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name.to_s}
+          FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
+          LINES TERMINATED BY '\r\n'
+          IGNORE 1 LINES;
+      SQL
+      ActiveRecord::Base.connection.execute(sql)
+    end
+    # runs a block under connection `conn` (yields without switching in development); NOTE(review): no ensure, so the RAILS_ENV connection is not re-established if the block raises
+    def self.with_connection(conn = :root)
+      if Rails.env.development?
+        yield
+      else
+        ActiveRecord::Base.establish_connection(conn)
+        yield
+        ActiveRecord::Base.establish_connection(RAILS_ENV)
+      end
+    end
+    # derives a table name from a file path
+    def self.derive_table_name(file)
+      name = File.basename(file, File.extname(file))  # file name without directory or extension
+      name.underscore.sub(/[0-9_]*$/, '')      # remove trailing digits and underscores
+    end
+  end
+end

data/lib/data_loader/version.rb ADDED

@@ -0,0 +1,3 @@
+module DataLoader
+  VERSION = "0.1.0"
+end

data/spec/lib/data_loader/inspector_spec.rb ADDED

@@ -0,0 +1,82 @@
+require 'spec_helper'
+describe DataLoader::Inspector, "data typer" do
+  before(:each) do
+    @loader = DataLoader::Inspector
+  end
+  it "should use :integer for Fixnum" do
+    @loader.dbtype(3).should == :integer
+  end
+  it "should use :datetime for DateTime" do
+    @loader.dbtype(DateTime.new).should == :datetime
+  end
+  it "should use :string for String under 255 characters" do
+    @loader.dbtype('s' * 255).should == :string
+    @loader.dbtype('s').should == :string
+  end
+  it "should use :text for larger Strings" do
+    @loader.dbtype('s' * 256).should == :text
+    @loader.dbtype('s' * 4096).should == :text
+  end
+  it "should use nil for nil" do
+    @loader.dbtype(nil).should be_nil
+  end
+  it "should use nil for empty strings" do
+    @loader.dbtype('').should be_nil
+  end
+  it "should use nil for empty looking strings" do
+    @loader.dbtype('  ').should be_nil
+  end
+  it "should raise an error for unknown types" do
+    lambda { @loader.dbtype(Float) }.should raise_exception
+  end
+end
+describe DataLoader::Inspector, "promoter" do
+  before(:each) do
+    @loader = DataLoader::Inspector
+  end
+  it "should choose :text over :string, :datetime, or :integer" do
+    [:string, :datetime, :integer].each do |t|
+      @loader.promote_type(:text, t).should == :text
+    end
+  end
+  it "should choose :string over :datetime or :integer" do
+    @loader.promote_type(:string, :integer).should == :string
+    @loader.promote_type(:string, :datetime).should == :string
+  end
+  it "should choose :string for :datetime and :integer" do
+    @loader.promote_type(:datetime, :integer).should == :string
+  end
+  it "should choose keep the type if both the same" do
+    [:text, :string, :datetime, :integer].each do |t|
+      @loader.promote_type(t, t).should == t
+    end
+  end
+  it "should ignore nils" do
+    @loader.promote_type(:integer, nil).should == :integer
+  end
+  it "should return nil if everything is nil" do
+    @loader.promote_type(nil).should be_nil
+    @loader.promote_type(nil, nil).should be_nil
+  end
+  it "should raise an error for unknown types" do
+    lambda { @loader.promote_type(:string, :blarg) }.should raise_exception
+  end
+end

data/spec/spec_helper.rb ADDED

	@@ -0,0 +1 @@
1	+ require 'lib/data_loader/inspector'

metadata ADDED

@@ -0,0 +1,128 @@
+--- !ruby/object:Gem::Specification
+name: data_loader
+version: !ruby/object:Gem::Version
+  hash: 27
+  prerelease:
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
+platform: ruby
+authors:
+- Nathan Youngman
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-03-25 00:00:00 -06:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: fastercsv
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 11
+        segments:
+        - 1
+        - 5
+        - 4
+        version: 1.5.4
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: activerecord
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 5
+        segments:
+        - 2
+        - 3
+        version: "2.3"
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 9
+        segments:
+        - 1
+        - 3
+        version: "1.3"
+  type: :development
+  version_requirements: *id003
+description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
+email:
+- git@nathany.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- README.markdown
+- Rakefile
+- data_loader.gemspec
+- lib/data_loader.rb
+- lib/data_loader/inspector.rb
+- lib/data_loader/loader.rb
+- lib/data_loader/migrator.rb
+- lib/data_loader/version.rb
+- spec/lib/data_loader/inspector_spec.rb
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: https://github.com/nathany/data_loader
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 57
+      segments:
+      - 1
+      - 8
+      - 7
+      version: 1.8.7
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
+test_files:
+- spec/lib/data_loader/inspector_spec.rb
+- spec/spec_helper.rb