data_loader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .rvmrc
6
+ bin/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS; the plain-http endpoint is open to tampering in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in data_loader.gemspec
gemspec
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2011 Nathan Youngman
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,42 @@
1
+ ## Data Loader
2
+
3
+ Data Loader is a tool to load CSV files into a MySQL database. It was designed
4
+ to import raw data into tables that could then be manipulated with SQL.
5
+
6
+ Features:
7
+
8
+ * Uses MySQL LOAD DATA to efficiently load very large files
9
+ * Fastercsv is used to inspect the first few rows and choose datatypes
10
+ * Converts the header row into nice Ruby-esque column names
11
+ * Builds a schema using ActiveRecord 2.x
12
+ * If table names are unspecified, they will be derived from the file name
13
+ * Will prefix table names to avoid collisions (it overwrites existing tables)
14
+ * Can run under a different connection, as defined in your database.yml
15
+
16
+ ### Usage
17
+
18
+ # Configure (everything has defaults, see loader.rb)
19
+ loader = DataLoader::Loader.new do |config|
20
+ config.table_prefix = :import
21
+ config.folder = 'path/to/csv/files/'
22
+ config.inspect_rows = 10
23
+ config.connection = :development
24
+ config.separator = ','
25
+ config.default_ext = 'csv'
26
+ end
27
+
28
+ # Load data
29
+ loader.load 'my_csv_file', :my_table
30
+
31
+
32
+ ### TODO
33
+
34
+ * Write the column structure for each table to a text file. This file can be stored in Git, so that if the CSV files change, the diff will make it obvious what changed.
35
+
36
+ * A task to clean up all these temporary tables when we're done.
37
+
38
+ * Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
39
+
40
+ * Broader support for Rubies, Databases, and ORM/tools for building the schema.
41
+
42
+ * Better tests!
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_loader/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "data_loader"
7
+ s.version = DataLoader::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.required_ruby_version = '>= 1.8.7'
10
+ s.authors = ["Nathan Youngman"]
11
+ s.email = ["git@nathany.com"]
12
+ s.homepage = "https://github.com/nathany/data_loader"
13
+ s.summary = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
14
+ s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
15
+
16
+ s.add_dependency('fastercsv', '~> 1.5.4')
17
+ s.add_dependency('activerecord', '~> 2.3')
18
+ s.add_development_dependency('rspec', '~> 1.3')
19
+
20
+ s.files = `git ls-files`.split("\n")
21
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
22
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
23
+ s.require_paths = ["lib"]
24
+ end
@@ -0,0 +1,3 @@
1
+ require 'data_loader/inspector'
2
+ require 'data_loader/migrator'
3
+ require 'data_loader/loader' # <= start here
@@ -0,0 +1,82 @@
1
+ require 'fastercsv'
2
+ require 'active_support'
3
+
4
module DataLoader

  # Samples the first rows of a CSV file and proposes a column name and a
  # schema type (:integer, :datetime, :string or :text) for every header.
  class Inspector
    # The only types promote_type knows how to reconcile.
    KNOWN_TYPES = [:text, :string, :datetime, :integer].freeze

    # Read a CSV and return the columns and types in an ordered array.
    #
    # file         - path of the CSV file to open
    # separator    - column separator handed to FasterCSV as :col_sep
    # inspect_rows - maximum number of data rows to sample
    #
    # Returns an array of {:name => header, :type => type} hashes, in the
    # order the columns appear in the file.
    def self.inspect_file(file, separator = ',', inspect_rows = 10)
      fields = nil
      FasterCSV.open(file,
        :col_sep => separator,
        :converters => [:date_time, :integer], # :integer, :float, :date, :date_time
        :headers => true,
        # underscore the header, turn anything outside a-z0-9_ into a single '_'
        :header_converters => lambda { |h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
        :skip_blanks => true) do |csv|
        fields = scan_rows(csv, inspect_rows)
      end
      fields
    end

    # Scan a few rows to determine data types.
    #
    # csv          - object responding to #gets, returning rows that yield
    #                (header, value) pairs from #each, or nil at end of input
    # inspect_rows - maximum number of rows to read
    #
    # Returns the ordered array of {:name, :type} hashes. Columns whose sampled
    # values were all nil/blank default to :string. When no data row could be
    # read, the columns are taken from csv.headers if that is an Array (all
    # typed :string); otherwise an empty array is returned.
    def self.scan_rows(csv, inspect_rows)
      first_row = nil
      columns = {} # unordered hash containing data types for each header

      1.upto(inspect_rows) do
        row = csv.gets
        break unless row
        row.each do |header, value|
          columns[header] = promote_type(columns[header], dbtype(value))
        end
        first_row ||= row # save for later, it fixes the column order
      end

      # No data rows: avoid calling #each on nil.
      # NOTE(review): assumes the CSV object exposes the parsed header row as
      # an Array via #headers once the header line has been consumed - confirm.
      if first_row.nil?
        headers = csv.respond_to?(:headers) ? csv.headers : nil
        return [] unless headers.is_a?(Array)
        return headers.map { |header| {:name => header, :type => :string} }
      end

      # form an ordered array based on the first row read:
      fields = []
      first_row.each do |header, _value|
        fields << {:name => header, :type => columns[header] || :string}
      end
      fields
    end

    # Determine what datatype is most suitable for the value.
    #
    # Returns :integer, :datetime, :string (up to 255 characters), :text
    # (longer strings), or nil for nil and blank strings.
    # Raises RuntimeError ('Unknown type') for any other kind of value.
    def self.dbtype(value)
      case value
      when nil
        nil
      when Integer # covers both small and arbitrarily large integers
        :integer
      when DateTime
        :datetime
      when String
        if value.blank?
          nil
        elsif value.length <= 255
          :string
        else
          :text
        end
      else
        raise 'Unknown type'
      end
    end

    # Given two (or more) datatypes choose what fits them all.
    #
    # nils are ignored; identical types are kept; any mix containing :text
    # becomes :text; every other mix becomes :string.
    # Returns nil when every argument is nil.
    # Raises RuntimeError ('Unknown type') for a type outside KNOWN_TYPES.
    def self.promote_type(*types)
      types = types.compact
      return nil if types.empty?
      raise 'Unknown type' unless (types - KNOWN_TYPES).empty?
      return types.first if types.uniq.length == 1
      types.include?(:text) ? :text : :string
    end
  end

end
@@ -0,0 +1,45 @@
1
+ # DataLoader::Loader
2
+ #
3
+ # Loads CSV files into MySQL
4
+ #
5
+ # Config:
6
+ #
7
+ # folder
8
+ # base folder for calls to load()
9
+ # table_prefix
10
+ # prefix for derived table names
11
+ # very important because an existing table will be overwritten!!!
12
+ # inspect_rows
13
+ # how many rows to scan the CSV file to determine the data types
14
+ # connection
15
+ # a connection name from database.yml to run it under (e.g. :production)
16
+ # default_ext
17
+ # extension to append if no file extension is specified
18
+ # separator
19
+ # a comma (,)
20
+
21
+ module DataLoader
22
+
23
+ class Loader
24
+ attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator
25
+
26
+ def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
27
+ @folder, @separator = folder, separator
28
+ @table_prefix, @connection = table_prefix, connection
29
+ @default_ext = 'csv'
30
+ @inspect_rows = 10
31
+ yield(self) if block_given?
32
+ end
33
+
34
+ def load(filename, table = nil)
35
+ filename = [filename, default_ext].join('.') if File.extname(filename).empty?
36
+ full_file = File.expand_path(File.join(@folder, filename))
37
+ table = Migrator.derive_table_name(filename) if table.nil?
38
+ table = [@table_prefix, table].join('_') unless @table_prefix.blank?
39
+ columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
40
+ Migrator.migrate(full_file, columns, table, @separator, @connection)
41
+ table
42
+ end
43
+ end
44
+
45
+ end
@@ -0,0 +1,51 @@
1
module DataLoader

  # Creates the destination table and bulk-loads a CSV file into it.
  class Migrator
    # Create the table for +columns+ and load +file+ into it, all under the
    # connection selected by with_connection.
    #
    # file      - full path of the CSV file
    # columns   - array of {:name, :type} hashes describing the table
    # table     - destination table name
    # separator - column separator of the file
    # conn      - connection name passed to with_connection
    def self.migrate(file, columns, table, separator = ',', conn = :root)
      with_connection(conn) do
        create_schema(table, columns)
        puts "-- load_data('#{File.basename(file)}', :#{table})"
        load_data(file, table, separator)
      end
    end

    # Takes a column,type data structure and makes a table.
    # :force => true replaces an existing table of the same name; no id column
    # is added.
    def self.create_schema(table_name, data_struct)
      ActiveRecord::Schema.define do
        create_table table_name, :force => true, :id => false do |t|
          data_struct.each do |column|
            t.column(column[:name], column[:type])
          end
        end
      end
    end

    # Uses MySQL LOAD DATA to import the whole file, ignoring the header line.
    #
    # The file path, table name and separator are quoted through the connection
    # adapter so that quotes in a path or unusual table names cannot break (or
    # alter) the statement. Lines are expected to end in CR LF.
    def self.load_data(file, table_name, separator = ',')
      db = ActiveRecord::Base.connection
      sql = <<-SQL
        LOAD DATA LOCAL INFILE #{db.quote(file.to_s)} INTO TABLE #{db.quote_table_name(table_name.to_s)}
        FIELDS TERMINATED BY #{db.quote(separator.to_s)} ENCLOSED BY '"'
        LINES TERMINATED BY '\r\n'
        IGNORE 1 LINES;
      SQL
      db.execute(sql)
    end

    # Runs a block under a different connection from database.yml.
    #
    # In the Rails development environment the block runs on the current
    # connection. Otherwise +conn+ is established first and the connection for
    # the current Rails environment is re-established afterwards, even when
    # the block raises. Returns the value of the block.
    def self.with_connection(conn = :root)
      return yield if Rails.env.development?

      ActiveRecord::Base.establish_connection(conn)
      begin
        yield
      ensure
        ActiveRecord::Base.establish_connection(Rails.env.to_s)
      end
    end

    # A pretty table name: the file's base name without extension,
    # underscored, with trailing digits and underscores removed.
    def self.derive_table_name(file)
      name = File.basename(file, File.extname(file)) # just file
      name.underscore.sub(/[0-9_]*$/, '') # remove trailing numbers
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module DataLoader
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,82 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataLoader::Inspector, "data typer" do
4
+ before(:each) do
5
+ @loader = DataLoader::Inspector
6
+ end
7
+
8
+ it "should use :integer for Fixnum" do
9
+ @loader.dbtype(3).should == :integer
10
+ end
11
+
12
+ it "should use :datetime for DateTime" do
13
+ @loader.dbtype(DateTime.new).should == :datetime
14
+ end
15
+
16
+ it "should use :string for String under 255 characters" do
17
+ @loader.dbtype('s' * 255).should == :string
18
+ @loader.dbtype('s').should == :string
19
+ end
20
+
21
+ it "should use :text for larger Strings" do
22
+ @loader.dbtype('s' * 256).should == :text
23
+ @loader.dbtype('s' * 4096).should == :text
24
+ end
25
+
26
+ it "should use nil for nil" do
27
+ @loader.dbtype(nil).should be_nil
28
+ end
29
+
30
+ it "should use nil for empty strings" do
31
+ @loader.dbtype('').should be_nil
32
+ end
33
+
34
+ it "should use nil for empty looking strings" do
35
+ @loader.dbtype(' ').should be_nil
36
+ end
37
+
38
# Pass an actual Float value: the class object Float only raised because it
# happens not to be an Integer, DateTime, String or nil either.
it "should raise an error for unknown types" do
  lambda { @loader.dbtype(3.14) }.should raise_exception
end
41
+ end
42
+
43
+
44
+ describe DataLoader::Inspector, "promoter" do
45
+ before(:each) do
46
+ @loader = DataLoader::Inspector
47
+ end
48
+
49
+ it "should choose :text over :string, :datetime, or :integer" do
50
+ [:string, :datetime, :integer].each do |t|
51
+ @loader.promote_type(:text, t).should == :text
52
+ end
53
+ end
54
+
55
+ it "should choose :string over :datetime or :integer" do
56
+ @loader.promote_type(:string, :integer).should == :string
57
+ @loader.promote_type(:string, :datetime).should == :string
58
+ end
59
+
60
+ it "should choose :string for :datetime and :integer" do
61
+ @loader.promote_type(:datetime, :integer).should == :string
62
+ end
63
+
64
+ it "should choose keep the type if both the same" do
65
+ [:text, :string, :datetime, :integer].each do |t|
66
+ @loader.promote_type(t, t).should == t
67
+ end
68
+ end
69
+
70
+ it "should ignore nils" do
71
+ @loader.promote_type(:integer, nil).should == :integer
72
+ end
73
+
74
+ it "should return nil if everything is nil" do
75
+ @loader.promote_type(nil).should be_nil
76
+ @loader.promote_type(nil, nil).should be_nil
77
+ end
78
+
79
+ it "should raise an error for unknown types" do
80
+ lambda { @loader.promote_type(:string, :blarg) }.should raise_exception
81
+ end
82
+ end
@@ -0,0 +1 @@
1
+ require 'lib/data_loader/inspector'
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_loader
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Nathan Youngman
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-03-25 00:00:00 -06:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: fastercsv
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 11
30
+ segments:
31
+ - 1
32
+ - 5
33
+ - 4
34
+ version: 1.5.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: activerecord
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ hash: 5
46
+ segments:
47
+ - 2
48
+ - 3
49
+ version: "2.3"
50
+ type: :runtime
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: rspec
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 9
61
+ segments:
62
+ - 1
63
+ - 3
64
+ version: "1.3"
65
+ type: :development
66
+ version_requirements: *id003
67
+ description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
68
+ email:
69
+ - git@nathany.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files: []
75
+
76
+ files:
77
+ - .gitignore
78
+ - Gemfile
79
+ - LICENSE
80
+ - README.markdown
81
+ - Rakefile
82
+ - data_loader.gemspec
83
+ - lib/data_loader.rb
84
+ - lib/data_loader/inspector.rb
85
+ - lib/data_loader/loader.rb
86
+ - lib/data_loader/migrator.rb
87
+ - lib/data_loader/version.rb
88
+ - spec/lib/data_loader/inspector_spec.rb
89
+ - spec/spec_helper.rb
90
+ has_rdoc: true
91
+ homepage: https://github.com/nathany/data_loader
92
+ licenses: []
93
+
94
+ post_install_message:
95
+ rdoc_options: []
96
+
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ hash: 57
105
+ segments:
106
+ - 1
107
+ - 8
108
+ - 7
109
+ version: 1.8.7
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ hash: 3
116
+ segments:
117
+ - 0
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project:
122
+ rubygems_version: 1.6.2
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
126
+ test_files:
127
+ - spec/lib/data_loader/inspector_spec.rb
128
+ - spec/spec_helper.rb