data_loader 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/LICENSE +19 -0
- data/README.markdown +42 -0
- data/Rakefile +2 -0
- data/data_loader.gemspec +24 -0
- data/lib/data_loader.rb +3 -0
- data/lib/data_loader/inspector.rb +82 -0
- data/lib/data_loader/loader.rb +45 -0
- data/lib/data_loader/migrator.rb +51 -0
- data/lib/data_loader/version.rb +3 -0
- data/spec/lib/data_loader/inspector_spec.rb +82 -0
- data/spec/spec_helper.rb +1 -0
- metadata +128 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2011 Nathan Youngman
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
## Data Loader
|
2
|
+
|
3
|
+
Data Loader is a tool to load CSV files into a MySQL database. It was designed
|
4
|
+
to import raw data into tables that could then be manipulated with SQL.
|
5
|
+
|
6
|
+
Features:
|
7
|
+
|
8
|
+
* Uses MySQL LOAD DATA to efficiently load very large files
|
9
|
+
* FasterCSV is used to inspect the first few rows and choose data types
|
10
|
+
* Converts the header row into nice Ruby-esque column names
|
11
|
+
* Builds a schema using ActiveRecord 2.x
|
12
|
+
* If table names are unspecified, they will be derived from the file name
|
13
|
+
* Will prefix table names to avoid collisions (it overwrites existing tables)
|
14
|
+
* Can run under a different connection, as defined in your database.yml
|
15
|
+
|
16
|
+
### Usage
|
17
|
+
|
18
|
+
# Configure (everything has defaults, see loader.rb)
|
19
|
+
loader = DataLoader::Loader.new do |config|
|
20
|
+
config.table_prefix = :import
|
21
|
+
config.folder = 'path/to/csv/files/'
|
22
|
+
config.inspect_rows = 10
|
23
|
+
config.connection = :development
|
24
|
+
config.separator = ','
|
25
|
+
config.default_ext = 'csv'
|
26
|
+
end
|
27
|
+
|
28
|
+
# Load data
|
29
|
+
loader.load 'my_csv_file', :my_table
|
30
|
+
|
31
|
+
|
32
|
+
### TODO
|
33
|
+
|
34
|
+
* Write the column structure for each table to a text file. This file can be stored in Git, so that if the CSV files change, the diff will make it obvious what changed.
|
35
|
+
|
36
|
+
* A task to clean up all these temporary tables when we're done.
|
37
|
+
|
38
|
+
* A post-load step in Migrator to NULLify 0000-00-00 dates, which is what MySQL stores when it reads an empty string into a date column (integer columns would remain 0).
|
39
|
+
|
40
|
+
* Broader support for Rubies, Databases, and ORM/tools for building the schema.
|
41
|
+
|
42
|
+
* Better tests!
|
data/Rakefile
ADDED
data/data_loader.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "data_loader/version"
|
4
|
+
|
5
|
+
# Gem specification for data_loader.
# The version is read from DataLoader::VERSION (lib/data_loader/version.rb),
# and the file lists are taken from whatever git currently tracks.
Gem::Specification.new do |s|
  s.name        = "data_loader"
  s.version     = DataLoader::VERSION
  s.platform    = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.8.7'
  s.authors     = ["Nathan Youngman"]
  s.email       = ["git@nathany.com"]
  s.homepage    = "https://github.com/nathany/data_loader"
  s.summary     = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
  s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
  # The bundled LICENSE file is the MIT license text; declare it so that it
  # shows up in the built gem's metadata.
  s.licenses    = ["MIT"]

  # Runtime dependencies
  s.add_dependency('fastercsv', '~> 1.5.4')
  s.add_dependency('activerecord', '~> 2.3')
  # Development-only dependency
  s.add_development_dependency('rspec', '~> 1.3')

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
data/lib/data_loader.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'fastercsv'
|
2
|
+
require 'active_support'
|
3
|
+
|
4
|
+
module DataLoader
|
5
|
+
|
6
|
+
class Inspector
|
7
|
+
# Opens the CSV at +file+ with FasterCSV and returns whatever scan_rows
# produces for it: an ordered array of {:name, :type} hashes.
#
# file         - path of the CSV file to open
# separator    - column separator handed to FasterCSV as :col_sep
# inspect_rows - maximum number of rows handed on to scan_rows
def self.inspect_file(file, separator = ',', inspect_rows = 10)
  # Header clean-up: underscore the text, turn anything outside a-z, 0-9 and
  # "_" into a space, trim, then join with single underscores.
  tidy_header = lambda do |h|
    h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_')
  end

  csv_options = {
    :col_sep           => separator,
    :converters        => [:date_time, :integer], # :integer, :float, :date, :date_time
    :headers           => true,
    :header_converters => tidy_header,
    :skip_blanks       => true
  }

  detected = nil
  FasterCSV.open(file, csv_options) do |csv|
    detected = scan_rows(csv, inspect_rows)
  end
  detected
end
|
20
|
+
|
21
|
+
# Scans up to +inspect_rows+ rows to determine a data type for each column.
#
# csv          - an object whose #gets returns the next row (or nil at the
#                end); each row must yield header/value pairs from #each
# inspect_rows - maximum number of rows to read
#
# Returns an array of {:name => header, :type => type} hashes in the column
# order of the first row read. A column whose sampled values were all
# nil/blank defaults to :string. Returns an empty array when no rows could be
# read (previously this case crashed with a NoMethodError on nil).
def self.scan_rows(csv, inspect_rows)
  first_row = nil
  columns = {} # unordered hash containing the data type for each header

  1.upto(inspect_rows) do
    row = csv.gets
    break unless row
    row.each do |header, value|
      columns[header] = promote_type(columns[header], dbtype(value))
    end
    first_row ||= row # remembered to recover the column order below
  end

  # nothing was read (e.g. a file holding only the header line)
  return [] unless first_row

  # form an ordered array based on the first row read:
  fields = []
  first_row.each do |header, _value|
    data_type = columns[header] || :string # default to :string if everything was nil
    fields << {:name => header, :type => data_type}
  end
  fields
end
|
43
|
+
|
44
|
+
# Determines which column type is most suitable for a single parsed value.
#
# value - an Integer, DateTime, String or nil
#
# Returns :integer for any Integer (not only Fixnum, so large values no
# longer raise), :datetime for a DateTime, :string for a String of at most
# 255 characters and :text for a longer one. Returns nil for nil and for
# strings that are empty or contain only whitespace.
#
# Raises RuntimeError ('Unknown type') for any other kind of value.
def self.dbtype(value)
  case value
  when Integer
    :integer
  when DateTime
    :datetime
  when String
    if value !~ /\S/ # blank: empty or whitespace only
      nil
    elsif value.length <= 255
      :string
    else
      :text
    end
  when nil
    nil
  else
    raise 'Unknown type'
  end
end
|
64
|
+
|
65
|
+
# Given any number of column types, chooses one type that fits them all.
#
# types - zero or more of :text, :string, :datetime, :integer; nils are
#         ignored
#
# Returns nil when no non-nil type was given, the type itself when all given
# types agree, :text when any of them is :text, and :string for every other
# mix.
#
# Raises RuntimeError ('Unknown type') when an unrecognised type is passed.
def self.promote_type(*types)
  known = [:text, :string, :datetime, :integer]
  present = types.compact
  return nil if present.empty?
  raise 'Unknown type' unless (present - known).empty?

  # Array#uniq instead of Set: no extra library needs to be loaded
  distinct = present.uniq
  return distinct.first if distinct.length == 1

  distinct.include?(:text) ? :text : :string
end
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# DataLoader::Loader
|
2
|
+
#
|
3
|
+
# Loads CSV files into MySQL
|
4
|
+
#
|
5
|
+
# Config:
|
6
|
+
#
|
7
|
+
# folder
|
8
|
+
# base folder for calls to load()
|
9
|
+
# table_prefix
|
10
|
+
# prefix for derived table names
|
11
|
+
# very important because an existing table will be overwritten!!!
|
12
|
+
# inspect_rows
|
13
|
+
# how many rows to scan the CSV file to determine the data types
|
14
|
+
# connection
|
15
|
+
# a connection name from database.yml to run it under (e.g. :production)
|
16
|
+
# default_ext
|
17
|
+
# extension to append if no file extension is specified
|
18
|
+
# separator
|
19
|
+
# the field separator used in the CSV files; defaults to a comma (,)
|
20
|
+
|
21
|
+
module DataLoader
|
22
|
+
|
23
|
+
class Loader
  attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator

  # Builds a loader. The positional arguments seed the matching settings;
  # default_ext starts as 'csv' and inspect_rows as 10. When a block is
  # given it receives the new loader so that any setting can be overridden.
  def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
    @folder       = folder
    @separator    = separator
    @table_prefix = table_prefix
    @connection   = connection
    @default_ext  = 'csv'
    @inspect_rows = 10
    yield(self) if block_given?
  end

  # Loads one file into a table and returns the name of the table used.
  #
  # filename - file name relative to +folder+; default_ext is appended when
  #            the name carries no extension
  # table    - target table name; derived from the file name when nil
  #
  # The table name is prefixed with table_prefix unless that prefix is blank.
  def load(filename, table = nil)
    if File.extname(filename).empty?
      filename = [filename, default_ext].join('.')
    end
    full_file = File.expand_path(File.join(@folder, filename))

    if table.nil?
      table = Migrator.derive_table_name(filename)
    end
    unless @table_prefix.blank?
      table = [@table_prefix, table].join('_')
    end

    columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
    Migrator.migrate(full_file, columns, table, @separator, @connection)
    table
  end
end
|
44
|
+
|
45
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module DataLoader
|
2
|
+
|
3
|
+
class Migrator
|
4
|
+
# Creates (or re-creates) +table+ from +columns+ and bulk-loads +file+ into
# it, all inside with_connection(conn). A progress line is printed before the
# load starts.
def self.migrate(file, columns, table, separator = ',', conn = :root)
  with_connection(conn) do
    create_schema(table, columns)
    short_name = File.basename(file)
    puts "-- load_data('#{short_name}', :#{table.to_s})"
    load_data(file, table, separator)
  end
end
|
11
|
+
|
12
|
+
# Takes an array of {:name, :type} column hashes and builds a table from it.
# The table is created with :force => true and without an id column.
def self.create_schema(table_name, data_struct)
  ActiveRecord::Schema.define do
    create_table(table_name, :force => true, :id => false) do |t|
      data_struct.each { |column| t.column(column[:name], column[:type]) }
    end
  end
end
|
22
|
+
|
23
|
+
# Uses MySQL LOAD DATA LOCAL INFILE to import the whole file, ignoring the
# header line.
#
# file            - path of the file to load
# table_name      - target table (String or Symbol)
# separator       - field separator (default ',')
# line_terminator - line ending of the file (default "\r\n", the value that
#                   used to be hard-coded)
#
# The file name, separator and line terminator are quoted as SQL string
# literals, and the table name as an identifier, by the connection adapter.
# Quotes or backslashes in them (for example in a Windows path) can
# therefore no longer break or alter the statement.
def self.load_data(file, table_name, separator = ',', line_terminator = "\r\n")
  conn = ActiveRecord::Base.connection
  sql = <<-SQL
    LOAD DATA LOCAL INFILE #{conn.quote(file.to_s)} INTO TABLE #{conn.quote_table_name(table_name.to_s)}
    FIELDS TERMINATED BY #{conn.quote(separator.to_s)} ENCLOSED BY '"'
    LINES TERMINATED BY #{conn.quote(line_terminator.to_s)}
    IGNORE 1 LINES;
  SQL
  conn.execute(sql)
end
|
33
|
+
|
34
|
+
# Runs a block under a different connection from database.yml.
#
# conn - name of the connection to switch to (default :root)
#
# In the development environment the block simply runs on the current
# connection. Otherwise the connection is switched to +conn+ for the duration
# of the block and then switched back to the one named after the current
# Rails environment. The switch back happens in an +ensure+, so it is done
# even when the block raises.
#
# Returns the value of the block.
def self.with_connection(conn = :root)
  return yield if Rails.env.development?

  ActiveRecord::Base.establish_connection(conn)
  begin
    yield
  ensure
    # Rails.env rather than the legacy RAILS_ENV constant, matching the
    # environment check above
    ActiveRecord::Base.establish_connection(Rails.env)
  end
end
|
44
|
+
|
45
|
+
# Derives a pretty table name from a file name: the extension and directory
# are dropped, the rest is underscored, and trailing digits/underscores are
# removed (e.g. a numeric suffix at the end of the name).
#
# When that removal would leave nothing (the name consists only of digits
# and underscores), the underscored name is returned unstripped rather than
# an empty string.
def self.derive_table_name(file)
  name = File.basename(file, File.extname(file)).underscore # just file
  stripped = name.sub(/[0-9_]*$/, '') # remove trailing numbers
  stripped.empty? ? name : stripped
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Examples for Inspector.dbtype: which column type is chosen for one value.
describe DataLoader::Inspector, "data typer" do
  before(:each) do
    @loader = DataLoader::Inspector
  end

  it "should use :integer for Fixnum" do
    @loader.dbtype(3).should == :integer
  end

  it "should use :datetime for DateTime" do
    @loader.dbtype(DateTime.new).should == :datetime
  end

  # 255 characters is still a :string, so the limit is inclusive
  it "should use :string for String of up to 255 characters" do
    @loader.dbtype('s' * 255).should == :string
    @loader.dbtype('s').should == :string
  end

  it "should use :text for larger Strings" do
    @loader.dbtype('s' * 256).should == :text
    @loader.dbtype('s' * 4096).should == :text
  end

  it "should use nil for nil" do
    @loader.dbtype(nil).should be_nil
  end

  it "should use nil for empty strings" do
    @loader.dbtype('').should be_nil
  end

  it "should use nil for empty looking strings" do
    @loader.dbtype(' ').should be_nil
  end

  it "should raise an error for unknown types" do
    # pass a float *value* (the example used to pass the Float class itself)
    lambda { @loader.dbtype(3.14) }.should raise_exception
  end
end
|
42
|
+
|
43
|
+
|
44
|
+
# Examples for Inspector.promote_type: which type is chosen for a mix of
# column types.
describe DataLoader::Inspector, "promoter" do
  before(:each) { @loader = DataLoader::Inspector }

  it "should choose :text over :string, :datetime, or :integer" do
    @loader.promote_type(:text, :string).should == :text
    @loader.promote_type(:text, :datetime).should == :text
    @loader.promote_type(:text, :integer).should == :text
  end

  it "should choose :string over :datetime or :integer" do
    [:integer, :datetime].each do |other|
      @loader.promote_type(:string, other).should == :string
    end
  end

  it "should choose :string for :datetime and :integer" do
    @loader.promote_type(:datetime, :integer).should == :string
  end

  it "should choose keep the type if both the same" do
    @loader.promote_type(:text, :text).should == :text
    @loader.promote_type(:string, :string).should == :string
    @loader.promote_type(:datetime, :datetime).should == :datetime
    @loader.promote_type(:integer, :integer).should == :integer
  end

  it "should ignore nils" do
    @loader.promote_type(:integer, nil).should == :integer
  end

  it "should return nil if everything is nil" do
    [[nil], [nil, nil]].each do |args|
      @loader.promote_type(*args).should be_nil
    end
  end

  it "should raise an error for unknown types" do
    lambda { @loader.promote_type(:string, :blarg) }.should raise_exception
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'lib/data_loader/inspector'
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: data_loader
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Nathan Youngman
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-03-25 00:00:00 -06:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: fastercsv
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 11
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 5
|
33
|
+
- 4
|
34
|
+
version: 1.5.4
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: activerecord
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 5
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 3
|
49
|
+
version: "2.3"
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: rspec
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 9
|
61
|
+
segments:
|
62
|
+
- 1
|
63
|
+
- 3
|
64
|
+
version: "1.3"
|
65
|
+
type: :development
|
66
|
+
version_requirements: *id003
|
67
|
+
description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
|
68
|
+
email:
|
69
|
+
- git@nathany.com
|
70
|
+
executables: []
|
71
|
+
|
72
|
+
extensions: []
|
73
|
+
|
74
|
+
extra_rdoc_files: []
|
75
|
+
|
76
|
+
files:
|
77
|
+
- .gitignore
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE
|
80
|
+
- README.markdown
|
81
|
+
- Rakefile
|
82
|
+
- data_loader.gemspec
|
83
|
+
- lib/data_loader.rb
|
84
|
+
- lib/data_loader/inspector.rb
|
85
|
+
- lib/data_loader/loader.rb
|
86
|
+
- lib/data_loader/migrator.rb
|
87
|
+
- lib/data_loader/version.rb
|
88
|
+
- spec/lib/data_loader/inspector_spec.rb
|
89
|
+
- spec/spec_helper.rb
|
90
|
+
has_rdoc: true
|
91
|
+
homepage: https://github.com/nathany/data_loader
|
92
|
+
licenses: []
|
93
|
+
|
94
|
+
post_install_message:
|
95
|
+
rdoc_options: []
|
96
|
+
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
hash: 57
|
105
|
+
segments:
|
106
|
+
- 1
|
107
|
+
- 8
|
108
|
+
- 7
|
109
|
+
version: 1.8.7
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
requirements: []
|
120
|
+
|
121
|
+
rubyforge_project:
|
122
|
+
rubygems_version: 1.6.2
|
123
|
+
signing_key:
|
124
|
+
specification_version: 3
|
125
|
+
summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
|
126
|
+
test_files:
|
127
|
+
- spec/lib/data_loader/inspector_spec.rb
|
128
|
+
- spec/spec_helper.rb
|