data_loader 0.2.0 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -1
- data/README.markdown +5 -5
- data/data_loader.gemspec +3 -2
- data/lib/data_loader/inspector.rb +53 -11
- data/lib/data_loader/loader.rb +18 -9
- data/lib/data_loader/migrator.rb +22 -7
- data/lib/data_loader/version.rb +1 -1
- metadata +32 -17
data/.gitignore
CHANGED
data/README.markdown
CHANGED
@@ -7,8 +7,9 @@ Features:
|
|
7
7
|
|
8
8
|
* Uses MySQL LOAD DATA to efficiently load very large files
|
9
9
|
* FasterCSV is used to inspect the first few rows and choose datatypes
|
10
|
+
* Datatypes can be overridden (types are :text, :string, :datetime, :integer)
|
10
11
|
* Converts the header row into nice Ruby-esque column names
|
11
|
-
* Builds a schema using ActiveRecord
|
12
|
+
* Builds a schema using ActiveRecord
|
12
13
|
* If table names are unspecified, they will be derived from the file name
|
13
14
|
* Will prefix table names to avoid collisions (it overwrites existing tables)
|
14
15
|
* Can run under a different connection, as defined in your database.yml
|
@@ -24,20 +25,19 @@ Features:
|
|
24
25
|
config.connection = :development
|
25
26
|
config.separator = ','
|
26
27
|
config.default_ext = 'csv'
|
28
|
+
config.use_local = true
|
27
29
|
end
|
28
30
|
|
29
31
|
# Load data
|
30
|
-
loader.load 'my_csv_file', :my_table
|
32
|
+
loader.load 'my_csv_file', :my_table, :cancel_at => :datetime
|
31
33
|
|
32
34
|
|
33
35
|
### TODO
|
34
36
|
|
35
37
|
* A task to clean up all these temporary tables when we're done.
|
36
38
|
|
37
|
-
* Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
|
38
|
-
|
39
39
|
* Broader support for Rubies, Databases, and ORM/tools for building the schema.
|
40
40
|
|
41
41
|
* More options for the log file (txt vs textile, filename).
|
42
42
|
|
43
|
-
* Better tests!
|
43
|
+
* Better tests!
|
data/data_loader.gemspec
CHANGED
@@ -12,10 +12,11 @@ Gem::Specification.new do |s|
|
|
12
12
|
s.homepage = "https://github.com/nathany/data_loader"
|
13
13
|
s.summary = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
|
14
14
|
s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
|
15
|
-
|
15
|
+
|
16
16
|
s.add_dependency('fastercsv', '~> 1.5.4')
|
17
|
-
s.add_dependency('activerecord', '
|
17
|
+
s.add_dependency('activerecord', '>= 2.0.0')
|
18
18
|
s.add_development_dependency('rspec', '~> 1.3')
|
19
|
+
s.add_development_dependency('rake', '~> 0.9.2')
|
19
20
|
|
20
21
|
s.files = `git ls-files`.split("\n")
|
21
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -1,11 +1,20 @@
|
|
1
1
|
require 'fastercsv'
|
2
2
|
require 'active_support'
|
3
3
|
|
4
|
+
# FasterCSV will auto-detect the line separator, which we'd like to pass to MySQL
|
5
|
+
class FasterCSV
|
6
|
+
attr_reader :row_sep
|
7
|
+
end
|
8
|
+
|
4
9
|
module DataLoader
|
5
10
|
|
6
11
|
class Inspector
|
12
|
+
class << self
|
13
|
+
attr_reader :row_sep # set after inspect_file
|
14
|
+
end
|
15
|
+
|
7
16
|
# read a csv and return the columns and types in an ordered array
|
8
|
-
def self.inspect_file(file, separator = ',', inspect_rows = 10)
|
17
|
+
def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
|
9
18
|
fields = nil
|
10
19
|
FasterCSV.open(file,
|
11
20
|
:col_sep => separator,
|
@@ -13,31 +22,64 @@ module DataLoader
|
|
13
22
|
:headers => true,
|
14
23
|
:header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
|
15
24
|
:skip_blanks => true) do |csv|
|
16
|
-
|
25
|
+
@row_sep = csv.row_sep
|
26
|
+
fields = scan_rows(csv, inspect_rows, hints)
|
17
27
|
end
|
18
28
|
fields
|
19
29
|
end
|
20
30
|
|
21
31
|
# scan a few rows to determine data types
|
22
|
-
def self.scan_rows(csv, inspect_rows)
|
32
|
+
def self.scan_rows(csv, inspect_rows, hints = {})
|
23
33
|
first_row = nil
|
24
|
-
columns = {} # unordered hash containing
|
34
|
+
columns = {} # unordered hash containing data types for each header
|
25
35
|
|
26
36
|
1.upto(inspect_rows) do
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
37
|
+
begin
|
38
|
+
row = csv.gets
|
39
|
+
break unless row
|
40
|
+
row.each do |header, value|
|
41
|
+
columns[header] = promote_type(columns[header], dbtype(value))
|
42
|
+
end
|
43
|
+
first_row ||= row # save for later
|
44
|
+
rescue FasterCSV::MalformedCSVError => boom
|
45
|
+
# Don't care about the error but let's retry, since fastercsv will skip this line
|
46
|
+
retry
|
31
47
|
end
|
32
|
-
first_row ||= row # save for later
|
33
48
|
end
|
34
49
|
|
35
50
|
# form an ordered array based on the first row read:
|
36
51
|
fields = []
|
37
52
|
first_row.each do |header, value|
|
38
|
-
data_type = columns[header]
|
53
|
+
data_type = columns[header]
|
39
54
|
fields << {:name => header, :type => data_type}
|
40
55
|
end
|
56
|
+
|
57
|
+
# validate hints
|
58
|
+
hints.stringify_keys!
|
59
|
+
invalid_columns = hints.keys - fields.map {|f| f[:name]}
|
60
|
+
puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
|
61
|
+
invalid_types = hints.values - [:text, :string, :datetime, :integer]
|
62
|
+
abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?
|
63
|
+
|
64
|
+
fields.each do |field|
|
65
|
+
name, field_type = field[:name], field[:type]
|
66
|
+
# override columns with hints
|
67
|
+
if hints.has_key?(name)
|
68
|
+
hint_type = hints[name].to_sym
|
69
|
+
if field_type.nil?
|
70
|
+
puts "Note: undertermined type for #{name} hinted as #{hint_type}."
|
71
|
+
elsif hint_type != field_type
|
72
|
+
puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
|
73
|
+
end
|
74
|
+
field[:type] = hint_type
|
75
|
+
end
|
76
|
+
# default to :string if everything was nil (and no hint)
|
77
|
+
if field[:type].nil?
|
78
|
+
puts "Warning: type could not be determined for #{name}, defaulting to string."
|
79
|
+
field[:type] = :string
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
41
83
|
fields
|
42
84
|
end
|
43
85
|
|
@@ -79,4 +121,4 @@ module DataLoader
|
|
79
121
|
end
|
80
122
|
end
|
81
123
|
|
82
|
-
end
|
124
|
+
end
|
data/lib/data_loader/loader.rb
CHANGED
@@ -13,6 +13,9 @@
|
|
13
13
|
# how many rows to scan the CSV file to determine the data types
|
14
14
|
# connection
|
15
15
|
# a connection name from database.yml to run it under (e.g. :production)
|
16
|
+
# use_local
|
17
|
+
# when true, use LOAD DATA LOCAL INFILE with MySQL if server can't access file
|
18
|
+
# requires MySQL to be compiled with --enable-local-infile
|
16
19
|
# default_ext
|
17
20
|
# extension to append if no file extension is specified
|
18
21
|
# separator
|
@@ -23,38 +26,44 @@
|
|
23
26
|
module DataLoader
|
24
27
|
|
25
28
|
class Loader
|
26
|
-
attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log
|
29
|
+
attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log, :use_local
|
27
30
|
|
28
31
|
def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
|
29
32
|
@folder, @separator = folder, separator
|
30
33
|
@table_prefix, @connection = table_prefix, connection
|
31
34
|
@default_ext = 'csv'
|
32
35
|
@inspect_rows = 10
|
36
|
+
@use_local = false # with MySQL INFILE
|
33
37
|
@log = true
|
34
38
|
yield(self) if block_given?
|
35
39
|
@logfile = File.expand_path(File.join(@folder, 'data_loader.textile'))
|
36
40
|
puts @logfile
|
37
41
|
end
|
38
42
|
|
39
|
-
|
43
|
+
# load
|
44
|
+
# - filename - name of file to load (resolved relative to folder; default_ext is appended if it has no extension)
|
45
|
+
# - table - table to load the file into (table_prefix is prepended); derived from filename when omitted
|
46
|
+
# - hints - hash of column name => data type (one of :text, :string, :datetime, :integer)
|
47
|
+
def load(filename, table = nil, hints = {})
|
40
48
|
filename = [filename, default_ext].join('.') if File.extname(filename).empty?
|
41
49
|
full_file = File.expand_path(File.join(@folder, filename))
|
42
50
|
table = Migrator.derive_table_name(filename) if table.nil?
|
43
51
|
table = [@table_prefix, table].join('_') unless @table_prefix.blank?
|
44
|
-
columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
|
52
|
+
columns = Inspector.inspect_file(full_file, @separator, @inspect_rows, hints)
|
53
|
+
row_sep = Inspector.row_sep
|
45
54
|
log_columns(table, columns)
|
46
|
-
Migrator.migrate(full_file, columns, table, @separator, @connection)
|
55
|
+
Migrator.migrate(full_file, columns, table, @separator, @connection, @use_local, row_sep)
|
47
56
|
table
|
48
57
|
end
|
49
|
-
|
58
|
+
|
50
59
|
def log(text)
|
51
60
|
return unless @log
|
52
|
-
|
61
|
+
|
53
62
|
File.open(@logfile, 'a') do |file|
|
54
63
|
file << text
|
55
64
|
end
|
56
65
|
end
|
57
|
-
|
66
|
+
|
58
67
|
def clear_log
|
59
68
|
FileUtils.remove(@logfile) if File.exist?(@logfile)
|
60
69
|
end
|
@@ -63,7 +72,7 @@ module DataLoader
|
|
63
72
|
|
64
73
|
def log_columns(table, columns)
|
65
74
|
return unless @log
|
66
|
-
|
75
|
+
|
67
76
|
File.open(@logfile, 'a') do |file|
|
68
77
|
file << "\ntable{width:80%}.\n|_\\2. #{table} |\n" # table header (textile)
|
69
78
|
columns.each_with_index do |column, index|
|
@@ -75,7 +84,7 @@ module DataLoader
|
|
75
84
|
end
|
76
85
|
end
|
77
86
|
end
|
78
|
-
|
87
|
+
|
79
88
|
end
|
80
89
|
end
|
81
90
|
|
data/lib/data_loader/migrator.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
module DataLoader
|
2
2
|
|
3
3
|
class Migrator
|
4
|
-
def self.migrate(file, columns, table, separator = ',', conn = :root)
|
4
|
+
def self.migrate(file, columns, table, separator = ',', conn = :root, local = false, row_sep = "\r\n")
|
5
5
|
with_connection(conn) do
|
6
6
|
create_schema(table, columns)
|
7
7
|
puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
|
8
|
-
load_data(file, table, separator)
|
8
|
+
load_data(file, table, local, separator, row_sep)
|
9
|
+
nullify_dates(table, columns)
|
9
10
|
end
|
10
11
|
end
|
11
12
|
|
@@ -20,12 +21,26 @@ module DataLoader
|
|
20
21
|
end
|
21
22
|
end
|
22
23
|
|
24
|
+
# empty strings are imported as 0000-00-00 00:00:00; convert them to NULL
|
25
|
+
def self.nullify_dates(table_name, data_struct)
|
26
|
+
date_columns = data_struct.map {|column| column[:name] if column[:type] == :datetime }.compact!
|
27
|
+
date_columns.each do |column|
|
28
|
+
sql = <<-SQL
|
29
|
+
UPDATE #{table_name}
|
30
|
+
SET #{column} = NULL
|
31
|
+
WHERE #{column} = 0
|
32
|
+
SQL
|
33
|
+
ActiveRecord::Base.connection.execute(sql)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
23
37
|
# uses MySQL LOAD DATA to import the whole file, ignoring the header line
|
24
|
-
def self.load_data(file, table_name, separator = ',')
|
38
|
+
def self.load_data(file, table_name, local, separator = ',', row_sep = "\r\n")
|
39
|
+
local_txt = local ? "LOCAL" : ''
|
25
40
|
sql = <<-SQL
|
26
|
-
LOAD DATA
|
41
|
+
LOAD DATA #{local_txt} INFILE '#{file}' INTO TABLE #{table_name.to_s}
|
27
42
|
FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
|
28
|
-
LINES TERMINATED BY '
|
43
|
+
LINES TERMINATED BY '#{row_sep}'
|
29
44
|
IGNORE 1 LINES;
|
30
45
|
SQL
|
31
46
|
ActiveRecord::Base.connection.execute(sql)
|
@@ -36,7 +51,7 @@ module DataLoader
|
|
36
51
|
if Rails.env.development?
|
37
52
|
yield
|
38
53
|
else
|
39
|
-
ActiveRecord::Base.establish_connection(conn)
|
54
|
+
ActiveRecord::Base.establish_connection(conn)
|
40
55
|
yield
|
41
56
|
ActiveRecord::Base.establish_connection(RAILS_ENV)
|
42
57
|
end
|
@@ -48,4 +63,4 @@ module DataLoader
|
|
48
63
|
name.underscore.sub(/[0-9_]*$/, '') # remove trailing numbers
|
49
64
|
end
|
50
65
|
end
|
51
|
-
end
|
66
|
+
end
|
data/lib/data_loader/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_loader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 4
|
10
|
+
version: 0.2.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Nathan Youngman
|
@@ -15,11 +15,10 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-10-22 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
|
21
|
+
type: :runtime
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
@@ -32,25 +31,26 @@ dependencies:
|
|
32
31
|
- 5
|
33
32
|
- 4
|
34
33
|
version: 1.5.4
|
35
|
-
type: :runtime
|
36
34
|
version_requirements: *id001
|
35
|
+
name: fastercsv
|
37
36
|
- !ruby/object:Gem::Dependency
|
38
|
-
|
37
|
+
type: :runtime
|
39
38
|
prerelease: false
|
40
39
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
40
|
none: false
|
42
41
|
requirements:
|
43
|
-
- -
|
42
|
+
- - ">="
|
44
43
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
44
|
+
hash: 15
|
46
45
|
segments:
|
47
46
|
- 2
|
48
|
-
-
|
49
|
-
|
50
|
-
|
47
|
+
- 0
|
48
|
+
- 0
|
49
|
+
version: 2.0.0
|
51
50
|
version_requirements: *id002
|
51
|
+
name: activerecord
|
52
52
|
- !ruby/object:Gem::Dependency
|
53
|
-
|
53
|
+
type: :development
|
54
54
|
prerelease: false
|
55
55
|
requirement: &id003 !ruby/object:Gem::Requirement
|
56
56
|
none: false
|
@@ -62,8 +62,24 @@ dependencies:
|
|
62
62
|
- 1
|
63
63
|
- 3
|
64
64
|
version: "1.3"
|
65
|
-
type: :development
|
66
65
|
version_requirements: *id003
|
66
|
+
name: rspec
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
type: :development
|
69
|
+
prerelease: false
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 63
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
- 9
|
79
|
+
- 2
|
80
|
+
version: 0.9.2
|
81
|
+
version_requirements: *id004
|
82
|
+
name: rake
|
67
83
|
description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
|
68
84
|
email:
|
69
85
|
- git@nathany.com
|
@@ -87,7 +103,6 @@ files:
|
|
87
103
|
- lib/data_loader/version.rb
|
88
104
|
- spec/lib/data_loader/inspector_spec.rb
|
89
105
|
- spec/spec_helper.rb
|
90
|
-
has_rdoc: true
|
91
106
|
homepage: https://github.com/nathany/data_loader
|
92
107
|
licenses: []
|
93
108
|
|
@@ -119,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
134
|
requirements: []
|
120
135
|
|
121
136
|
rubyforge_project:
|
122
|
-
rubygems_version: 1.
|
137
|
+
rubygems_version: 1.8.11
|
123
138
|
signing_key:
|
124
139
|
specification_version: 3
|
125
140
|
summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
|