data_loader 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -1
- data/README.markdown +5 -5
- data/data_loader.gemspec +3 -2
- data/lib/data_loader/inspector.rb +53 -11
- data/lib/data_loader/loader.rb +18 -9
- data/lib/data_loader/migrator.rb +22 -7
- data/lib/data_loader/version.rb +1 -1
- metadata +32 -17
data/.gitignore
CHANGED
data/README.markdown
CHANGED
@@ -7,8 +7,9 @@ Features:
|
|
7
7
|
|
8
8
|
* Uses MySQL LOAD DATA to efficiently load very large files
|
9
9
|
* Fastercsv is used to inspect the first few rows and choose datatypes
|
10
|
+
* Datatypes can be overridden (types are :text, :string, :datetime, :integer)
|
10
11
|
* Converts header row into nice ruby-esque column names
|
11
|
-
* Builds a schema using ActiveRecord
|
12
|
+
* Builds a schema using ActiveRecord
|
12
13
|
* If table names are unspecified, they will be derived from the file name
|
13
14
|
* Will prefix table names to avoid collisions (it overwrites existing tables)
|
14
15
|
* Can run under a different connection, as defined in your database.yml
|
@@ -24,20 +25,19 @@ Features:
|
|
24
25
|
config.connection = :development
|
25
26
|
config.separator = ','
|
26
27
|
config.default_ext = 'csv'
|
28
|
+
config.use_local = true
|
27
29
|
end
|
28
30
|
|
29
31
|
# Load data
|
30
|
-
loader.load 'my_csv_file', :my_table
|
32
|
+
loader.load 'my_csv_file', :my_table, :cancel_at => :datetime
|
31
33
|
|
32
34
|
|
33
35
|
### TODO
|
34
36
|
|
35
37
|
* A task to clean up all these temporary tables when we're done.
|
36
38
|
|
37
|
-
* Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
|
38
|
-
|
39
39
|
* Broader support for Rubies, Databases, and ORM/tools for building the schema.
|
40
40
|
|
41
41
|
* More options for the log file (txt vs textile, filename).
|
42
42
|
|
43
|
-
* Better tests!
|
43
|
+
* Better tests!
|
data/data_loader.gemspec
CHANGED
@@ -12,10 +12,11 @@ Gem::Specification.new do |s|
|
|
12
12
|
s.homepage = "https://github.com/nathany/data_loader"
|
13
13
|
s.summary = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
|
14
14
|
s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
|
15
|
-
|
15
|
+
|
16
16
|
s.add_dependency('fastercsv', '~> 1.5.4')
|
17
|
-
s.add_dependency('activerecord', '
|
17
|
+
s.add_dependency('activerecord', '>= 2.0.0')
|
18
18
|
s.add_development_dependency('rspec', '~> 1.3')
|
19
|
+
s.add_development_dependency('rake', '~> 0.9.2')
|
19
20
|
|
20
21
|
s.files = `git ls-files`.split("\n")
|
21
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
@@ -1,11 +1,20 @@
|
|
1
1
|
require 'fastercsv'
|
2
2
|
require 'active_support'
|
3
3
|
|
4
|
+
# FasterCSV will auto-detect the line separator, which we'd like to pass to MySQL
|
5
|
+
class FasterCSV
|
6
|
+
attr_reader :row_sep
|
7
|
+
end
|
8
|
+
|
4
9
|
module DataLoader
|
5
10
|
|
6
11
|
class Inspector
|
12
|
+
class << self
|
13
|
+
attr_reader :row_sep # set after inspect_file
|
14
|
+
end
|
15
|
+
|
7
16
|
# read a csv and return the columns and types in an ordered array
|
8
|
-
def self.inspect_file(file, separator = ',', inspect_rows = 10)
|
17
|
+
def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
|
9
18
|
fields = nil
|
10
19
|
FasterCSV.open(file,
|
11
20
|
:col_sep => separator,
|
@@ -13,31 +22,64 @@ module DataLoader
|
|
13
22
|
:headers => true,
|
14
23
|
:header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
|
15
24
|
:skip_blanks => true) do |csv|
|
16
|
-
|
25
|
+
@row_sep = csv.row_sep
|
26
|
+
fields = scan_rows(csv, inspect_rows, hints)
|
17
27
|
end
|
18
28
|
fields
|
19
29
|
end
|
20
30
|
|
21
31
|
# scan a few rows to determine data types
|
22
|
-
def self.scan_rows(csv, inspect_rows)
|
32
|
+
def self.scan_rows(csv, inspect_rows, hints = {})
|
23
33
|
first_row = nil
|
24
|
-
columns = {} # unordered hash containing
|
34
|
+
columns = {} # unordered hash containing data types for each header
|
25
35
|
|
26
36
|
1.upto(inspect_rows) do
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
37
|
+
begin
|
38
|
+
row = csv.gets
|
39
|
+
break unless row
|
40
|
+
row.each do |header, value|
|
41
|
+
columns[header] = promote_type(columns[header], dbtype(value))
|
42
|
+
end
|
43
|
+
first_row ||= row # save for later
|
44
|
+
rescue FasterCSV::MalformedCSVError => boom
|
45
|
+
# Don't care about the error but let's retry, since fastercsv will skip this line
|
46
|
+
retry
|
31
47
|
end
|
32
|
-
first_row ||= row # save for later
|
33
48
|
end
|
34
49
|
|
35
50
|
# form an ordered array based on the first row read:
|
36
51
|
fields = []
|
37
52
|
first_row.each do |header, value|
|
38
|
-
data_type = columns[header]
|
53
|
+
data_type = columns[header]
|
39
54
|
fields << {:name => header, :type => data_type}
|
40
55
|
end
|
56
|
+
|
57
|
+
# validate hints
|
58
|
+
hints.stringify_keys!
|
59
|
+
invalid_columns = hints.keys - fields.map {|f| f[:name]}
|
60
|
+
puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
|
61
|
+
invalid_types = hints.values - [:text, :string, :datetime, :integer]
|
62
|
+
abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?
|
63
|
+
|
64
|
+
fields.each do |field|
|
65
|
+
name, field_type = field[:name], field[:type]
|
66
|
+
# override columns with hints
|
67
|
+
if hints.has_key?(name)
|
68
|
+
hint_type = hints[name].to_sym
|
69
|
+
if field_type.nil?
|
70
|
+
puts "Note: undertermined type for #{name} hinted as #{hint_type}."
|
71
|
+
elsif hint_type != field_type
|
72
|
+
puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
|
73
|
+
end
|
74
|
+
field[:type] = hint_type
|
75
|
+
end
|
76
|
+
# default to :string if everything was nil (and no hint)
|
77
|
+
if field[:type].nil?
|
78
|
+
puts "Warning: type could not be determined for #{name}, defaulting to string."
|
79
|
+
field[:type] = :string
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
41
83
|
fields
|
42
84
|
end
|
43
85
|
|
@@ -79,4 +121,4 @@ module DataLoader
|
|
79
121
|
end
|
80
122
|
end
|
81
123
|
|
82
|
-
end
|
124
|
+
end
|
data/lib/data_loader/loader.rb
CHANGED
@@ -13,6 +13,9 @@
|
|
13
13
|
# how many rows to scan the CSV file to determine the data types
|
14
14
|
# connection
|
15
15
|
# a connection name from database.yml to run it under (e.g. :production)
|
16
|
+
# use_local
|
17
|
+
# when true, use LOAD DATA LOCAL INFILE with MySQL if server can't access file
|
18
|
+
# requires MySQL to be compiled with --enable-local-infile
|
16
19
|
# default_ext
|
17
20
|
# extension to append if no file extension is specified
|
18
21
|
# separator
|
@@ -23,38 +26,44 @@
|
|
23
26
|
module DataLoader
|
24
27
|
|
25
28
|
class Loader
|
26
|
-
attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log
|
29
|
+
attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log, :use_local
|
27
30
|
|
28
31
|
def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
|
29
32
|
@folder, @separator = folder, separator
|
30
33
|
@table_prefix, @connection = table_prefix, connection
|
31
34
|
@default_ext = 'csv'
|
32
35
|
@inspect_rows = 10
|
36
|
+
@use_local = false # with MySQL INFILE
|
33
37
|
@log = true
|
34
38
|
yield(self) if block_given?
|
35
39
|
@logfile = File.expand_path(File.join(@folder, 'data_loader.textile'))
|
36
40
|
puts @logfile
|
37
41
|
end
|
38
42
|
|
39
|
-
|
43
|
+
# load
|
44
|
+
# - filename - name of the file to load (resolved relative to folder; default_ext is appended when no extension is given)
|
45
|
+
# - table - table to load file into (with table_prefix), derives from filename by default
|
46
|
+
# - hints - hash of column name => data type (one of :text, :string, :datetime, :integer)
|
47
|
+
def load(filename, table = nil, hints = {})
|
40
48
|
filename = [filename, default_ext].join('.') if File.extname(filename).empty?
|
41
49
|
full_file = File.expand_path(File.join(@folder, filename))
|
42
50
|
table = Migrator.derive_table_name(filename) if table.nil?
|
43
51
|
table = [@table_prefix, table].join('_') unless @table_prefix.blank?
|
44
|
-
columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
|
52
|
+
columns = Inspector.inspect_file(full_file, @separator, @inspect_rows, hints)
|
53
|
+
row_sep = Inspector.row_sep
|
45
54
|
log_columns(table, columns)
|
46
|
-
Migrator.migrate(full_file, columns, table, @separator, @connection)
|
55
|
+
Migrator.migrate(full_file, columns, table, @separator, @connection, @use_local, row_sep)
|
47
56
|
table
|
48
57
|
end
|
49
|
-
|
58
|
+
|
50
59
|
def log(text)
|
51
60
|
return unless @log
|
52
|
-
|
61
|
+
|
53
62
|
File.open(@logfile, 'a') do |file|
|
54
63
|
file << text
|
55
64
|
end
|
56
65
|
end
|
57
|
-
|
66
|
+
|
58
67
|
def clear_log
|
59
68
|
FileUtils.remove(@logfile) if File.exist?(@logfile)
|
60
69
|
end
|
@@ -63,7 +72,7 @@ module DataLoader
|
|
63
72
|
|
64
73
|
def log_columns(table, columns)
|
65
74
|
return unless @log
|
66
|
-
|
75
|
+
|
67
76
|
File.open(@logfile, 'a') do |file|
|
68
77
|
file << "\ntable{width:80%}.\n|_\\2. #{table} |\n" # table header (textile)
|
69
78
|
columns.each_with_index do |column, index|
|
@@ -75,7 +84,7 @@ module DataLoader
|
|
75
84
|
end
|
76
85
|
end
|
77
86
|
end
|
78
|
-
|
87
|
+
|
79
88
|
end
|
80
89
|
end
|
81
90
|
|
data/lib/data_loader/migrator.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
module DataLoader
|
2
2
|
|
3
3
|
class Migrator
|
4
|
-
def self.migrate(file, columns, table, separator = ',', conn = :root)
|
4
|
+
def self.migrate(file, columns, table, separator = ',', conn = :root, local = false, row_sep = "\r\n")
|
5
5
|
with_connection(conn) do
|
6
6
|
create_schema(table, columns)
|
7
7
|
puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
|
8
|
-
load_data(file, table, separator)
|
8
|
+
load_data(file, table, local, separator, row_sep)
|
9
|
+
nullify_dates(table, columns)
|
9
10
|
end
|
10
11
|
end
|
11
12
|
|
@@ -20,12 +21,26 @@ module DataLoader
|
|
20
21
|
end
|
21
22
|
end
|
22
23
|
|
24
|
+
# empty strings import as 0000-00-00 00:00:00, convert to nil
|
25
|
+
def self.nullify_dates(table_name, data_struct)
|
26
|
+
date_columns = data_struct.map {|column| column[:name] if column[:type] == :datetime }.compact!
|
27
|
+
date_columns.each do |column|
|
28
|
+
sql = <<-SQL
|
29
|
+
UPDATE #{table_name}
|
30
|
+
SET #{column} = NULL
|
31
|
+
WHERE #{column} = 0
|
32
|
+
SQL
|
33
|
+
ActiveRecord::Base.connection.execute(sql)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
23
37
|
# uses MySQL LOAD DATA to import the whole file, ignoring the header line
|
24
|
-
def self.load_data(file, table_name, separator = ',')
|
38
|
+
def self.load_data(file, table_name, local, separator = ',', row_sep = "\r\n")
|
39
|
+
local_txt = local ? "LOCAL" : ''
|
25
40
|
sql = <<-SQL
|
26
|
-
LOAD DATA
|
41
|
+
LOAD DATA #{local_txt} INFILE '#{file}' INTO TABLE #{table_name.to_s}
|
27
42
|
FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
|
28
|
-
LINES TERMINATED BY '
|
43
|
+
LINES TERMINATED BY '#{row_sep}'
|
29
44
|
IGNORE 1 LINES;
|
30
45
|
SQL
|
31
46
|
ActiveRecord::Base.connection.execute(sql)
|
@@ -36,7 +51,7 @@ module DataLoader
|
|
36
51
|
if Rails.env.development?
|
37
52
|
yield
|
38
53
|
else
|
39
|
-
ActiveRecord::Base.establish_connection(conn)
|
54
|
+
ActiveRecord::Base.establish_connection(conn)
|
40
55
|
yield
|
41
56
|
ActiveRecord::Base.establish_connection(RAILS_ENV)
|
42
57
|
end
|
@@ -48,4 +63,4 @@ module DataLoader
|
|
48
63
|
name.underscore.sub(/[0-9_]*$/, '') # remove trailing numbers
|
49
64
|
end
|
50
65
|
end
|
51
|
-
end
|
66
|
+
end
|
data/lib/data_loader/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_loader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 4
|
10
|
+
version: 0.2.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Nathan Youngman
|
@@ -15,11 +15,10 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-10-22 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
|
-
|
21
|
+
type: :runtime
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
24
|
none: false
|
@@ -32,25 +31,26 @@ dependencies:
|
|
32
31
|
- 5
|
33
32
|
- 4
|
34
33
|
version: 1.5.4
|
35
|
-
type: :runtime
|
36
34
|
version_requirements: *id001
|
35
|
+
name: fastercsv
|
37
36
|
- !ruby/object:Gem::Dependency
|
38
|
-
|
37
|
+
type: :runtime
|
39
38
|
prerelease: false
|
40
39
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
40
|
none: false
|
42
41
|
requirements:
|
43
|
-
- -
|
42
|
+
- - ">="
|
44
43
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
44
|
+
hash: 15
|
46
45
|
segments:
|
47
46
|
- 2
|
48
|
-
-
|
49
|
-
|
50
|
-
|
47
|
+
- 0
|
48
|
+
- 0
|
49
|
+
version: 2.0.0
|
51
50
|
version_requirements: *id002
|
51
|
+
name: activerecord
|
52
52
|
- !ruby/object:Gem::Dependency
|
53
|
-
|
53
|
+
type: :development
|
54
54
|
prerelease: false
|
55
55
|
requirement: &id003 !ruby/object:Gem::Requirement
|
56
56
|
none: false
|
@@ -62,8 +62,24 @@ dependencies:
|
|
62
62
|
- 1
|
63
63
|
- 3
|
64
64
|
version: "1.3"
|
65
|
-
type: :development
|
66
65
|
version_requirements: *id003
|
66
|
+
name: rspec
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
type: :development
|
69
|
+
prerelease: false
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 63
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
- 9
|
79
|
+
- 2
|
80
|
+
version: 0.9.2
|
81
|
+
version_requirements: *id004
|
82
|
+
name: rake
|
67
83
|
description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
|
68
84
|
email:
|
69
85
|
- git@nathany.com
|
@@ -87,7 +103,6 @@ files:
|
|
87
103
|
- lib/data_loader/version.rb
|
88
104
|
- spec/lib/data_loader/inspector_spec.rb
|
89
105
|
- spec/spec_helper.rb
|
90
|
-
has_rdoc: true
|
91
106
|
homepage: https://github.com/nathany/data_loader
|
92
107
|
licenses: []
|
93
108
|
|
@@ -119,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
134
|
requirements: []
|
120
135
|
|
121
136
|
rubyforge_project:
|
122
|
-
rubygems_version: 1.
|
137
|
+
rubygems_version: 1.8.11
|
123
138
|
signing_key:
|
124
139
|
specification_version: 3
|
125
140
|
summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
|