data_loader 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -4,4 +4,5 @@ Gemfile.lock
4
4
  pkg/*
5
5
  .rvmrc
6
6
  bin/*
7
- *.tmproj
7
+ *.tmproj
8
+ *.bbprojectd
@@ -7,8 +7,9 @@ Features:
7
7
 
8
8
  * Uses MySQL LOAD DATA to efficiently load very large files
9
9
  * Fastercsv is used to inspect the first few rows and choose datatypes
10
+ * Datatypes can be overridden (types are :text, :string, :datetime, :integer)
10
11
  * Converts header row into nice ruby-esque column names
11
- * Builds a schema using ActiveRecord 2.x
12
+ * Builds a schema using ActiveRecord
12
13
  * If table names are unspecified, they will be derived from the file name
13
14
  * Will prefix table names to avoid collisions (it overwrites existing tables)
14
15
  * Can run under a different connection, as defined in your database.yml
@@ -24,20 +25,19 @@ Features:
24
25
  config.connection = :development
25
26
  config.separator = ','
26
27
  config.default_ext = 'csv'
28
+ config.use_local = true
27
29
  end
28
30
 
29
31
  # Load data
30
- loader.load 'my_csv_file', :my_table
32
+ loader.load 'my_csv_file', :my_table, :cancel_at => :datetime
31
33
 
32
34
 
33
35
  ### TODO
34
36
 
35
37
  * A task to clean up all these temporary tables when we're done.
36
38
 
37
- * Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
38
-
39
39
  * Broader support for Rubies, Databases, and ORM/tools for building the schema.
40
40
 
41
41
  * More options for the log file (txt vs textile, filename).
42
42
 
43
- * Better tests!
43
+ * Better tests!
@@ -12,10 +12,11 @@ Gem::Specification.new do |s|
12
12
  s.homepage = "https://github.com/nathany/data_loader"
13
13
  s.summary = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
14
14
  s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
15
-
15
+
16
16
  s.add_dependency('fastercsv', '~> 1.5.4')
17
- s.add_dependency('activerecord', '~> 2.3')
17
+ s.add_dependency('activerecord', '>= 2.0.0')
18
18
  s.add_development_dependency('rspec', '~> 1.3')
19
+ s.add_development_dependency('rake', '~> 0.9.2')
19
20
 
20
21
  s.files = `git ls-files`.split("\n")
21
22
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -1,11 +1,20 @@
1
1
  require 'fastercsv'
2
2
  require 'active_support'
3
3
 
4
+ # FasterCSV will auto-detect the line separator, which we'd like to pass to MySQL
5
+ class FasterCSV
6
+ attr_reader :row_sep
7
+ end
8
+
4
9
  module DataLoader
5
10
 
6
11
  class Inspector
12
+ class << self
13
+ attr_reader :row_sep # set after inspect_file
14
+ end
15
+
7
16
  # read a csv and return the columns and types in an ordered array
8
- def self.inspect_file(file, separator = ',', inspect_rows = 10)
17
+ def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
9
18
  fields = nil
10
19
  FasterCSV.open(file,
11
20
  :col_sep => separator,
@@ -13,31 +22,64 @@ module DataLoader
13
22
  :headers => true,
14
23
  :header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
15
24
  :skip_blanks => true) do |csv|
16
- fields = scan_rows(csv, inspect_rows)
25
+ @row_sep = csv.row_sep
26
+ fields = scan_rows(csv, inspect_rows, hints)
17
27
  end
18
28
  fields
19
29
  end
20
30
 
21
31
  # scan a few rows to determine data types
22
- def self.scan_rows(csv, inspect_rows)
32
+ def self.scan_rows(csv, inspect_rows, hints = {})
23
33
  first_row = nil
24
- columns = {} # unordered hash containing date types for each header
34
+ columns = {} # unordered hash containing data types for each header
25
35
 
26
36
  1.upto(inspect_rows) do
27
- row = csv.gets
28
- break unless row
29
- row.each do |header, value|
30
- columns[header] = promote_type(columns[header], dbtype(value))
37
+ begin
38
+ row = csv.gets
39
+ break unless row
40
+ row.each do |header, value|
41
+ columns[header] = promote_type(columns[header], dbtype(value))
42
+ end
43
+ first_row ||= row # save for later
44
+ rescue FasterCSV::MalformedCSVError => boom
45
+ # Don't care about the error but let's retry, since fastercsv will skip this line
46
+ retry
31
47
  end
32
- first_row ||= row # save for later
33
48
  end
34
49
 
35
50
  # form an ordered array based on the first row read:
36
51
  fields = []
37
52
  first_row.each do |header, value|
38
- data_type = columns[header] || :string # default to :string if everything was nil
53
+ data_type = columns[header]
39
54
  fields << {:name => header, :type => data_type}
40
55
  end
56
+
57
+ # validate hints
58
+ hints.stringify_keys!
59
+ invalid_columns = hints.keys - fields.map {|f| f[:name]}
60
+ puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
61
+ invalid_types = hints.values - [:text, :string, :datetime, :integer]
62
+ abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?
63
+
64
+ fields.each do |field|
65
+ name, field_type = field[:name], field[:type]
66
+ # override columns with hints
67
+ if hints.has_key?(name)
68
+ hint_type = hints[name].to_sym
69
+ if field_type.nil?
70
+ puts "Note: undertermined type for #{name} hinted as #{hint_type}."
71
+ elsif hint_type != field_type
72
+ puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
73
+ end
74
+ field[:type] = hint_type
75
+ end
76
+ # default to :string if everything was nil (and no hint)
77
+ if field[:type].nil?
78
+ puts "Warning: type could not be determined for #{name}, defaulting to string."
79
+ field[:type] = :string
80
+ end
81
+ end
82
+
41
83
  fields
42
84
  end
43
85
 
@@ -79,4 +121,4 @@ module DataLoader
79
121
  end
80
122
  end
81
123
 
82
- end
124
+ end
@@ -13,6 +13,9 @@
13
13
  # how many rows to scan the CSV file to determine the data types
14
14
  # connection
15
15
  # a connection name from database.yml to run it under (e.g. :production)
16
+ # use_local
17
+ # when true, use LOAD DATA LOCAL INFILE with MySQL if server can't access file
18
+ # requires MySQL to be compiled with --enable-local-infile
16
19
  # default_ext
17
20
  # extension to append if no file extension is specified
18
21
  # separator
@@ -23,38 +26,44 @@
23
26
  module DataLoader
24
27
 
25
28
  class Loader
26
- attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log
29
+ attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log, :use_local
27
30
 
28
31
  def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
29
32
  @folder, @separator = folder, separator
30
33
  @table_prefix, @connection = table_prefix, connection
31
34
  @default_ext = 'csv'
32
35
  @inspect_rows = 10
36
+ @use_local = false # with MySQL INFILE
33
37
  @log = true
34
38
  yield(self) if block_given?
35
39
  @logfile = File.expand_path(File.join(@folder, 'data_loader.textile'))
36
40
  puts @logfile
37
41
  end
38
42
 
39
- def load(filename, table = nil)
43
+ # load
44
+ # - filename - name of file to load (in folder and default_ext)
45
+ # - table - table to load file into (with table_prefix), derives from filename by default
46
+ # - hints - hash of column name => data type (one of :text, :string, :datetime, :integer)
47
+ def load(filename, table = nil, hints = {})
40
48
  filename = [filename, default_ext].join('.') if File.extname(filename).empty?
41
49
  full_file = File.expand_path(File.join(@folder, filename))
42
50
  table = Migrator.derive_table_name(filename) if table.nil?
43
51
  table = [@table_prefix, table].join('_') unless @table_prefix.blank?
44
- columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
52
+ columns = Inspector.inspect_file(full_file, @separator, @inspect_rows, hints)
53
+ row_sep = Inspector.row_sep
45
54
  log_columns(table, columns)
46
- Migrator.migrate(full_file, columns, table, @separator, @connection)
55
+ Migrator.migrate(full_file, columns, table, @separator, @connection, @use_local, row_sep)
47
56
  table
48
57
  end
49
-
58
+
50
59
  def log(text)
51
60
  return unless @log
52
-
61
+
53
62
  File.open(@logfile, 'a') do |file|
54
63
  file << text
55
64
  end
56
65
  end
57
-
66
+
58
67
  def clear_log
59
68
  FileUtils.remove(@logfile) if File.exist?(@logfile)
60
69
  end
@@ -63,7 +72,7 @@ module DataLoader
63
72
 
64
73
  def log_columns(table, columns)
65
74
  return unless @log
66
-
75
+
67
76
  File.open(@logfile, 'a') do |file|
68
77
  file << "\ntable{width:80%}.\n|_\\2. #{table} |\n" # table header (textile)
69
78
  columns.each_with_index do |column, index|
@@ -75,7 +84,7 @@ module DataLoader
75
84
  end
76
85
  end
77
86
  end
78
-
87
+
79
88
  end
80
89
  end
81
90
 
@@ -1,11 +1,12 @@
1
1
  module DataLoader
2
2
 
3
3
  class Migrator
4
- def self.migrate(file, columns, table, separator = ',', conn = :root)
4
+ def self.migrate(file, columns, table, separator = ',', conn = :root, local = false, row_sep = "\r\n")
5
5
  with_connection(conn) do
6
6
  create_schema(table, columns)
7
7
  puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
8
- load_data(file, table, separator)
8
+ load_data(file, table, local, separator, row_sep)
9
+ nullify_dates(table, columns)
9
10
  end
10
11
  end
11
12
 
@@ -20,12 +21,26 @@ module DataLoader
20
21
  end
21
22
  end
22
23
 
24
+ # empty strings import as 0000-00-00 00:00:00, convert to nil
25
+ def self.nullify_dates(table_name, data_struct)
26
+ date_columns = data_struct.map {|column| column[:name] if column[:type] == :datetime }.compact!
27
+ date_columns.each do |column|
28
+ sql = <<-SQL
29
+ UPDATE #{table_name}
30
+ SET #{column} = NULL
31
+ WHERE #{column} = 0
32
+ SQL
33
+ ActiveRecord::Base.connection.execute(sql)
34
+ end
35
+ end
36
+
23
37
  # uses MySQL LOAD DATA to import the whole file, ignoring the header line
24
- def self.load_data(file, table_name, separator = ',')
38
+ def self.load_data(file, table_name, local, separator = ',', row_sep = "\r\n")
39
+ local_txt = local ? "LOCAL" : ''
25
40
  sql = <<-SQL
26
- LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name.to_s}
41
+ LOAD DATA #{local_txt} INFILE '#{file}' INTO TABLE #{table_name.to_s}
27
42
  FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
28
- LINES TERMINATED BY '\r\n'
43
+ LINES TERMINATED BY '#{row_sep}'
29
44
  IGNORE 1 LINES;
30
45
  SQL
31
46
  ActiveRecord::Base.connection.execute(sql)
@@ -36,7 +51,7 @@ module DataLoader
36
51
  if Rails.env.development?
37
52
  yield
38
53
  else
39
- ActiveRecord::Base.establish_connection(conn)
54
+ ActiveRecord::Base.establish_connection(conn)
40
55
  yield
41
56
  ActiveRecord::Base.establish_connection(RAILS_ENV)
42
57
  end
@@ -48,4 +63,4 @@ module DataLoader
48
63
  name.underscore.sub(/[0-9_]*$/, '') # remove trailing numbers
49
64
  end
50
65
  end
51
- end
66
+ end
@@ -1,3 +1,3 @@
1
1
  module DataLoader
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_loader
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 4
10
+ version: 0.2.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Nathan Youngman
@@ -15,11 +15,10 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-04-05 00:00:00 -06:00
19
- default_executable:
18
+ date: 2011-10-22 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
- name: fastercsv
21
+ type: :runtime
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
24
  none: false
@@ -32,25 +31,26 @@ dependencies:
32
31
  - 5
33
32
  - 4
34
33
  version: 1.5.4
35
- type: :runtime
36
34
  version_requirements: *id001
35
+ name: fastercsv
37
36
  - !ruby/object:Gem::Dependency
38
- name: activerecord
37
+ type: :runtime
39
38
  prerelease: false
40
39
  requirement: &id002 !ruby/object:Gem::Requirement
41
40
  none: false
42
41
  requirements:
43
- - - ~>
42
+ - - ">="
44
43
  - !ruby/object:Gem::Version
45
- hash: 5
44
+ hash: 15
46
45
  segments:
47
46
  - 2
48
- - 3
49
- version: "2.3"
50
- type: :runtime
47
+ - 0
48
+ - 0
49
+ version: 2.0.0
51
50
  version_requirements: *id002
51
+ name: activerecord
52
52
  - !ruby/object:Gem::Dependency
53
- name: rspec
53
+ type: :development
54
54
  prerelease: false
55
55
  requirement: &id003 !ruby/object:Gem::Requirement
56
56
  none: false
@@ -62,8 +62,24 @@ dependencies:
62
62
  - 1
63
63
  - 3
64
64
  version: "1.3"
65
- type: :development
66
65
  version_requirements: *id003
66
+ name: rspec
67
+ - !ruby/object:Gem::Dependency
68
+ type: :development
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ hash: 63
76
+ segments:
77
+ - 0
78
+ - 9
79
+ - 2
80
+ version: 0.9.2
81
+ version_requirements: *id004
82
+ name: rake
67
83
  description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
68
84
  email:
69
85
  - git@nathany.com
@@ -87,7 +103,6 @@ files:
87
103
  - lib/data_loader/version.rb
88
104
  - spec/lib/data_loader/inspector_spec.rb
89
105
  - spec/spec_helper.rb
90
- has_rdoc: true
91
106
  homepage: https://github.com/nathany/data_loader
92
107
  licenses: []
93
108
 
@@ -119,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
134
  requirements: []
120
135
 
121
136
  rubyforge_project:
122
- rubygems_version: 1.6.2
137
+ rubygems_version: 1.8.11
123
138
  signing_key:
124
139
  specification_version: 3
125
140
  summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.