data_loader 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -4,4 +4,5 @@ Gemfile.lock
4
4
  pkg/*
5
5
  .rvmrc
6
6
  bin/*
7
- *.tmproj
7
+ *.tmproj
8
+ *.bbprojectd
@@ -7,8 +7,9 @@ Features:
7
7
 
8
8
  * Uses MySQL LOAD DATA to efficiently load very large files
9
9
  * Fastercsv is used to inspect the first few rows and choose datatypes
10
+ * Datatypes can be overridden (types are :text, :string, :datetime, :integer)
10
11
  * Converts header row into nice ruby-esque column names
11
- * Builds a schema using ActiveRecord 2.x
12
+ * Builds a schema using ActiveRecord
12
13
  * If table names are unspecified, they will be derived from the file name
13
14
  * Will prefix table names to avoid collisions (it overwrites existing tables)
14
15
  * Can run under a different connection, as defined in your database.yml
@@ -24,20 +25,19 @@ Features:
24
25
  config.connection = :development
25
26
  config.separator = ','
26
27
  config.default_ext = 'csv'
28
+ config.use_local = true
27
29
  end
28
30
 
29
31
  # Load data
30
- loader.load 'my_csv_file', :my_table
32
+ loader.load 'my_csv_file', :my_table, :cancel_at => :datetime
31
33
 
32
34
 
33
35
  ### TODO
34
36
 
35
37
  * A task to clean up all these temporary tables when we're done.
36
38
 
37
- * Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
38
-
39
39
  * Broader support for Rubies, Databases, and ORM/tools for building the schema.
40
40
 
41
41
  * More options for the log file (txt vs textile, filename).
42
42
 
43
- * Better tests!
43
+ * Better tests!
@@ -12,10 +12,11 @@ Gem::Specification.new do |s|
12
12
  s.homepage = "https://github.com/nathany/data_loader"
13
13
  s.summary = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
14
14
  s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}
15
-
15
+
16
16
  s.add_dependency('fastercsv', '~> 1.5.4')
17
- s.add_dependency('activerecord', '~> 2.3')
17
+ s.add_dependency('activerecord', '>= 2.0.0')
18
18
  s.add_development_dependency('rspec', '~> 1.3')
19
+ s.add_development_dependency('rake', '~> 0.9.2')
19
20
 
20
21
  s.files = `git ls-files`.split("\n")
21
22
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -1,11 +1,20 @@
1
1
  require 'fastercsv'
2
2
  require 'active_support'
3
3
 
4
+ # FasterCSV will auto-detect the line separator, which we'd like to pass to MySQL
5
+ class FasterCSV
6
+ attr_reader :row_sep
7
+ end
8
+
4
9
  module DataLoader
5
10
 
6
11
  class Inspector
12
+ class << self
13
+ attr_reader :row_sep # set after inspect_file
14
+ end
15
+
7
16
  # read a csv and return the columns and types in an ordered array
8
- def self.inspect_file(file, separator = ',', inspect_rows = 10)
17
+ def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
9
18
  fields = nil
10
19
  FasterCSV.open(file,
11
20
  :col_sep => separator,
@@ -13,31 +22,64 @@ module DataLoader
13
22
  :headers => true,
14
23
  :header_converters => lambda {|h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
15
24
  :skip_blanks => true) do |csv|
16
- fields = scan_rows(csv, inspect_rows)
25
+ @row_sep = csv.row_sep
26
+ fields = scan_rows(csv, inspect_rows, hints)
17
27
  end
18
28
  fields
19
29
  end
20
30
 
21
31
  # scan a few rows to determine data types
22
- def self.scan_rows(csv, inspect_rows)
32
+ def self.scan_rows(csv, inspect_rows, hints = {})
23
33
  first_row = nil
24
- columns = {} # unordered hash containing date types for each header
34
+ columns = {} # unordered hash containing data types for each header
25
35
 
26
36
  1.upto(inspect_rows) do
27
- row = csv.gets
28
- break unless row
29
- row.each do |header, value|
30
- columns[header] = promote_type(columns[header], dbtype(value))
37
+ begin
38
+ row = csv.gets
39
+ break unless row
40
+ row.each do |header, value|
41
+ columns[header] = promote_type(columns[header], dbtype(value))
42
+ end
43
+ first_row ||= row # save for later
44
+ rescue FasterCSV::MalformedCSVError => boom
45
+ # Don't care about the error but let's retry, since fastercsv will skip this line
46
+ retry
31
47
  end
32
- first_row ||= row # save for later
33
48
  end
34
49
 
35
50
  # form an ordered array based on the first row read:
36
51
  fields = []
37
52
  first_row.each do |header, value|
38
- data_type = columns[header] || :string # default to :string if everything was nil
53
+ data_type = columns[header]
39
54
  fields << {:name => header, :type => data_type}
40
55
  end
56
+
57
+ # validate hints
58
+ hints.stringify_keys!
59
+ invalid_columns = hints.keys - fields.map {|f| f[:name]}
60
+ puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
61
+ invalid_types = hints.values - [:text, :string, :datetime, :integer]
62
+ abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?
63
+
64
+ fields.each do |field|
65
+ name, field_type = field[:name], field[:type]
66
+ # override columns with hints
67
+ if hints.has_key?(name)
68
+ hint_type = hints[name].to_sym
69
+ if field_type.nil?
70
+ puts "Note: undertermined type for #{name} hinted as #{hint_type}."
71
+ elsif hint_type != field_type
72
+ puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
73
+ end
74
+ field[:type] = hint_type
75
+ end
76
+ # default to :string if everything was nil (and no hint)
77
+ if field[:type].nil?
78
+ puts "Warning: type could not be determined for #{name}, defaulting to string."
79
+ field[:type] = :string
80
+ end
81
+ end
82
+
41
83
  fields
42
84
  end
43
85
 
@@ -79,4 +121,4 @@ module DataLoader
79
121
  end
80
122
  end
81
123
 
82
- end
124
+ end
@@ -13,6 +13,9 @@
13
13
  # how many rows to scan the CSV file to determine the data types
14
14
  # connection
15
15
  # a connection name from database.yml to run it under (e.g. :production)
16
+ # use_local
17
+ # when true, use LOAD DATA LOCAL INFILE with MySQL if server can't access file
18
+ # requires MySQL to be compiled with --enable-local-infile
16
19
  # default_ext
17
20
  # extension to append if no file extension is specified
18
21
  # separator
@@ -23,38 +26,44 @@
23
26
  module DataLoader
24
27
 
25
28
  class Loader
26
- attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log
29
+ attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator, :log, :use_local
27
30
 
28
31
  def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
29
32
  @folder, @separator = folder, separator
30
33
  @table_prefix, @connection = table_prefix, connection
31
34
  @default_ext = 'csv'
32
35
  @inspect_rows = 10
36
+ @use_local = false # with MySQL INFILE
33
37
  @log = true
34
38
  yield(self) if block_given?
35
39
  @logfile = File.expand_path(File.join(@folder, 'data_loader.textile'))
36
40
  puts @logfile
37
41
  end
38
42
 
39
- def load(filename, table = nil)
43
+ # load
44
+ # - filename - name of file to load (looked up in folder; default_ext is appended if it has no extension)
45
+ # - table - table to load file into (table_prefix is prepended), derived from the filename by default
46
+ # - hints - hash of column name => data type (one of :text, :string, :datetime, :integer)
47
+ def load(filename, table = nil, hints = {})
40
48
  filename = [filename, default_ext].join('.') if File.extname(filename).empty?
41
49
  full_file = File.expand_path(File.join(@folder, filename))
42
50
  table = Migrator.derive_table_name(filename) if table.nil?
43
51
  table = [@table_prefix, table].join('_') unless @table_prefix.blank?
44
- columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
52
+ columns = Inspector.inspect_file(full_file, @separator, @inspect_rows, hints)
53
+ row_sep = Inspector.row_sep
45
54
  log_columns(table, columns)
46
- Migrator.migrate(full_file, columns, table, @separator, @connection)
55
+ Migrator.migrate(full_file, columns, table, @separator, @connection, @use_local, row_sep)
47
56
  table
48
57
  end
49
-
58
+
50
59
  def log(text)
51
60
  return unless @log
52
-
61
+
53
62
  File.open(@logfile, 'a') do |file|
54
63
  file << text
55
64
  end
56
65
  end
57
-
66
+
58
67
  def clear_log
59
68
  FileUtils.remove(@logfile) if File.exist?(@logfile)
60
69
  end
@@ -63,7 +72,7 @@ module DataLoader
63
72
 
64
73
  def log_columns(table, columns)
65
74
  return unless @log
66
-
75
+
67
76
  File.open(@logfile, 'a') do |file|
68
77
  file << "\ntable{width:80%}.\n|_\\2. #{table} |\n" # table header (textile)
69
78
  columns.each_with_index do |column, index|
@@ -75,7 +84,7 @@ module DataLoader
75
84
  end
76
85
  end
77
86
  end
78
-
87
+
79
88
  end
80
89
  end
81
90
 
@@ -1,11 +1,12 @@
1
1
  module DataLoader
2
2
 
3
3
  class Migrator
4
- def self.migrate(file, columns, table, separator = ',', conn = :root)
4
+ def self.migrate(file, columns, table, separator = ',', conn = :root, local = false, row_sep = "\r\n")
5
5
  with_connection(conn) do
6
6
  create_schema(table, columns)
7
7
  puts "-- load_data('#{File.basename(file)}', :#{table.to_s})"
8
- load_data(file, table, separator)
8
+ load_data(file, table, local, separator, row_sep)
9
+ nullify_dates(table, columns)
9
10
  end
10
11
  end
11
12
 
@@ -20,12 +21,26 @@ module DataLoader
20
21
  end
21
22
  end
22
23
 
24
+ # MySQL imports empty strings into datetime columns as 0000-00-00 00:00:00; convert those values to NULL
25
+ def self.nullify_dates(table_name, data_struct)
26
+ date_columns = data_struct.map {|column| column[:name] if column[:type] == :datetime }.compact!
27
+ date_columns.each do |column|
28
+ sql = <<-SQL
29
+ UPDATE #{table_name}
30
+ SET #{column} = NULL
31
+ WHERE #{column} = 0
32
+ SQL
33
+ ActiveRecord::Base.connection.execute(sql)
34
+ end
35
+ end
36
+
23
37
  # uses MySQL LOAD DATA to import the whole file, ignoring the header line
24
- def self.load_data(file, table_name, separator = ',')
38
+ def self.load_data(file, table_name, local, separator = ',', row_sep = "\r\n")
39
+ local_txt = local ? "LOCAL" : ''
25
40
  sql = <<-SQL
26
- LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name.to_s}
41
+ LOAD DATA #{local_txt} INFILE '#{file}' INTO TABLE #{table_name.to_s}
27
42
  FIELDS TERMINATED BY '#{separator}' ENCLOSED BY '"'
28
- LINES TERMINATED BY '\r\n'
43
+ LINES TERMINATED BY '#{row_sep}'
29
44
  IGNORE 1 LINES;
30
45
  SQL
31
46
  ActiveRecord::Base.connection.execute(sql)
@@ -36,7 +51,7 @@ module DataLoader
36
51
  if Rails.env.development?
37
52
  yield
38
53
  else
39
- ActiveRecord::Base.establish_connection(conn)
54
+ ActiveRecord::Base.establish_connection(conn)
40
55
  yield
41
56
  ActiveRecord::Base.establish_connection(RAILS_ENV)
42
57
  end
@@ -48,4 +63,4 @@ module DataLoader
48
63
  name.underscore.sub(/[0-9_]*$/, '') # remove trailing numbers
49
64
  end
50
65
  end
51
- end
66
+ end
@@ -1,3 +1,3 @@
1
1
  module DataLoader
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_loader
3
3
  version: !ruby/object:Gem::Version
4
- hash: 23
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 0
10
- version: 0.2.0
9
+ - 4
10
+ version: 0.2.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Nathan Youngman
@@ -15,11 +15,10 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-04-05 00:00:00 -06:00
19
- default_executable:
18
+ date: 2011-10-22 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
- name: fastercsv
21
+ type: :runtime
23
22
  prerelease: false
24
23
  requirement: &id001 !ruby/object:Gem::Requirement
25
24
  none: false
@@ -32,25 +31,26 @@ dependencies:
32
31
  - 5
33
32
  - 4
34
33
  version: 1.5.4
35
- type: :runtime
36
34
  version_requirements: *id001
35
+ name: fastercsv
37
36
  - !ruby/object:Gem::Dependency
38
- name: activerecord
37
+ type: :runtime
39
38
  prerelease: false
40
39
  requirement: &id002 !ruby/object:Gem::Requirement
41
40
  none: false
42
41
  requirements:
43
- - - ~>
42
+ - - ">="
44
43
  - !ruby/object:Gem::Version
45
- hash: 5
44
+ hash: 15
46
45
  segments:
47
46
  - 2
48
- - 3
49
- version: "2.3"
50
- type: :runtime
47
+ - 0
48
+ - 0
49
+ version: 2.0.0
51
50
  version_requirements: *id002
51
+ name: activerecord
52
52
  - !ruby/object:Gem::Dependency
53
- name: rspec
53
+ type: :development
54
54
  prerelease: false
55
55
  requirement: &id003 !ruby/object:Gem::Requirement
56
56
  none: false
@@ -62,8 +62,24 @@ dependencies:
62
62
  - 1
63
63
  - 3
64
64
  version: "1.3"
65
- type: :development
66
65
  version_requirements: *id003
66
+ name: rspec
67
+ - !ruby/object:Gem::Dependency
68
+ type: :development
69
+ prerelease: false
70
+ requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ hash: 63
76
+ segments:
77
+ - 0
78
+ - 9
79
+ - 2
80
+ version: 0.9.2
81
+ version_requirements: *id004
82
+ name: rake
67
83
  description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
68
84
  email:
69
85
  - git@nathany.com
@@ -87,7 +103,6 @@ files:
87
103
  - lib/data_loader/version.rb
88
104
  - spec/lib/data_loader/inspector_spec.rb
89
105
  - spec/spec_helper.rb
90
- has_rdoc: true
91
106
  homepage: https://github.com/nathany/data_loader
92
107
  licenses: []
93
108
 
@@ -119,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
134
  requirements: []
120
135
 
121
136
  rubyforge_project:
122
- rubygems_version: 1.6.2
137
+ rubygems_version: 1.8.11
123
138
  signing_key:
124
139
  specification_version: 3
125
140
  summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.