data_loader 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .rvmrc
6
+ bin/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in data_loader.gemspec
gemspec
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2011 Nathan Youngman
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,42 @@
1
+ ## Data Loader
2
+
3
+ Data Loader is a tool to load CSV files into a MySQL database. It was designed
4
+ to import raw data into tables that could then be manipulated with SQL.
5
+
6
+ Features:
7
+
8
+ * Uses MySQL LOAD DATA to efficiently load very large files
9
+ * FasterCSV is used to inspect the first few rows and choose data types
10
+ * Converts the header row into nice Ruby-esque column names
11
+ * Builds a schema using ActiveRecord 2.x
12
+ * If table names are unspecified, they will be derived from the file name
13
+ * Will prefix table names to avoid collisions (it overwrites existing tables)
14
+ * Can run under a different connection, as defined in your database.yml
15
+
16
+ ### Usage
17
+
18
+ # Configure (everything has defaults, see loader.rb)
19
+ loader = DataLoader::Loader.new do |config|
20
+ config.table_prefix = :import
21
+ config.folder = 'path/to/csv/files/'
22
+ config.inspect_rows = 10
23
+ config.connection = :development
24
+ config.separator = ','
25
+ config.default_ext = 'csv'
26
+ end
27
+
28
+ # Load data
29
+ loader.load 'my_csv_file', :my_table
30
+
31
+
32
+ ### TODO
33
+
34
+ * Write the column structure for each table to a text file. This file can be stored in Git, so that if the CSV files change, the diff will make it obvious what changed.
35
+
36
+ * A task to clean up all these temporary tables when we're done.
37
+
38
+ * Post-data load step in Migrator to NULLify 0000-00-00 dates, which is how MySQL reads empty strings in (integers would remain 0).
39
+
40
+ * Broader support for Rubies, Databases, and ORM/tools for building the schema.
41
+
42
+ * Better tests!
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks # registers Bundler's gem packaging rake tasks (presumably build/install/release -- see Bundler docs)
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "data_loader/version"
4
+
5
# Gem specification for data_loader. The version string is read from
# DataLoader::VERSION (lib/data_loader/version.rb).
Gem::Specification.new do |s|
  s.name        = "data_loader"
  s.version     = DataLoader::VERSION
  s.platform    = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.8.7'
  s.authors     = ["Nathan Youngman"]
  s.email       = ["git@nathany.com"]
  s.homepage    = "https://github.com/nathany/data_loader"
  # The bundled LICENSE file carries the MIT license text; declare it so the
  # published gem metadata no longer reports an empty license list.
  s.license     = 'MIT'
  s.summary     = %q{Loads CSV data into MySQL, doing an initial scan to determine datatypes.}
  s.description = %q{Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.}

  s.add_dependency('fastercsv', '~> 1.5.4')
  s.add_dependency('activerecord', '~> 2.3')
  s.add_development_dependency('rspec', '~> 1.3')

  # Package exactly what is tracked by git.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,3 @@
1
+ require 'data_loader/inspector'
2
+ require 'data_loader/migrator'
3
+ require 'data_loader/loader' # <= start here
@@ -0,0 +1,82 @@
1
+ require 'fastercsv'
2
+ require 'active_support'
3
+
4
module DataLoader

  # Infers a table layout from a CSV file by sampling its first data rows.
  class Inspector
    # Column types that promote_type knows how to reconcile.
    KNOWN_TYPES = [:text, :string, :datetime, :integer].freeze

    # Longest value that still maps to :string; anything longer becomes :text.
    MAX_STRING_LENGTH = 255

    # Reads a CSV file and returns its columns and types in header order.
    #
    # file         - path of the CSV file; the first line is treated as headers
    # separator    - column separator passed to FasterCSV
    # inspect_rows - maximum number of data rows to sample
    #
    # Returns an Array of {:name => header, :type => type} hashes
    # (empty when the file has no data rows).
    def self.inspect_file(file, separator = ',', inspect_rows = 10)
      fields = nil
      FasterCSV.open(file,
                     :col_sep => separator,
                     :converters => [:date_time, :integer], # only these two are enabled (others available: :float, :date)
                     :headers => true,
                     # underscore the header, turn anything outside a-z0-9_ into a
                     # separator, and collapse runs of underscores
                     :header_converters => lambda { |h| h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_') },
                     :skip_blanks => true) do |csv|
        fields = scan_rows(csv, inspect_rows)
      end
      fields
    end

    # Scans up to inspect_rows rows to determine the data type of each column.
    #
    # csv          - object whose #gets returns the next row (or nil at the end);
    #                each row must yield [header, value] pairs from #each
    # inspect_rows - maximum number of rows to read
    #
    # Returns an ordered Array of {:name => header, :type => type} hashes, or an
    # empty Array when there are no rows at all.
    def self.scan_rows(csv, inspect_rows)
      first_row = nil
      columns = {} # unordered hash containing data types for each header

      1.upto(inspect_rows) do
        row = csv.gets
        break unless row
        row.each do |header, value|
          columns[header] = promote_type(columns[header], dbtype(value))
        end
        first_row ||= row # remembered to recover the column order below
      end

      # A header-only (or empty) file yields no rows, so there is nothing to order.
      return [] unless first_row

      # form an ordered array based on the first row read:
      fields = []
      first_row.each do |header, _value|
        data_type = columns[header] || :string # default to :string if everything was nil
        fields << {:name => header, :type => data_type}
      end
      fields
    end

    # Determines which column type is most suitable for a single value.
    #
    # Returns :integer, :datetime, :string, :text, or nil for nil/blank values.
    # Raises RuntimeError for values of any other class.
    def self.dbtype(value)
      case value
      when nil
        nil
      when Integer # covers Fixnum and Bignum on old Rubies, plain Integer on new ones
        :integer
      when DateTime
        :datetime
      when String
        if value.strip.empty? # whitespace-only counts as "no data"
          nil
        elsif value.length <= MAX_STRING_LENGTH
          :string
        else
          :text
        end
      else
        raise "Unknown type: #{value.class}"
      end
    end

    # Given any number of data types, chooses one that fits them all.
    # nils are ignored; returns nil if nothing is left.
    # Raises RuntimeError if a type outside KNOWN_TYPES is passed.
    def self.promote_type(*types)
      candidates = types.compact.uniq
      unknown = candidates - KNOWN_TYPES
      raise "Unknown type: #{unknown.inspect}" unless unknown.empty?

      if candidates.length <= 1
        candidates.first # a single agreed type, or nil when there was none
      elsif candidates.include?(:text)
        :text
      else
        :string # mixed types that all fit in a short string
      end
    end
  end

end
@@ -0,0 +1,45 @@
1
+ # DataLoader::Loader
2
+ #
3
+ # Loads CSV files into MySQL
4
+ #
5
+ # Config:
6
+ #
7
+ # folder
8
+ # base folder for calls to load()
9
+ # table_prefix
10
+ # prefix for derived table names
11
+ # very important because an existing table will be overwritten!!!
12
+ # inspect_rows
13
+ # how many rows to scan the CSV file to determine the data types
14
+ # connection
15
+ # a connection name from database.yml to run it under (e.g. :production)
16
+ # default_ext
17
+ # extension to append if no file extension is specified
18
+ # separator
19
+ # column separator used in the CSV files (defaults to a comma)
20
+
21
+ module DataLoader
22
+
23
+ class Loader
24
+ attr_accessor :folder, :table_prefix, :default_ext, :inspect_rows, :connection, :separator
25
+
26
+ def initialize(folder = '', separator = ',', table_prefix = 'load', connection = :root)
27
+ @folder, @separator = folder, separator
28
+ @table_prefix, @connection = table_prefix, connection
29
+ @default_ext = 'csv'
30
+ @inspect_rows = 10
31
+ yield(self) if block_given?
32
+ end
33
+
34
+ def load(filename, table = nil)
35
+ filename = [filename, default_ext].join('.') if File.extname(filename).empty?
36
+ full_file = File.expand_path(File.join(@folder, filename))
37
+ table = Migrator.derive_table_name(filename) if table.nil?
38
+ table = [@table_prefix, table].join('_') unless @table_prefix.blank?
39
+ columns = Inspector.inspect_file(full_file, @separator, @inspect_rows)
40
+ Migrator.migrate(full_file, columns, table, @separator, @connection)
41
+ table
42
+ end
43
+ end
44
+
45
+ end
@@ -0,0 +1,51 @@
1
module DataLoader

  # Creates the destination table and bulk-loads a CSV file into it.
  class Migrator
    # Creates (or replaces) the table and loads the file into it, all under the
    # connection selected by with_connection.
    #
    # file      - full path of the CSV file
    # columns   - Array of {:name => ..., :type => ...} hashes
    # table     - name of the table to (re)create
    # separator - column separator of the CSV file
    # conn      - connection name used by with_connection
    def self.migrate(file, columns, table, separator = ',', conn = :root)
      with_connection(conn) do
        create_schema(table, columns)
        puts "-- load_data('#{File.basename(file)}', :#{table})"
        load_data(file, table, separator)
      end
    end

    # takes a column,type data structure and makes a table
    # (:force => true drops an existing table of the same name; no id column is added)
    def self.create_schema(table_name, data_struct)
      ActiveRecord::Schema.define do
        create_table table_name, :force => true, :id => false do |t|
          data_struct.each do |column|
            t.column(column[:name], column[:type])
          end
        end
      end
    end

    # uses MySQL LOAD DATA to import the whole file, ignoring the header line
    #
    # The path, table name and separator are quoted by the connection so that
    # apostrophes or backslashes in them cannot break (or alter) the statement.
    # Lines are expected to end in CR LF.
    def self.load_data(file, table_name, separator = ',')
      connection = ActiveRecord::Base.connection
      sql = <<-SQL
        LOAD DATA LOCAL INFILE #{connection.quote(file.to_s)} INTO TABLE #{connection.quote_table_name(table_name.to_s)}
          FIELDS TERMINATED BY #{connection.quote(separator.to_s)} ENCLOSED BY '"'
          LINES TERMINATED BY '\r\n'
          IGNORE 1 LINES;
      SQL
      connection.execute(sql)
    end

    # runs a block under a different connection from database.yml
    #
    # In the development environment the block simply runs on the current
    # connection. Otherwise the connection named conn is established for the
    # duration of the block and the connection for the current Rails environment
    # is re-established afterwards -- even when the block raises.
    def self.with_connection(conn = :root)
      return yield if Rails.env.development?

      ActiveRecord::Base.establish_connection(conn)
      begin
        yield
      ensure
        ActiveRecord::Base.establish_connection(Rails.env)
      end
    end

    # a pretty table name: the underscored file name without its extension and
    # without trailing digits/underscores
    def self.derive_table_name(file)
      name = File.basename(file, File.extname(file)).underscore # just file
      stripped = name.sub(/[0-9_]*$/, '') # remove trailing numbers
      # an all-numeric file name would otherwise produce an empty table name
      stripped.empty? ? name : stripped
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module DataLoader
2
+ VERSION = "0.1.0" # gem version string; data_loader.gemspec assigns it to s.version
3
+ end
@@ -0,0 +1,82 @@
1
+ require 'spec_helper'
2
+
3
# Examples for Inspector.dbtype: mapping a single parsed CSV value to a column type.
describe DataLoader::Inspector, "data typer" do
  before(:each) do
    @inspector = DataLoader::Inspector
  end

  it "should use :integer for Fixnum" do
    @inspector.dbtype(3).should == :integer
  end

  it "should use :datetime for DateTime" do
    @inspector.dbtype(DateTime.new).should == :datetime
  end

  it "should use :string for String of up to 255 characters" do
    @inspector.dbtype('s' * 255).should == :string
    @inspector.dbtype('s').should == :string
  end

  it "should use :text for larger Strings" do
    @inspector.dbtype('s' * 256).should == :text
    @inspector.dbtype('s' * 4096).should == :text
  end

  it "should use nil for nil" do
    @inspector.dbtype(nil).should be_nil
  end

  it "should use nil for empty strings" do
    @inspector.dbtype('').should be_nil
  end

  it "should use nil for empty looking strings" do
    @inspector.dbtype(' ').should be_nil
  end

  it "should raise an error for unknown types" do
    # pass an actual Float value (not the Float class) -- floats are not a supported type
    lambda { @inspector.dbtype(3.14) }.should raise_exception
  end
end
42
+
43
+
44
# Examples for Inspector.promote_type: reconciling the types seen in one column.
describe DataLoader::Inspector, "promoter" do
  before(:each) do
    @inspector = DataLoader::Inspector
  end

  it "should choose :text over :string, :datetime, or :integer" do
    [:string, :datetime, :integer].each do |t|
      @inspector.promote_type(:text, t).should == :text
    end
  end

  it "should choose :string over :datetime or :integer" do
    @inspector.promote_type(:string, :integer).should == :string
    @inspector.promote_type(:string, :datetime).should == :string
  end

  it "should choose :string for :datetime and :integer" do
    @inspector.promote_type(:datetime, :integer).should == :string
  end

  it "should keep the type if both are the same" do
    [:text, :string, :datetime, :integer].each do |t|
      @inspector.promote_type(t, t).should == t
    end
  end

  it "should ignore nils" do
    @inspector.promote_type(:integer, nil).should == :integer
  end

  it "should return nil if everything is nil" do
    @inspector.promote_type(nil).should be_nil
    @inspector.promote_type(nil, nil).should be_nil
  end

  it "should raise an error for unknown types" do
    lambda { @inspector.promote_type(:string, :blarg) }.should raise_exception
  end
end
@@ -0,0 +1 @@
1
# Load the code under test by absolute path: Ruby 1.9.2+ no longer has "." in
# $LOAD_PATH, so a require relative to the working directory would fail there.
require File.expand_path('../../lib/data_loader/inspector', __FILE__)
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_loader
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Nathan Youngman
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-03-25 00:00:00 -06:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: fastercsv
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 11
30
+ segments:
31
+ - 1
32
+ - 5
33
+ - 4
34
+ version: 1.5.4
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: activerecord
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ hash: 5
46
+ segments:
47
+ - 2
48
+ - 3
49
+ version: "2.3"
50
+ type: :runtime
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: rspec
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 9
61
+ segments:
62
+ - 1
63
+ - 3
64
+ version: "1.3"
65
+ type: :development
66
+ version_requirements: *id003
67
+ description: Uses fastercsv to scan a few lines of a CSV and create a schema with ActiveRecord. It does the actual file load with MySQL LOAD DATA.
68
+ email:
69
+ - git@nathany.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files: []
75
+
76
+ files:
77
+ - .gitignore
78
+ - Gemfile
79
+ - LICENSE
80
+ - README.markdown
81
+ - Rakefile
82
+ - data_loader.gemspec
83
+ - lib/data_loader.rb
84
+ - lib/data_loader/inspector.rb
85
+ - lib/data_loader/loader.rb
86
+ - lib/data_loader/migrator.rb
87
+ - lib/data_loader/version.rb
88
+ - spec/lib/data_loader/inspector_spec.rb
89
+ - spec/spec_helper.rb
90
+ has_rdoc: true
91
+ homepage: https://github.com/nathany/data_loader
92
+ licenses: []
93
+
94
+ post_install_message:
95
+ rdoc_options: []
96
+
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ hash: 57
105
+ segments:
106
+ - 1
107
+ - 8
108
+ - 7
109
+ version: 1.8.7
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ hash: 3
116
+ segments:
117
+ - 0
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project:
122
+ rubygems_version: 1.6.2
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: Loads CSV data into MySQL, doing an initial scan to determine datatypes.
126
+ test_files:
127
+ - spec/lib/data_loader/inspector_spec.rb
128
+ - spec/spec_helper.rb