csv_fast_importer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +36 -0
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +15 -0
  5. data/CONTRIBUTING.md +24 -0
  6. data/Gemfile +3 -0
  7. data/Gemfile.lock +128 -0
  8. data/LICENSE +21 -0
  9. data/README.md +186 -0
  10. data/Rakefile +44 -0
  11. data/benchmark/NPRI-SubsDisp-Normalized-Since1993.csv +10000 -0
  12. data/benchmark/README.md +140 -0
  13. data/benchmark/benchmark.rb +26 -0
  14. data/benchmark/results.png +0 -0
  15. data/benchmark/results.xlsx +0 -0
  16. data/benchmark/strategies.rb +115 -0
  17. data/benchmark/tools.rb +61 -0
  18. data/csv_fast_importer.gemspec +42 -0
  19. data/lib/csv_fast_importer.rb +12 -0
  20. data/lib/csv_fast_importer/configuration.rb +57 -0
  21. data/lib/csv_fast_importer/database/mysql.rb +28 -0
  22. data/lib/csv_fast_importer/database/postgres.rb +36 -0
  23. data/lib/csv_fast_importer/database/queryable.rb +51 -0
  24. data/lib/csv_fast_importer/database_connection.rb +19 -0
  25. data/lib/csv_fast_importer/database_factory.rb +19 -0
  26. data/lib/csv_fast_importer/import.rb +58 -0
  27. data/lib/csv_fast_importer/version.rb +3 -0
  28. data/sample-app/.gitignore +10 -0
  29. data/sample-app/Gemfile +50 -0
  30. data/sample-app/Gemfile.lock +172 -0
  31. data/sample-app/README.md +23 -0
  32. data/sample-app/Rakefile +6 -0
  33. data/sample-app/app/assets/images/.keep +0 -0
  34. data/sample-app/app/assets/javascripts/application.js +16 -0
  35. data/sample-app/app/assets/stylesheets/application.css +15 -0
  36. data/sample-app/app/controllers/application_controller.rb +5 -0
  37. data/sample-app/app/controllers/concerns/.keep +0 -0
  38. data/sample-app/app/helpers/application_helper.rb +2 -0
  39. data/sample-app/app/mailers/.keep +0 -0
  40. data/sample-app/app/models/.keep +0 -0
  41. data/sample-app/app/models/concerns/.keep +0 -0
  42. data/sample-app/app/models/knight.rb +2 -0
  43. data/sample-app/app/views/layouts/application.html.erb +14 -0
  44. data/sample-app/bin/bundle +3 -0
  45. data/sample-app/bin/rails +9 -0
  46. data/sample-app/bin/rake +9 -0
  47. data/sample-app/bin/setup +29 -0
  48. data/sample-app/bin/spring +17 -0
  49. data/sample-app/config.ru +4 -0
  50. data/sample-app/config/application.rb +26 -0
  51. data/sample-app/config/boot.rb +3 -0
  52. data/sample-app/config/database.yml +21 -0
  53. data/sample-app/config/environment.rb +5 -0
  54. data/sample-app/config/environments/development.rb +41 -0
  55. data/sample-app/config/environments/production.rb +79 -0
  56. data/sample-app/config/environments/test.rb +42 -0
  57. data/sample-app/config/initializers/assets.rb +11 -0
  58. data/sample-app/config/initializers/backtrace_silencers.rb +7 -0
  59. data/sample-app/config/initializers/cookies_serializer.rb +3 -0
  60. data/sample-app/config/initializers/filter_parameter_logging.rb +4 -0
  61. data/sample-app/config/initializers/inflections.rb +16 -0
  62. data/sample-app/config/initializers/mime_types.rb +4 -0
  63. data/sample-app/config/initializers/session_store.rb +3 -0
  64. data/sample-app/config/initializers/wrap_parameters.rb +14 -0
  65. data/sample-app/config/locales/en.yml +23 -0
  66. data/sample-app/config/routes.rb +56 -0
  67. data/sample-app/config/secrets.yml +22 -0
  68. data/sample-app/db/development.sqlite3 +0 -0
  69. data/sample-app/db/migrate/20170818134706_create_knights.rb +8 -0
  70. data/sample-app/db/schema.rb +24 -0
  71. data/sample-app/db/seeds.rb +7 -0
  72. data/sample-app/knights.csv +3 -0
  73. data/sample-app/lib/assets/.keep +0 -0
  74. data/sample-app/lib/tasks/.keep +0 -0
  75. data/sample-app/lib/tasks/csv_fast_importer.rake +9 -0
  76. data/sample-app/log/.keep +0 -0
  77. data/sample-app/public/404.html +67 -0
  78. data/sample-app/public/422.html +67 -0
  79. data/sample-app/public/500.html +66 -0
  80. data/sample-app/public/favicon.ico +0 -0
  81. data/sample-app/public/robots.txt +5 -0
  82. data/sample-app/test/controllers/.keep +0 -0
  83. data/sample-app/test/fixtures/.keep +0 -0
  84. data/sample-app/test/fixtures/knights.yml +9 -0
  85. data/sample-app/test/helpers/.keep +0 -0
  86. data/sample-app/test/integration/.keep +0 -0
  87. data/sample-app/test/mailers/.keep +0 -0
  88. data/sample-app/test/models/.keep +0 -0
  89. data/sample-app/test/models/knight_test.rb +7 -0
  90. data/sample-app/test/test_helper.rb +10 -0
  91. metadata +331 -0
@@ -0,0 +1,140 @@
1
+ # Benchmark
2
+
3
+ ## Description
4
+
5
+ There are many ways to import CSV files in a database. Some are based on native ruby libraries, other on dedicated gems.
6
+ We have tried here to build a big picture of all the main strategies.
7
+
8
+ :point_right: If you think one is missing, do not hesitate to create an issue, or better, submit a pull request.
9
+
10
+ ## Modus operandi
11
+
12
+ For each identified strategy, a **10 000 lines** CSV file (created from the first lines of `NPRI-SubsDisp-Normalized-Since1993.csv`) is imported into a **PostgreSQL** database. A small file is used here because some strategies would have taken hours to import a file with millions of lines.
13
+
14
+ :information_source: `NPRI-SubsDisp-Normalized-Since1993.csv` was downloaded from [canadian open data](http://ouvert.canada.ca/data/fr/dataset).
15
+
16
+ :information_source: Duration measure includes file reading and database writing, after transaction commit.
17
+
18
+ ## Strategies
19
+
20
+ `Dataset` is an ActiveRecord model.
21
+
22
+ `file` is the file to import.
23
+
24
+ ### CSV.foreach + ActiveRecord .create
25
+
26
+ ```ruby
27
+ require 'csv'
28
+ Dataset.transaction do
29
+ CSV.foreach(file, headers: true) do |row|
30
+ Dataset.create!(row.to_hash)
31
+ end
32
+ end
33
+ ```
34
+
35
+ ### [SmarterCSV](https://github.com/tilo/smarter_csv) 1.1.4 + ActiveRecord .create
36
+
37
+ CSV file reading can be customized with chunk size (this may affect performance).
38
+
39
+ ```ruby
40
+ require 'smarter_csv'
41
+ Dataset.transaction do
42
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
43
+ Dataset.create! dataset_attributes
44
+ end
45
+ end
46
+ ```
47
+
48
+ ### [SmarterCSV](https://github.com/tilo/smarter_csv) 1.1.4 + [activerecord-import](https://github.com/zdennis/activerecord-import) 0.10.0
49
+
50
+ `activerecord-import` becomes efficient when importing multiple rows at the same time. But importing the whole CSV file at once is not a solution because of its memory footprint :boom:. So, we read the CSV file here in batches. This is done with `SmarterCSV`, which has a small effect on global performance (see results).
51
+
52
+ :information_source: Model validations are skipped here to improve performances even if no validation was defined.
53
+
54
+ ```ruby
55
+ require 'smarter_csv'
56
+ require 'activerecord-import/base'
57
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
58
+ datasets = dataset_attributes.map { |attributes| Dataset.new attributes }
59
+ Dataset.import dataset_attributes.first.keys, datasets, batch_size: 100, validate: false
60
+ end
61
+ ```
62
+
63
+ ### [SmarterCSV](https://github.com/tilo/smarter_csv) 1.1.4 + [bulk_insert](https://github.com/jamis/bulk_insert) 1.5.0
64
+
65
+ Same constraints as `activerecord-import`: batch processing improves performance.
66
+
67
+ ```ruby
68
+ require 'smarter_csv'
69
+ require 'bulk_insert'
70
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
71
+ Dataset.bulk_insert values: dataset_attributes
72
+ end
73
+ ```
74
+
75
+ ### CSV.foreach + [upsert](https://github.com/seamusabshere/upsert) 2.2.1
76
+
77
+ ```ruby
78
+ require 'csv'
79
+ require 'upsert'
80
+ Upsert.batch(Dataset.connection, Dataset.table_name) do |upsert|
81
+ CSV.foreach(file, headers: true) do |row|
82
+ upsert.row(row.to_hash)
83
+ end
84
+ end
85
+ ```
86
+
87
+ ### [CSVImporter](https://github.com/pcreux/csv-importer) 0.3.2
88
+
89
+ ```ruby
90
+ DatasetCSVImporter.new(path: file.path).run!
91
+ ```
92
+
93
+ ### [ActiveImporter](https://github.com/continuum/active_importer) 0.2.6
94
+
95
+ ```ruby
96
+ DatasetActiveImporter.import file.path
97
+ ```
98
+
99
+ ### [Ferry](https://github.com/cmu-is-projects/ferry) 2.0.0
100
+
101
+ :information_source: `Ferry` is more than just a gem which imports CSV files, but it can also be used to do that.
102
+
103
+ ```ruby
104
+ require 'ferry'
105
+ Ferry::Importer.new.import_csv "benchmark_env", "datasets", file.path
106
+ ```
107
+
108
+ ## Results
109
+
110
+ ![Benchmark](results.png?raw=true "Benchmark")
111
+
112
+ Produced on a MacBookPro (OSX 10.12.6, i5 2.4GHz, 8Go RAM, Flash drive), with local PostgreSQL **9.6.1.0** instance.
113
+
114
+ :information_source: Results variability across multiple executions is lower than 5%.
115
+
116
+ ## Explanations
117
+
118
+ First of all, CSV reading took approximately **400ms** with `CSV.foreach`, and **1000ms** with `SmarterCSV`.
119
+
120
+ We can also notice that all strategies based on Rails' `create!` are very slow. Indeed, this strategy executes each SQL `INSERT` in a dedicated statement, and the whole ActiveRecord process (validations, callbacks...) is also executed. This last point could be very useful in a Rails application, but it is the main drawback when you are looking for performance.
121
+
122
+ `upsert` could be more efficient with an id column in imported file (and a unique constraint in database schema), which is not the case here. To give some idea, duration would be divided by 2 with such additional column.
123
+
124
+ Finally, `CSVFastImport` executes one single statement (with the `COPY` command) which delegates the operation to the PostgreSQL instance. Then, the CSV file is directly read by the database engine without any constraints (SQL standards, communication protocol...). This is the fastest way to import data into a database :rocket:.
125
+
126
+ ## How to execute this benchmark?
127
+
128
+ Start local PostgreSQL instance.
129
+
130
+ Create database
131
+ ```shell
132
+ bundle exec rake test:db:create
133
+ ```
134
+
135
+ Execute benchmark
136
+ ```
137
+ bundle exec rake benchmark
138
+ ```
139
+
140
+ :information_source: Environment variables `DB_USERNAME` and `DB_PASSWORD` will be used for database authentication. Default is anonymous connection (works great with OSX and [Postgres.app](https://postgresapp.com)).
@@ -0,0 +1,26 @@
1
+ require 'active_record'
2
+ require_relative './tools'
3
+
4
+ class Dataset < ActiveRecord::Base
5
+ end
6
+
7
+ db = database_connect
8
+ build_dataset(db, 'datasets', ENV['DATASET_SIZE'] || 10_000) do |file|
9
+ lines_count = count(file)
10
+ puts "Start benchmark with a #{lines_count} lines file."
11
+
12
+ puts "Running benchmark..."
13
+ require_relative './strategies'
14
+ STRATEGIES.each do |label, strategy|
15
+ db.execute 'TRUNCATE TABLE datasets'
16
+ printf "%-35s: ", label
17
+
18
+ duration = measure_duration { strategy.call(file) }
19
+
20
+ warning_message = '(file partially imported)' if Dataset.count < lines_count - 1 # Header
21
+ printf "%20d ms %s\n", duration, warning_message
22
+ end
23
+ end
24
+
25
+ puts
26
+ puts "Benchmark finished."
Binary file
Binary file
@@ -0,0 +1,115 @@
1
+ # -----------------------------------------------------------------------------
2
+ # All tested strategy (implementations).
3
+ # -----------------------------------------------------------------------------
4
+
5
+ STRATEGIES = {}
6
+
7
+ # CSVFastImporter -------------------------------------------------------------
8
+ STRATEGIES['CSVFastImporter'] = lambda do |file|
9
+ CsvFastImporter.import file, col_sep: ','
10
+ end
11
+
12
+ # CSV.foreach + ActiveRecord create -------------------------------------------
13
+ STRATEGIES['CSV.foreach + ActiveRecord create'] = lambda do |file|
14
+ require 'csv'
15
+ Dataset.transaction do
16
+ CSV.foreach(file, headers: true) do |row|
17
+ Dataset.create!(row.to_hash)
18
+ end
19
+ end
20
+ end
21
+
22
+ # SmarterCSV + ActiveRecord create --------------------------------------------
23
+ STRATEGIES['SmarterCSV + ActiveRecord create'] = lambda do |file|
24
+ require 'smarter_csv'
25
+ Dataset.transaction do
26
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
27
+ Dataset.create! dataset_attributes
28
+ end
29
+ end
30
+ end
31
+
32
+ # SmarterCSV + activerecord-import --------------------------------------------
33
+ STRATEGIES['SmarterCSV + activerecord-import'] = lambda do |file|
34
+ require 'smarter_csv'
35
+ require 'activerecord-import/base'
36
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
37
+ datasets = dataset_attributes.map { |attributes| Dataset.new attributes }
38
+ Dataset.import dataset_attributes.first.keys, datasets, batch_size: 100, validate: false
39
+ end
40
+ end
41
+
42
+ # SmarterCSV + BulkInsert -----------------------------------------------------
43
+ STRATEGIES['SmarterCSV + BulkInsert'] = lambda do |file|
44
+ require 'smarter_csv'
45
+ require 'bulk_insert'
46
+ SmarterCSV.process(file.path, chunk_size: 1000) do |dataset_attributes|
47
+ Dataset.bulk_insert values: dataset_attributes
48
+ end
49
+ # Nearly same performance with following code:
50
+ # Dataset.bulk_insert(set_size: 500) do |worker|
51
+ # SmarterCSV.process(file.path, chunk_size: 500) do |dataset_attributes|
52
+ # dataset_attributes.each do |attributes|
53
+ # worker.add attributes
54
+ # end
55
+ # end
56
+ # end
57
+ end
58
+
59
+ STRATEGIES['CSV.foreach + upsert'] = lambda do |file|
60
+ require 'csv'
61
+ require 'upsert'
62
+ Upsert.logger.level = Logger::ERROR
63
+ Upsert.batch(Dataset.connection, Dataset.table_name) do |upsert|
64
+ CSV.foreach(file, headers: true) do |row|
65
+ upsert.row(row.to_hash)
66
+ end
67
+ end
68
+ end
69
+
70
+
71
+ # CSVImporter -----------------------------------------------------------------
72
+ require 'csv-importer'
73
+ class DatasetCSVImporter
74
+ include CSVImporter
75
+
76
+ model Dataset
77
+ end
78
+
79
+ STRATEGIES['CSVImporter'] = lambda do |file|
80
+ DatasetCSVImporter.new(path: file.path).run!
81
+ end
82
+
83
+ # ActiveImporter --------------------------------------------------------------
84
+ require 'active_importer'
85
+ class DatasetActiveImporter < ActiveImporter::Base
86
+ imports Dataset
87
+ end
88
+
89
+ STRATEGIES['ActiveImporter'] = lambda do |file|
90
+ DatasetActiveImporter.import file.path
91
+ end
92
+
93
+ # ferry -----------------------------------------------------------------------
94
+ STRATEGIES['ferry'] = lambda do |file|
95
+ # Required to make ferry work without a rails application
96
+ require 'yaml'
97
+ FileUtils.mkdir_p('config') unless File.exists?('config')
98
+ config_file = 'config/database.yml'
99
+ FileUtils.touch(config_file)
100
+ config = YAML.load(<<-EOT)
101
+ benchmark_env:
102
+ adapter: postgresql
103
+ database: csv_fast_importer_test
104
+ EOT
105
+ File.open(config_file, 'w') { |f| f.write config.to_yaml }
106
+
107
+ # Prevent progress output
108
+ $stderr.reopen(Tempfile.new('benchmark_ferry').path, "w")
109
+
110
+ require 'ferry'
111
+ Ferry::Importer.new.import_csv "benchmark_env", "datasets", file.path
112
+
113
+ FileUtils.rm(config_file)
114
+ end
115
+
@@ -0,0 +1,61 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Set of usefull methods
3
+ # -----------------------------------------------------------------------------
4
+
5
+ def database_connect
6
+ require_relative '../spec/config/test_database.rb'
7
+ test_db = TestDatabase.new
8
+ test_db.connect
9
+ require 'csv_fast_importer'
10
+ CsvFastImporter::DatabaseFactory.build
11
+ end
12
+
13
+ # Downloaded from http://ouvert.canada.ca/data/fr/dataset
14
+ ORIGINAL_DATASET_FILE = File.new('benchmark/NPRI-SubsDisp-Normalized-Since1993.csv')
15
+
16
+ def build_dataset(db, file_name, lines_count)
17
+ puts "Database schema generation..."
18
+ db.execute "DROP TABLE IF EXISTS #{file_name}"
19
+ db.execute <<-SQL
20
+ CREATE TABLE #{file_name} (
21
+ Reporting_Year smallint NULL,
22
+ NPRI_ID integer NULL,
23
+ Facility_Name varchar(255) NULL,
24
+ Company_Name varchar(255) NULL,
25
+ NAICS integer NULL,
26
+ Province varchar(255) NULL,
27
+ CAS_Number varchar(255) NULL,
28
+ substance_name varchar(255) NULL,
29
+ group_escaped varchar(255) NULL,
30
+ Category varchar(255) NULL,
31
+ Quantity decimal NULL,
32
+ Units varchar(255) NULL,
33
+ Estimation_Method varchar(255) NULL
34
+ )
35
+ SQL
36
+
37
+ dataset_file = File.new("benchmark/#{file_name}.csv", 'w+')
38
+ `head -n #{lines_count} #{ORIGINAL_DATASET_FILE.path} > #{dataset_file.path}`
39
+ yield dataset_file
40
+ File.delete(dataset_file)
41
+ end
42
+
43
+ def count(file)
44
+ `wc -l "#{file.path}"`.strip.split(' ')[0].to_i
45
+ end
46
+
47
+ # In milliseconds
48
+ def measure_duration
49
+ start_time = Time.now
50
+ block_stdout { yield }
51
+ (1000 * (Time.now - start_time)).to_i
52
+ end
53
+
54
+ def block_stdout
55
+ original_stdout = $stdout
56
+ File.open(File::NULL, "w") do |file|
57
+ $stdout = file
58
+ yield
59
+ $stdout = original_stdout
60
+ end
61
+ end
@@ -0,0 +1,42 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'csv_fast_importer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "csv_fast_importer"
8
+ spec.version = CSVFastImporter::VERSION
9
+ spec.authors = ["Sogilis"]
10
+ spec.email = ["sogilis@sogilis.com"]
11
+
12
+ spec.summary = "Fast CSV Importer"
13
+ spec.description = "Import CSV files' content into a PostgreSQL database. It is based on the Postgre COPY command which is designed to be as faster as possible."
14
+ spec.homepage = "https://github.com/sogilis/csv_fast_importer"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21
+ spec.require_paths = ["lib"]
22
+
23
+ spec.required_ruby_version = ">= 2.0"
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.10"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "pg", ">= 0.18.4"
28
+ spec.add_development_dependency "mysql2", ">= 0.3.10"
29
+ spec.add_development_dependency "codacy-coverage"
30
+ spec.add_development_dependency "rspec"
31
+
32
+ # Only for benchmark
33
+ spec.add_development_dependency "smarter_csv"
34
+ spec.add_development_dependency "activerecord-import"
35
+ spec.add_development_dependency "bulk_insert"
36
+ spec.add_development_dependency "upsert"
37
+ spec.add_development_dependency "csv-importer"
38
+ spec.add_development_dependency "active_importer"
39
+ spec.add_development_dependency "ferry"
40
+
41
+ spec.add_runtime_dependency "activerecord", [">= 3.0"]
42
+ end
@@ -0,0 +1,12 @@
1
+ require 'csv_fast_importer/version'
2
+ require 'csv_fast_importer/configuration'
3
+ require 'csv_fast_importer/import'
4
+
5
+ module CsvFastImporter
6
+
7
+ def self.import(file, parameters = {})
8
+ configuration = CsvFastImporter::Configuration.new file, parameters
9
+ CsvFastImporter::Import.new(configuration).run
10
+ end
11
+
12
+ end
@@ -0,0 +1,57 @@
1
+ module CsvFastImporter
2
+
3
+ # Gather all import configurations based on given file and additional parameters.
4
+ # This class is also responsible for default configuration.
5
+ class Configuration
6
+
7
+ attr_accessor :file
8
+
9
+ def initialize(file, parameters = {})
10
+ @file = file
11
+ @parameters = parameters
12
+ end
13
+
14
+ def encoding
15
+ @encoding ||= @parameters[:encoding] || 'UTF-8'
16
+ end
17
+
18
+ def column_separator
19
+ @column_separator ||= @parameters[:col_sep] || ';'
20
+ end
21
+
22
+ def mapping
23
+ @mapping ||= downcase_keys_and_values(@parameters[:mapping] || {})
24
+ end
25
+
26
+ def destination_table
27
+ @destination_table ||= (@parameters[:destination] || File.basename(@file, '.*'))
28
+ end
29
+
30
+ def row_index_column
31
+ @row_index_column ||= @parameters[:row_index_column]
32
+ end
33
+
34
+ def transactional?
35
+ @transactional ||= !(@parameters[:transaction] == :disabled)
36
+ end
37
+
38
+ def transactional_forced?
39
+ @transactional_forced ||= (@parameters[:transaction] == :enabled)
40
+ end
41
+
42
+ def truncate?
43
+ @deletion ||= @parameters[:deletion] == :truncate
44
+ end
45
+
46
+ def deletion?
47
+ @deletion ||= !(@parameters[:deletion] == :none)
48
+ end
49
+
50
+ private
51
+
52
+ def downcase_keys_and_values(hash)
53
+ Hash[hash.map{ |k, v| [k.to_s.downcase, v.to_s.downcase] }]
54
+ end
55
+
56
+ end
57
+ end