smarter_csv 1.0.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/LICENSE +23 -0
- data/README.md +98 -0
- data/Rakefile +2 -0
- data/lib/extensions/hash.rb +9 -0
- data/lib/smarter_csv.rb +4 -0
- data/lib/smarter_csv/smarter_csv.rb +111 -0
- data/lib/smarter_csv/version.rb +3 -0
- data/smarter_csv.gemspec +17 -0
- metadata +63 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm gemset use smarter_csv
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2012 Tilo Sloboda
|
2
|
+
|
3
|
+
|
4
|
+
MIT License
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
a copy of this software and associated documentation files (the
|
8
|
+
"Software"), to deal in the Software without restriction, including
|
9
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be
|
15
|
+
included in all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
21
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
22
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
23
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# SmarterCSV
|
2
|
+
|
3
|
+
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
4
|
+
and parallel processing with Resque or Sidekiq.
|
5
|
+
|
6
|
+
`smarter_csv` has lots of optional features:
|
7
|
+
* able to process large CSV-files
|
8
|
+
* able to chunk the input from the CSV file to avoid loading the whole CSV file into memory
|
9
|
+
* return a Hash for each line of the CSV file, so we can quickly use the results for either creating MongoDB or ActiveRecord entries, or further processing with Resque
|
10
|
+
* able to pass a block to the method, so data from the CSV file can be directly processed (e.g. Resque.enqueue )
|
11
|
+
* have a bit more flexible input format, where comments are possible, and col_sep,row_sep can be set to any character sequence, including control characters.
|
12
|
+
* able to re-map CSV "column names" to Hash-keys of your choice (normalization)
|
13
|
+
* able to ignore "columns" in the input (delete columns)
|
14
|
+
* able to eliminate nil or empty fields from the result hashes
|
15
|
+
|
16
|
+
### Why?
|
17
|
+
|
18
|
+
Ruby's CSV library's API is pretty old, and its processing of CSV-files returning Arrays of Arrays feels 'very close to the metal'. The output is not easy to use - especially not if you want to create database records from it. Another shortcoming is that Ruby's CSV library does not have good support for huge CSV-files, e.g. there is no support for 'chunking' and/or parallel processing of the CSV-content (e.g. with Resque or Sidekiq).
|
19
|
+
|
20
|
+
As the existing CSV libraries didn't fit my needs, I wrote my own CSV processing - specifically for use in connection with Rails ORMs like Mongoid, MongoMapper or ActiveRecord. In those ORMs you can easily pass a hash with attribute/value pairs to the create() method. The lower-level Mongo driver and Moped also accept larger arrays of such hashes to create a larger amount of records quickly with just one call.
|
21
|
+
|
22
|
+
### Examples
|
23
|
+
#### Example 1: Reading a CSV-File in one Chunk, returning one Array of Hashes:
|
24
|
+
|
25
|
+
filename = '/tmp/input_file.txt' # TAB delimited file, each row ending with Control-M
|
26
|
+
recordsA = SmarterCSV.process_csv(filename, {:col_sep => "\t", :row_sep => "\cM"})
|
27
|
+
|
28
|
+
=> returns an array of hashes
|
29
|
+
|
30
|
+
#### Example 2: Populate a MySQL or MongoDB Database with SmarterCSV:
|
31
|
+
|
32
|
+
# without using chunks:
|
33
|
+
filename = '/tmp/some.csv'
|
34
|
+
n = SmarterCSV.process_csv(filename, {:key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}}) do |array|
|
35
|
+
# we're passing a block in, to process each resulting hash / =row (the block takes array of hashes)
|
36
|
+
# when chunking is not enabled, there is only one hash in each array
|
37
|
+
MyModel.create( array.first )
|
38
|
+
end
|
39
|
+
|
40
|
+
=> returns number of chunks / rows we processed
|
41
|
+
|
42
|
+
|
43
|
+
#### Example 3: Populate a MongoDB Database in Chunks of 100 records with SmarterCSV:
|
44
|
+
|
45
|
+
# using chunks:
|
46
|
+
filename = '/tmp/some.csv'
|
47
|
+
n = SmarterCSV.process_csv(filename, {:key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}, :chunk_size => 100}) do |array|
|
48
|
+
# we're passing a block in, to process each resulting hash / row (block takes array of hashes)
|
49
|
+
# when chunking is enabled, there are up to :chunk_size hashes in each array
|
50
|
+
MyModel.collection.insert( array ) # insert up to 100 records at a time
|
51
|
+
end
|
52
|
+
|
53
|
+
=> returns number of chunks we processed
|
54
|
+
|
55
|
+
|
56
|
+
#### Example 4: Reading a CSV-like File, and Processing it with Resque:
|
57
|
+
|
58
|
+
filename = '/tmp/strange_db_dump' # a file with CRTL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes)
|
59
|
+
n = SmarterCSV.process_csv(filename, {:col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
|
60
|
+
:chunk_size => '5' , :key_mapping => {:export_date => nil, :name => :genre}}) do |x|
|
61
|
+
puts "Resque.enque( ResqueWorkerClass, #{x.size}, #{x.inspect} )" # simulate processing each chunk
|
62
|
+
end
|
63
|
+
=> returns number of chunks
|
64
|
+
|
65
|
+
|
66
|
+
## Installation
|
67
|
+
|
68
|
+
Add this line to your application's Gemfile:
|
69
|
+
|
70
|
+
gem 'smarter_csv'
|
71
|
+
|
72
|
+
And then execute:
|
73
|
+
|
74
|
+
$ bundle
|
75
|
+
|
76
|
+
Or install it yourself as:
|
77
|
+
|
78
|
+
$ gem install smarter_csv
|
79
|
+
|
80
|
+
## Usage
|
81
|
+
|
82
|
+
TODO: Write usage instructions here
|
83
|
+
|
84
|
+
## Contributing
|
85
|
+
|
86
|
+
1. Fork it
|
87
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
88
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
89
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
90
|
+
5. Create new Pull Request
|
91
|
+
|
92
|
+
|
93
|
+
## See also:
|
94
|
+
|
95
|
+
http://www.unixgods.org/~tilo/Ruby/process_csv_as_hashes.html
|
96
|
+
|
97
|
+
https://gist.github.com/3101950
|
98
|
+
|
data/Rakefile
ADDED
data/lib/smarter_csv.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module SmarterCSV
|
2
|
+
# this reads and processes a "generalized" CSV file and returns the contents either as an Array of Hashes,
|
3
|
+
# or an Array of Arrays, which contain Hashes, or processes Chunks of Hashes via a given block
|
4
|
+
#
|
5
|
+
# File.read_csv supports the following options:
|
6
|
+
# * :col_sep : column separator , which defaults to ','
|
7
|
+
# * :row_sep : row separator or record separator , defaults to system's $/ , which defaults to "\n"
|
8
|
+
# * :quote_char : quotation character , defaults to '"' (currently not used)
|
9
|
+
# * :comment_regexp : regular expression which matches comment lines , defaults to /^#/ (see NOTE about the CSV header)
|
10
|
+
# * :chunk_size : if set, determines the desired chunk-size (defaults to nil, no chunk processing)
|
11
|
+
# * :remove_empty_fields : remove fields which have nil or empty strings as values (default: true)
|
12
|
+
#
|
13
|
+
# NOTES about CSV Headers:
|
14
|
+
# - as this method parses CSV files, it is assumed that the first line of any file will contain a valid header
|
15
|
+
# - the first line with the CSV header may or may not be commented out according to the :comment_regexp
|
16
|
+
# - any occurences of :comment_regexp or :row_sep will be stripped from the first line with the CSV header
|
17
|
+
# - any of the keys in the header line will be converted to Ruby symbols before being used in the returned Hashes
|
18
|
+
#
|
19
|
+
# NOTES on Key Mapping:
|
20
|
+
# - keys in the header line of the file can be re-mapped to a chosen set of symbols, so the resulting Hashes
|
21
|
+
# can be better used internally in our application (e.g. when directly creating MongoDB entries with them)
|
22
|
+
# - if you want to completely delete a key, then map it to nil or to '', they will be automatically deleted from any result Hash
|
23
|
+
#
|
24
|
+
# NOTES on the use of Chunking and Blocks:
|
25
|
+
# - chunking can be VERY USEFUL if used in combination with passing a block to File.read_csv FOR LARGE FILES
|
26
|
+
# - if you pass a block to File.read_csv, that block will be executed and given an Array of Hashes as the parameter.
|
27
|
+
# If the chunk_size is not set, then the array will only contain one Hash.
|
28
|
+
# If the chunk_size is > 0 , then the array may contain up to chunk_size Hashes.
|
29
|
+
# This can be very useful when passing chunked data to a post-processing step, e.g. through Resque
|
30
|
+
#
|
31
|
+
|
32
|
+
def SmarterCSV.process_csv(filename, options={}, &block)
|
33
|
+
default_options = {:col_sep => ',' , :row_sep => $/ , :quote_char => '"', :remove_empty_fields => true,
|
34
|
+
:comment_regexp => /^#/, :chunk_size => nil , :key_mapping_hash => nil
|
35
|
+
}
|
36
|
+
options = default_options.merge(options)
|
37
|
+
headerA = []
|
38
|
+
result = []
|
39
|
+
old_row_sep = $/
|
40
|
+
begin
|
41
|
+
$/ = options[:row_sep]
|
42
|
+
f = File.open(filename, "r")
|
43
|
+
|
44
|
+
# process the header line in the CSV file..
|
45
|
+
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
46
|
+
headerA = f.readline.sub(options[:comment_regexp],'').chomp(options[:row_sep]).split(options[:col_sep]).map{|x| x.gsub(%r/options[:quote_char]/,'').gsub(/\s+/,'_').to_sym}
|
47
|
+
key_mappingH = options[:key_mapping]
|
48
|
+
|
49
|
+
# do some key mapping on the keys in the file header
|
50
|
+
# if you want to completely delete a key, then map it to nil or to ''
|
51
|
+
if ! key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
|
52
|
+
headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x].to_sym) : x}
|
53
|
+
end
|
54
|
+
|
55
|
+
# in case we use chunking.. we'll need to set it up..
|
56
|
+
if ! options[:chunk_size].nil? && options[:chunk_size].to_i > 0
|
57
|
+
use_chunks = true
|
58
|
+
chunk_size = options[:chunk_size].to_i
|
59
|
+
chunk_count = 0
|
60
|
+
chunk = []
|
61
|
+
else
|
62
|
+
use_chunks = false
|
63
|
+
end
|
64
|
+
|
65
|
+
# now on to processing all the rest of the lines in the CSV file:
|
66
|
+
while ! f.eof? # we can't use f.readlines() here, because this would read the whole file into memory at once, and eof => true
|
67
|
+
line = f.readline # read one line.. this uses the input_record_separator $/ which we set previously!
|
68
|
+
next if line =~ options[:comment_regexp] # ignore all comment lines if there are any
|
69
|
+
line.chomp! # will use $/ which is set to options[:col_sep]
|
70
|
+
|
71
|
+
dataA = line.split(options[:col_sep])
|
72
|
+
hash = Hash.zip(headerA,dataA) # from Facets of Ruby library
|
73
|
+
# make sure we delete any key/value pairs from the hash, which the user wanted to delete:
|
74
|
+
hash.delete(nil); hash.delete(''); hash.delete(:"") # delete any hash keys which were mapped to be deleted
|
75
|
+
hash.delete_if{|k,v| v.nil? || v =~ /^\s*$/} if options[:remove_empty_fields]
|
76
|
+
|
77
|
+
if use_chunks
|
78
|
+
chunk << hash # append temp result to chunk
|
79
|
+
|
80
|
+
if chunk.size >= chunk_size || f.eof? # if chunk if full, or EOF reached
|
81
|
+
# do something with the chunk
|
82
|
+
if block_given?
|
83
|
+
yield chunk # do something with the hashes in the chunk in the block
|
84
|
+
else
|
85
|
+
result << chunk # not sure yet, why anybody would want to do this without a block
|
86
|
+
end
|
87
|
+
chunk_count += 1
|
88
|
+
chunk = [] # initialize for next chunk of data
|
89
|
+
end
|
90
|
+
# while a chunk is being filled up we don't need to do anything else here
|
91
|
+
|
92
|
+
else # no chunk handling
|
93
|
+
if block_given?
|
94
|
+
yield [hash] # do something with the hash in the block (better to use chunking here)
|
95
|
+
else
|
96
|
+
result << hash
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
ensure
|
101
|
+
$/ = old_row_sep # make sure this stupid global variable is always reset to it's previous value after we're done!
|
102
|
+
end
|
103
|
+
if block_given?
|
104
|
+
return chunk_count # when we do processing through a block we only care how many chunks we processed
|
105
|
+
else
|
106
|
+
return result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
|
data/smarter_csv.gemspec
ADDED
@@ -0,0 +1,17 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/smarter_csv/version', __FILE__)

# Gem specification for smarter_csv.
Gem::Specification.new do |gem|
  gem.name          = "smarter_csv"
  gem.version       = SmarterCSV::VERSION
  # BUGFIX: removed stray trailing "\n" previously embedded in the author name
  # and email strings, which leaked verbatim into the published gem metadata.
  gem.authors       = ["Tilo Sloboda"]
  gem.email         = ["tilo.sloboda@gmail.com"]
  gem.description   = %q{Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with optional features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys}
  gem.summary       = %q{Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots of optional features, e.g. chunked processing for huge CSV files}
  gem.homepage      = ""

  # Package every git-tracked file; derive executables and test files from that list.
  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: smarter_csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0.pre1
|
5
|
+
prerelease: 6
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- ! 'Tilo Sloboda
|
9
|
+
|
10
|
+
'
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2012-07-29 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with
|
17
|
+
optional features for processing large files in parallel, embedded comments, unusual
|
18
|
+
field- and record-separators, flexible mapping of CSV-headers to Hash-keys
|
19
|
+
email:
|
20
|
+
- ! 'tilo.sloboda@gmail.com
|
21
|
+
|
22
|
+
'
|
23
|
+
executables: []
|
24
|
+
extensions: []
|
25
|
+
extra_rdoc_files: []
|
26
|
+
files:
|
27
|
+
- .gitignore
|
28
|
+
- .rvmrc
|
29
|
+
- Gemfile
|
30
|
+
- LICENSE
|
31
|
+
- README.md
|
32
|
+
- Rakefile
|
33
|
+
- lib/extensions/hash.rb
|
34
|
+
- lib/smarter_csv.rb
|
35
|
+
- lib/smarter_csv/smarter_csv.rb
|
36
|
+
- lib/smarter_csv/version.rb
|
37
|
+
- smarter_csv.gemspec
|
38
|
+
homepage: ''
|
39
|
+
licenses: []
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>'
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.3.1
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 1.8.15
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots
|
62
|
+
of optional features, e.g. chunked processing for huge CSV files
|
63
|
+
test_files: []
|