smarter_csv 1.0.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ *~
2
+ #*#
3
+ *old
4
+ *.bak
5
+ *.gem
6
+ .bundle
7
+ Gemfile.lock
8
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm gemset use smarter_csv
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in smarter_csv.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2012 Tilo Sloboda
2
+
3
+
4
+ MIT License
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining
7
+ a copy of this software and associated documentation files (the
8
+ "Software"), to deal in the Software without restriction, including
9
+ without limitation the rights to use, copy, modify, merge, publish,
10
+ distribute, sublicense, and/or sell copies of the Software, and to
11
+ permit persons to whom the Software is furnished to do so, subject to
12
+ the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,98 @@
1
+ # SmarterCSV
2
+
3
+ `smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
4
+ and parallel processing with Resque or Sidekiq.
5
+
6
+ `smarter_csv` has lots of optional features:
7
+ * able to process large CSV-files
8
+ * able to chunk the input from the CSV file to avoid loading the whole CSV file into memory
9
+ * return a Hash for each line of the CSV file, so we can quickly use the results for either creating MongoDB or ActiveRecord entries, or further processing with Resque
10
+ * able to pass a block to the method, so data from the CSV file can be directly processed (e.g. Resque.enqueue )
11
+ * have a bit more flexible input format, where comments are possible, and col_sep,row_sep can be set to any character sequence, including control characters.
12
+ * able to re-map CSV "column names" to Hash-keys of your choice (normalization)
13
+ * able to ignore "columns" in the input (delete columns)
14
+ * able to eliminate nil or empty fields from the result hashes
15
+
16
+ ### Why?
17
+
18
+ Ruby's CSV library's API is pretty old, and its processing of CSV-files returning Arrays of Arrays feels 'very close to the metal'. The output is not easy to use - especially not if you want to create database records from it. Another shortcoming is that Ruby's CSV library does not have good support for huge CSV-files, e.g. there is no support for 'chunking' and/or parallel processing of the CSV-content (e.g. with Resque or Sidekiq).
19
+
20
+ As the existing CSV libraries didn't fit my needs, I was writing my own CSV processing - specifically for use in connection with Rails ORMs like Mongoid, MongoMapper or ActiveRecord. In those ORMs you can easily pass a hash with attribute/value pairs to the create() method. The lower-level Mongo driver and Moped also accept larger arrays of such hashes to create a larger amount of records quickly with just one call.
21
+
22
+ ### Examples
23
+ #### Example 1: Reading a CSV-File in one Chunk, returning one Array of Hashes:
24
+
25
+ filename = '/tmp/input_file.txt' # TAB delimited file, each row ending with Control-M
26
+ recordsA = SmarterCSV.process_csv(filename, {:col_sep => "\t", :row_sep => "\cM"})
27
+
28
+ => returns an array of hashes
29
+
30
+ #### Example 2: Populate a MySQL or MongoDB Database with SmarterCSV:
31
+
32
+ # without using chunks:
33
+ filename = '/tmp/some.csv'
34
+ n = SmarterCSV.process_csv(filename, {:key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}}) do |array|
35
+ # we're passing a block in, to process each resulting hash / =row (the block takes array of hashes)
36
+ # when chunking is not enabled, there is only one hash in each array
37
+ MyModel.create( array.first )
38
+ end
39
+
40
+ => returns number of chunks / rows we processed
41
+
42
+
43
+ #### Example 3: Populate a MongoDB Database in Chunks of 100 records with SmarterCSV:
44
+
45
+ # using chunks:
46
+ filename = '/tmp/some.csv'
47
+ n = SmarterCSV.process_csv(filename, {:key_mapping => {:unwanted_row => nil, :old_row_name => :new_name}, :chunk_size => 100}) do |array|
48
+ # we're passing a block in, to process each resulting hash / row (block takes array of hashes)
49
+ # when chunking is enabled, there are up to :chunk_size hashes in each array
50
+ MyModel.collection.insert( array ) # insert up to 100 records at a time
51
+ end
52
+
53
+ => returns number of chunks we processed
54
+
55
+
56
+ #### Example 4: Reading a CSV-like File, and Processing it with Resque:
57
+
58
+ filename = '/tmp/strange_db_dump' # a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes)
59
+ n = SmarterCSV.process_csv(filename, {:col_sep => "\cA", :row_sep => "\cB\n", :comment_regexp => /^#/,
60
+ :chunk_size => '5' , :key_mapping => {:export_date => nil, :name => :genre}}) do |x|
61
+ puts "Resque.enqueue( ResqueWorkerClass, #{x.size}, #{x.inspect} )" # simulate processing each chunk
62
+ end
63
+ => returns number of chunks
64
+
65
+
66
+ ## Installation
67
+
68
+ Add this line to your application's Gemfile:
69
+
70
+ gem 'smarter_csv'
71
+
72
+ And then execute:
73
+
74
+ $ bundle
75
+
76
+ Or install it yourself as:
77
+
78
+ $ gem install smarter_csv
79
+
80
+ ## Usage
81
+
82
+ TODO: Write usage instructions here
83
+
84
+ ## Contributing
85
+
86
+ 1. Fork it
87
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
88
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
89
+ 4. Push to the branch (`git push origin my-new-feature`)
90
+ 5. Create new Pull Request
91
+
92
+
93
+ ## See also:
94
+
95
+ http://www.unixgods.org/~tilo/Ruby/process_csv_as_hashes.html
96
+
97
+ https://gist.github.com/3101950
98
+
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,9 @@
1
+ # the following extension for class Hash is needed (from Facets of Ruby library):
2
+
3
+ class Hash
4
+ def self.zip(keys,values) # from Facets of Ruby library
5
+ h = {}
6
+ keys.size.times{ |i| h[ keys[i] ] = values[i] }
7
+ h
8
+ end
9
+ end
@@ -0,0 +1,4 @@
1
+ require "smarter_csv/version"
2
+ require "extensions/hash.rb"
3
+ require "smarter_csv/smarter_csv.rb"
4
+
@@ -0,0 +1,111 @@
1
+ module SmarterCSV
2
+ # this reads and processes a "generalized" CSV file and returns the contents either as an Array of Hashes,
3
+ # or an Array of Arrays, which contain Hashes, or processes Chunks of Hashes via a given block
4
+ #
5
+ # File.read_csv supports the following options:
6
+ # * :col_sep : column separator , which defaults to ','
7
+ # * :row_sep : row separator or record separator , defaults to system's $/ , which defaults to "\n"
8
+ # * :quote_char : quotation character , defaults to '"' (currently not used)
9
+ # * :comment_regexp : regular expression which matches comment lines , defaults to /^#/ (see NOTE about the CSV header)
10
+ # * :chunk_size : if set, determines the desired chunk-size (defaults to nil, no chunk processing)
11
+ # * :remove_empty_fields : remove fields which have nil or empty strings as values (default: true)
12
+ #
13
+ # NOTES about CSV Headers:
14
+ # - as this method parses CSV files, it is assumed that the first line of any file will contain a valid header
15
+ # - the first line with the CSV header may or may not be commented out according to the :comment_regexp
16
+ # - any occurences of :comment_regexp or :row_sep will be stripped from the first line with the CSV header
17
+ # - any of the keys in the header line will be converted to Ruby symbols before being used in the returned Hashes
18
+ #
19
+ # NOTES on Key Mapping:
20
+ # - keys in the header line of the file can be re-mapped to a chosen set of symbols, so the resulting Hashes
21
+ # can be better used internally in our application (e.g. when directly creating MongoDB entries with them)
22
+ # - if you want to completely delete a key, then map it to nil or to '', they will be automatically deleted from any result Hash
23
+ #
24
+ # NOTES on the use of Chunking and Blocks:
25
+ # - chunking can be VERY USEFUL if used in combination with passing a block to File.read_csv FOR LARGE FILES
26
+ # - if you pass a block to File.read_csv, that block will be executed and given an Array of Hashes as the parameter.
27
+ # If the chunk_size is not set, then the array will only contain one Hash.
28
+ # If the chunk_size is > 0 , then the array may contain up to chunk_size Hashes.
29
+ # This can be very useful when passing chunked data to a post-processing step, e.g. through Resque
30
+ #
31
+
32
+ def SmarterCSV.process_csv(filename, options={}, &block)
33
+ default_options = {:col_sep => ',' , :row_sep => $/ , :quote_char => '"', :remove_empty_fields => true,
34
+ :comment_regexp => /^#/, :chunk_size => nil , :key_mapping_hash => nil
35
+ }
36
+ options = default_options.merge(options)
37
+ headerA = []
38
+ result = []
39
+ old_row_sep = $/
40
+ begin
41
+ $/ = options[:row_sep]
42
+ f = File.open(filename, "r")
43
+
44
+ # process the header line in the CSV file..
45
+ # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
46
+ headerA = f.readline.sub(options[:comment_regexp],'').chomp(options[:row_sep]).split(options[:col_sep]).map{|x| x.gsub(%r/options[:quote_char]/,'').gsub(/\s+/,'_').to_sym}
47
+ key_mappingH = options[:key_mapping]
48
+
49
+ # do some key mapping on the keys in the file header
50
+ # if you want to completely delete a key, then map it to nil or to ''
51
+ if ! key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
52
+ headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x].to_sym) : x}
53
+ end
54
+
55
+ # in case we use chunking.. we'll need to set it up..
56
+ if ! options[:chunk_size].nil? && options[:chunk_size].to_i > 0
57
+ use_chunks = true
58
+ chunk_size = options[:chunk_size].to_i
59
+ chunk_count = 0
60
+ chunk = []
61
+ else
62
+ use_chunks = false
63
+ end
64
+
65
+ # now on to processing all the rest of the lines in the CSV file:
66
+ while ! f.eof? # we can't use f.readlines() here, because this would read the whole file into memory at once, and eof => true
67
+ line = f.readline # read one line.. this uses the input_record_separator $/ which we set previously!
68
+ next if line =~ options[:comment_regexp] # ignore all comment lines if there are any
69
+ line.chomp! # will use $/ which is set to options[:col_sep]
70
+
71
+ dataA = line.split(options[:col_sep])
72
+ hash = Hash.zip(headerA,dataA) # from Facets of Ruby library
73
+ # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
74
+ hash.delete(nil); hash.delete(''); hash.delete(:"") # delete any hash keys which were mapped to be deleted
75
+ hash.delete_if{|k,v| v.nil? || v =~ /^\s*$/} if options[:remove_empty_fields]
76
+
77
+ if use_chunks
78
+ chunk << hash # append temp result to chunk
79
+
80
+ if chunk.size >= chunk_size || f.eof? # if chunk if full, or EOF reached
81
+ # do something with the chunk
82
+ if block_given?
83
+ yield chunk # do something with the hashes in the chunk in the block
84
+ else
85
+ result << chunk # not sure yet, why anybody would want to do this without a block
86
+ end
87
+ chunk_count += 1
88
+ chunk = [] # initialize for next chunk of data
89
+ end
90
+ # while a chunk is being filled up we don't need to do anything else here
91
+
92
+ else # no chunk handling
93
+ if block_given?
94
+ yield [hash] # do something with the hash in the block (better to use chunking here)
95
+ else
96
+ result << hash
97
+ end
98
+ end
99
+ end
100
+ ensure
101
+ $/ = old_row_sep # make sure this stupid global variable is always reset to it's previous value after we're done!
102
+ end
103
+ if block_given?
104
+ return chunk_count # when we do processing through a block we only care how many chunks we processed
105
+ else
106
+ return result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
107
+ end
108
+ end
109
+
110
+ end
111
+
@@ -0,0 +1,3 @@
1
+ module SmarterCSV
2
+ VERSION = "1.0.0.pre1"
3
+ end
@@ -0,0 +1,17 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/smarter_csv/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["Tilo Sloboda\n"]
6
+ gem.email = ["tilo.sloboda@gmail.com\n"]
7
+ gem.description = %q{Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with optional features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys}
8
+ gem.summary = %q{Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots of optional features, e.g. chunked processing for huge CSV files}
9
+ gem.homepage = ""
10
+
11
+ gem.files = `git ls-files`.split($\)
12
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
+ gem.name = "smarter_csv"
15
+ gem.require_paths = ["lib"]
16
+ gem.version = SmarterCSV::VERSION
17
+ end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: smarter_csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.pre1
5
+ prerelease: 6
6
+ platform: ruby
7
+ authors:
8
+ - ! 'Tilo Sloboda
9
+
10
+ '
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2012-07-29 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with
17
+ optional features for processing large files in parallel, embedded comments, unusual
18
+ field- and record-separators, flexible mapping of CSV-headers to Hash-keys
19
+ email:
20
+ - ! 'tilo.sloboda@gmail.com
21
+
22
+ '
23
+ executables: []
24
+ extensions: []
25
+ extra_rdoc_files: []
26
+ files:
27
+ - .gitignore
28
+ - .rvmrc
29
+ - Gemfile
30
+ - LICENSE
31
+ - README.md
32
+ - Rakefile
33
+ - lib/extensions/hash.rb
34
+ - lib/smarter_csv.rb
35
+ - lib/smarter_csv/smarter_csv.rb
36
+ - lib/smarter_csv/version.rb
37
+ - smarter_csv.gemspec
38
+ homepage: ''
39
+ licenses: []
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ! '>='
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>'
54
+ - !ruby/object:Gem::Version
55
+ version: 1.3.1
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 1.8.15
59
+ signing_key:
60
+ specification_version: 3
61
+ summary: Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots
62
+ of optional features, e.g. chunked processing for huge CSV files
63
+ test_files: []