smarter_csv 1.1.5 → 1.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. checksums.yaml +5 -5
  2. data/.rspec +1 -2
  3. data/.rubocop.yml +154 -0
  4. data/CHANGELOG.md +364 -0
  5. data/CONTRIBUTORS.md +56 -0
  6. data/Gemfile +7 -2
  7. data/LICENSE.txt +21 -0
  8. data/README.md +44 -441
  9. data/Rakefile +39 -19
  10. data/TO_DO_v2.md +14 -0
  11. data/docs/_introduction.md +56 -0
  12. data/docs/basic_api.md +157 -0
  13. data/docs/batch_processing.md +68 -0
  14. data/docs/data_transformations.md +50 -0
  15. data/docs/examples.md +75 -0
  16. data/docs/header_transformations.md +113 -0
  17. data/docs/header_validations.md +36 -0
  18. data/docs/options.md +98 -0
  19. data/docs/row_col_sep.md +104 -0
  20. data/docs/value_converters.md +68 -0
  21. data/ext/smarter_csv/extconf.rb +14 -0
  22. data/ext/smarter_csv/smarter_csv.c +97 -0
  23. data/lib/smarter_csv/auto_detection.rb +78 -0
  24. data/lib/smarter_csv/errors.rb +16 -0
  25. data/lib/smarter_csv/file_io.rb +50 -0
  26. data/lib/smarter_csv/hash_transformations.rb +91 -0
  27. data/lib/smarter_csv/header_transformations.rb +63 -0
  28. data/lib/smarter_csv/header_validations.rb +34 -0
  29. data/lib/smarter_csv/headers.rb +68 -0
  30. data/lib/smarter_csv/options.rb +95 -0
  31. data/lib/smarter_csv/parser.rb +90 -0
  32. data/lib/smarter_csv/reader.rb +243 -0
  33. data/lib/smarter_csv/version.rb +3 -1
  34. data/lib/smarter_csv/writer.rb +116 -0
  35. data/lib/smarter_csv.rb +91 -3
  36. data/smarter_csv.gemspec +43 -20
  37. metadata +122 -137
  38. data/.gitignore +0 -8
  39. data/.travis.yml +0 -19
  40. data/lib/extensions/hash.rb +0 -7
  41. data/lib/smarter_csv/smarter_csv.rb +0 -281
  42. data/spec/fixtures/basic.csv +0 -8
  43. data/spec/fixtures/binary.csv +0 -1
  44. data/spec/fixtures/carriage_returns_n.csv +0 -18
  45. data/spec/fixtures/carriage_returns_quoted.csv +0 -3
  46. data/spec/fixtures/carriage_returns_r.csv +0 -1
  47. data/spec/fixtures/carriage_returns_rn.csv +0 -18
  48. data/spec/fixtures/chunk_cornercase.csv +0 -10
  49. data/spec/fixtures/empty.csv +0 -5
  50. data/spec/fixtures/line_endings_n.csv +0 -4
  51. data/spec/fixtures/line_endings_r.csv +0 -1
  52. data/spec/fixtures/line_endings_rn.csv +0 -4
  53. data/spec/fixtures/lots_of_columns.csv +0 -2
  54. data/spec/fixtures/malformed.csv +0 -3
  55. data/spec/fixtures/malformed_header.csv +0 -3
  56. data/spec/fixtures/money.csv +0 -3
  57. data/spec/fixtures/no_header.csv +0 -7
  58. data/spec/fixtures/numeric.csv +0 -5
  59. data/spec/fixtures/pets.csv +0 -5
  60. data/spec/fixtures/quoted.csv +0 -5
  61. data/spec/fixtures/separator.csv +0 -4
  62. data/spec/fixtures/skip_lines.csv +0 -8
  63. data/spec/fixtures/valid_unicode.csv +0 -5
  64. data/spec/fixtures/with_dashes.csv +0 -8
  65. data/spec/fixtures/with_dates.csv +0 -4
  66. data/spec/smarter_csv/binary_file2_spec.rb +0 -24
  67. data/spec/smarter_csv/binary_file_spec.rb +0 -22
  68. data/spec/smarter_csv/carriage_return_spec.rb +0 -170
  69. data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
  70. data/spec/smarter_csv/close_file_spec.rb +0 -15
  71. data/spec/smarter_csv/column_separator_spec.rb +0 -11
  72. data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
  73. data/spec/smarter_csv/extenstions_spec.rb +0 -17
  74. data/spec/smarter_csv/header_transformation_spec.rb +0 -21
  75. data/spec/smarter_csv/keep_headers_spec.rb +0 -24
  76. data/spec/smarter_csv/key_mapping_spec.rb +0 -25
  77. data/spec/smarter_csv/line_ending_spec.rb +0 -43
  78. data/spec/smarter_csv/load_basic_spec.rb +0 -20
  79. data/spec/smarter_csv/malformed_spec.rb +0 -21
  80. data/spec/smarter_csv/no_header_spec.rb +0 -24
  81. data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
  82. data/spec/smarter_csv/quoted_spec.rb +0 -23
  83. data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
  84. data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
  85. data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
  86. data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
  87. data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
  88. data/spec/smarter_csv/skip_lines_spec.rb +0 -29
  89. data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
  90. data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
  91. data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
  92. data/spec/smarter_csv/value_converters_spec.rb +0 -52
  93. data/spec/spec/spec_helper.rb +0 -17
  94. data/spec/spec.opts +0 -2
  95. data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module SmarterCSV
  #
  # NOTE: this is not called when "parse" methods are tested by themselves
  #
  # ONLY FOR BACKWARDS-COMPATIBILITY
  #
  # Returns the frozen Hash of library defaults (Options::DEFAULT_OPTIONS).
  def self.default_options
    Options::DEFAULT_OPTIONS
  end

  # Mixin that merges caller-supplied options over DEFAULT_OPTIONS, stores the
  # result in @options on the including object, and validates the separators.
  module Options
    # Defaults applied to every key the caller does not supply.
    DEFAULT_OPTIONS = {
      acceleration: true, # whether the user wants to use acceleration or not
      auto_row_sep_chars: 500,
      chunk_size: nil,
      col_sep: :auto, # was: ',',
      comment_regexp: nil, # was: /\A#/,
      convert_values_to_numeric: true,
      downcase_header: true,
      duplicate_header_suffix: '', # was: nil,
      file_encoding: 'utf-8',
      force_simple_split: false,
      force_utf8: false,
      headers_in_file: true,
      invalid_byte_sequence: '',
      keep_original_headers: false,
      key_mapping: nil,
      quote_char: '"',
      remove_empty_hashes: true,
      remove_empty_values: true,
      remove_unmapped_keys: false,
      remove_values_matching: nil,
      remove_zero_values: false,
      required_headers: nil,
      required_keys: nil,
      row_sep: :auto, # was: $/,
      silence_missing_keys: false,
      skip_lines: nil,
      strings_as_keys: false,
      strip_chars_from_headers: nil,
      strip_whitespace: true,
      user_provided_headers: nil,
      value_converters: nil,
      verbose: false,
      with_line_numbers: false,
    }.freeze

    # Builds the effective options for the including object.
    #
    # NOTE: this is not called when "parse" methods are tested by themselves
    #
    # @param given_options [Hash] caller overrides; keys win over DEFAULT_OPTIONS
    # @return [Hash] the merged options, also stored in @options
    # @raise [SmarterCSV::ValidationError] when row_sep, col_sep or quote_char is invalid
    def process_options(given_options = {})
      puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]

      # merge returns a fresh, unfrozen Hash, so the frozen defaults stay untouched
      @options = DEFAULT_OPTIONS.merge(given_options)

      # fix invalid input: a nil/false replacement string falls back to ''
      @options[:invalid_byte_sequence] = '' unless @options[:invalid_byte_sequence]

      puts "Computed options:\n#{pp(@options)}\n" if @options[:verbose]

      validate_options!(@options)
      @options
    end

    private

    # Handles the deprecated :required_headers option, then checks the three
    # separator-like options and raises with the list of problems found.
    def validate_options!(options)
      migrate_required_headers!(options)

      problems = %i[row_sep col_sep quote_char].each_with_object([]) do |name, found|
        found << "invalid #{name}" if options.key?(name) && !option_valid?(options[name])
      end
      raise SmarterCSV::ValidationError, problems.inspect if problems.any?
    end

    # Prints the deprecation notice for :required_headers and, when
    # :required_keys was not given, moves the value over to :required_keys.
    def migrate_required_headers!(options)
      return if options[:required_headers].nil?

      puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
      return unless options[:required_keys].nil?

      options[:required_keys] = options[:required_headers]
      options[:required_headers] = nil
    end

    # A separator/quote option is valid when it is :auto or a non-empty String.
    def option_valid?(str)
      case str
      when Symbol then str == :auto
      when String then !str.empty?
      else false
      end
    end

    # Pretty-prints via AwesomePrint when that constant is defined, else #inspect.
    def pp(value)
      return value.inspect unless defined?(AwesomePrint)

      value.awesome_inspect(index: nil)
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
module SmarterCSV
  # Line-level CSV splitting. Mixed into objects that provide `has_acceleration`
  # and, when acceleration is available, `parse_csv_line_c` (the C-extension).
  module Parser
    protected

    ###
    ### Thin wrapper around C-extension
    ###
    # Splits one line into fields, using the C-extension when
    # options[:acceleration] is set and `has_acceleration` is truthy,
    # otherwise the pure-Ruby implementation below.
    #
    # @param line [String, nil] one logical CSV line, without the row separator
    # @param options [Hash] must provide :col_sep and :quote_char
    # @param header_size [Integer, nil] when given, at most this many fields are returned
    # @return [Array(Array<String>, Integer)] the fields and their count
    def parse(line, options, header_size = nil)
      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

      if options[:acceleration] && has_acceleration
        # :nocov:
        quote = options[:quote_char]
        # Plain substring test: the quote char is data, not a pattern, so it must
        # not be interpolated into a Regexp ('*', '(' ... would raise RegexpError).
        has_quotes = quote && line && line.include?(quote)
        elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
        elements.map! { |x| cleanup_quotes(x, quote) } if has_quotes
        [elements, elements.size]
        # :nocov:
      else
        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
        parse_csv_line_ruby(line, options, header_size)
      end
    end

    # ------------------------------------------------------------------
    # Ruby equivalent of the C-extension for parse_line
    #
    # parses a single line: either a CSV header or a body line
    # - quoting rules compared to RFC-4180 are somewhat relaxed
    # - we are not assuming that quotes inside a field need to be doubled
    # - we are not assuming that all fields need to be quoted (0 is even)
    # - works with multi-char col_sep
    # - if header_size is given, only up to header_size fields are parsed
    #
    # We use header_size for parsing the body lines to make sure we always match the number of headers
    # in case there are trailing col_sep characters in line
    #
    # Our convention is that empty fields are returned as empty strings, not as nil.
    #
    # The purpose of the header_size parameter is to handle a corner case where
    # CSV lines contain more fields than the header.
    # In which case the remaining fields in the line are ignored
    #
    # @param line [String, nil] the line to split; nil yields no fields
    # @param options [Hash] must provide :col_sep and :quote_char
    # @param header_size [Integer, nil] maximum number of fields to return
    # @return [Array(Array<String>, Integer)] the fields and their count
    def parse_csv_line_ruby(line, options, header_size = nil)
      # Same [elements, size] shape as every other return path, so callers that
      # destructure the result get an empty Array instead of nil.
      return [[], 0] if line.nil?

      line_size = line.size
      col_sep = options[:col_sep]
      col_sep_size = col_sep.size
      quote = options[:quote_char]
      quote_count = 0
      elements = []
      start = 0
      i = 0

      previous_char = ''
      while i < line_size
        # a separator only splits when we are outside a quoted section (even quote count)
        if line[i...i + col_sep_size] == col_sep && quote_count.even?
          break if !header_size.nil? && elements.size >= header_size

          elements << cleanup_quotes(line[start...i], quote)
          previous_char = line[i]
          i += col_sep_size
          start = i
        else
          # a quote preceded by a backslash does not open/close a quoted section
          quote_count += 1 if line[i] == quote && previous_char != '\\'
          previous_char = line[i]
          i += 1
        end
      end
      # the remainder after the last separator is the final field
      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
      [elements, elements.size]
    end

    # Strips one pair of enclosing quote chars (if both are present) and collapses
    # doubled quote chars into a single one. Mutates and returns `field`;
    # a nil field is returned unchanged.
    def cleanup_quotes(field, quote)
      return field if field.nil?

      # return if field !~ /#{quote}/ # this check can probably be eliminated

      if field.start_with?(quote) && field.end_with?(quote)
        field.delete_prefix!(quote)
        field.delete_suffix!(quote)
      end
      field.gsub!("#{quote}#{quote}", quote)
      field
    end
  end
end
@@ -0,0 +1,243 @@
1
+ # frozen_string_literal: true
2
+
3
module SmarterCSV
  # Reads CSV input line by line and turns every data row into a Hash keyed by
  # the (processed) headers. Rows are either collected in #result or yielded to
  # a block, optionally grouped into chunks of options[:chunk_size] rows.
  #
  # Helpers such as guess_line_ending, guess_column_separator, skip_lines,
  # process_headers, header_validations, readline_with_counts and
  # hash_transformations are not defined in this class; they are expected to
  # come from the mixins included below.
  class Reader
    include ::SmarterCSV::Options
    include ::SmarterCSV::FileIO
    include ::SmarterCSV::AutoDetection
    include ::SmarterCSV::Headers
    include ::SmarterCSV::HeaderTransformations
    include ::SmarterCSV::HeaderValidations
    include ::SmarterCSV::HashTransformations
    include ::SmarterCSV::Parser

    attr_reader :input, :options
    attr_reader :csv_line_count, :chunk_count, :file_line_count
    attr_reader :enforce_utf8, :has_rails, :has_acceleration
    attr_reader :errors, :warnings, :headers, :raw_header, :result

    # :nocov:
    # rubocop:disable Naming/MethodName
    # Deprecated alias for #headers; warns on every call.
    def headerA
      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
      @headerA
    end
    # rubocop:enable Naming/MethodName
    # :nocov:

    # @param input [String, #readline] a filename, or an input object which responds to readline
    # @param given_options [Hash] overrides for the default options (see process_options)
    def initialize(input, given_options = {})
      @input = input
      @has_rails = !!defined?(Rails)
      @csv_line_count = 0
      @chunk_count = 0
      @errors = {}
      @file_line_count = 0
      @headerA = []
      @headers = nil
      @raw_header = nil # header as it appears in the file
      @result = []
      @warnings = {}
      @enforce_utf8 = false # only set to true if needed (after options parsing)
      @options = process_options(given_options)
      # true if it is compiled with acceleration
      @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
    end

    # Processes the whole input.
    #
    # With a block, every chunk (or a one-element Array per row when chunking is
    # off) is yielded and the number of chunks is returned. Without a block the
    # rows/chunks are accumulated in @result, which is returned.
    #
    # The input handle is closed in the ensure clause whenever it responds to
    # #close -- this includes IO objects passed in by the caller.
    #
    # @return [Integer, Array] @chunk_count when a block is given, otherwise @result
    def process(&block) # rubocop:disable Lint/UnusedMethodArgument
      @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
      @verbose = options[:verbose]

      begin
        fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

        if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
          puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
        end

        # auto-detect the row separator
        options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
        # attempt to auto-detect column separator
        options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

        skip_lines(fh, options)

        @headers, header_size = process_headers(fh, options)
        @headerA = @headers # @headerA is deprecated, use @headers

        puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

        header_validations(@headers, options)

        # in case we use chunking.. we'll need to set it up..
        if options[:chunk_size].to_i > 0
          use_chunks = true
          chunk_size = options[:chunk_size].to_i
          @chunk_count = 0
          chunk = []
        else
          use_chunks = false
        end

        # now on to processing all the rest of the lines in the CSV file:
        # fh.each_line |line|
        until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
          line = readline_with_counts(fh, options)

          # replace invalid byte sequence in UTF-8 with question mark to avoid errors
          line = enforce_utf8_encoding(line, options) if @enforce_utf8

          print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

          next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

          # cater for the quoted csv data containing the row separator carriage return character
          # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
          # by detecting the existence of an uneven number of quote characters
          multiline = count_quote_chars(line, options[:quote_char]).odd?

          # NOTE(review): the first readline below is not guarded by fh.eof?, so an
          # unbalanced quote on the very last line presumably surfaces as EOFError -- confirm this is intended.
          while multiline
            next_line = fh.readline(options[:row_sep])
            next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
            line += next_line
            @file_line_count += 1

            break if fh.eof? # Exit loop if end of file is reached

            multiline = count_quote_chars(line, options[:quote_char]).odd?
          end

          # :nocov:
          if multiline && @verbose
            print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
          end
          # :nocov:

          line.chomp!(options[:row_sep])

          # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
          dataA, _data_size = parse(line, options, header_size)

          dataA.map!{|x| x.strip} if options[:strip_whitespace]

          # if all values are blank, then ignore this line
          next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

          # --- HASH TRANSFORMATIONS ------------------------------------------------------------
          hash = @headers.zip(dataA).to_h

          hash = hash_transformations(hash, options)

          # --- HASH VALIDATIONS ----------------------------------------------------------------
          # will go here, and be able to:
          #  - validate correct format of the values for fields
          #  - required fields to be non-empty
          #  - ...
          # -------------------------------------------------------------------------------------

          next if options[:remove_empty_hashes] && hash.empty?

          puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
          # optional adding of csv_line_number to the hash to help debugging
          hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

          # process the chunks or the resulting hash
          if use_chunks
            chunk << hash # append temp result to chunk

            if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
              # do something with the chunk
              if block_given?
                yield chunk # do something with the hashes in the chunk in the block
              else
                @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
              end
              @chunk_count += 1
              chunk.clear # re-initialize for next chunk of data
            else
              # the last chunk may contain partial data, which is handled below
            end
            # while a chunk is being filled up we don't need to do anything else here

          else # no chunk handling
            if block_given?
              yield [hash] # do something with the hash in the block (better to use chunking here)
            else
              @result << hash
            end
          end
        end

        # print new line to retain last processing line message
        print "\n" if @verbose

        # handling of last chunk (chunk is nil when chunking is off):
        if !chunk.nil? && chunk.size > 0
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          # chunk = [] # initialize for next chunk of data
        end
      ensure
        fh.close if fh.respond_to?(:close)
      end

      if block_given?
        @chunk_count # when we do processing through a block we only care how many chunks we processed
      else
        @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
      end
    end

    # Counts occurrences of quote_char in line, skipping any character that
    # directly follows an unescaped backslash.
    #
    # @return [Integer] 0 when line is nil or quote_char is nil/empty
    def count_quote_chars(line, quote_char)
      return 0 if line.nil? || quote_char.nil? || quote_char.empty?

      count = 0
      escaped = false

      line.each_char do |char|
        if char == '\\' && !escaped
          escaped = true
        else
          count += 1 if char == quote_char && !escaped
          escaped = false
        end
      end

      count
    end

    protected

    # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
    # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
    BLANK_RE = /\A\s*\z/.freeze

    # True for nil, whitespace-only Strings, and Arrays/Hashes whose
    # elements/values are all blank; false for any other object.
    def blank?(value)
      case value
      when String
        BLANK_RE.match?(value)
      when NilClass
        true
      when Array
        value.all? { |elem| blank?(elem) }
      when Hash
        value.values.all? { |elem| blank?(elem) } # Focus on values only
      else
        false
      end
    end

    private

    # Tags the line as UTF-8 (in place) and re-encodes it, replacing invalid or
    # undefined sequences with options[:invalid_byte_sequence].
    def enforce_utf8_encoding(line, options)
      # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module SmarterCSV
2
- VERSION = "1.1.5"
4
+ VERSION = "1.12.1"
3
5
  end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
require 'tempfile'

module SmarterCSV
  #
  # Generate CSV files
  #
  # Create an instance of the Writer class with the filename and options.
  # call `<<` one or multiple times to append data to the file.
  # call `finalize` to save the file.
  #
  # The `<<` method can take different arguments:
  #  * a single Hash
  #  * an array of Hashes
  #  * nested arrays of arrays of Hashes
  #
  # By default SmarterCSV::Writer automatically discovers all headers that are present
  # in the data on-the-fly. This can be disabled, then only given headers are used.
  # Disabling can be useful when you want to select attributes from hashes, or ActiveRecord instances.
  #
  # If `discover_headers` is enabled, and headers are given, any new headers that are found in the data will still be appended.
  #
  # The Writer automatically quotes fields containing the col_sep, row_sep, or the quote_char.
  # Quote characters inside a quoted field are doubled, so the field survives a round trip.
  #
  # Options:
  #   col_sep : defaults to , but can be set to any other character
  #   row_sep : defaults to $/ (normally LF \n), but can be set to \r\n or \r or anything else
  #   quote_char : defaults to "
  #   discover_headers : defaults to true, unless :headers or :map_headers is given
  #   headers : defaults to []
  #   force_quotes: defaults to false
  #   map_headers: defaults to {}, can be a hash of key -> value mappings
  #
  # IMPORTANT NOTES:
  #  * Data hashes could contain strings or symbols as keys.
  #    Make sure to use the correct form when specifying headers manually,
  #    in combination with the :discover_headers option
  #  * Rows are written as they arrive; when a later row introduces a new
  #    header, rows written earlier are not padded with the extra column.
  #
  class Writer
    # Declared inside the class so the readers exist on Writer instances.
    attr_reader :options, :row_sep, :col_sep, :quote_char, :force_quotes, :discover_headers, :headers, :map_headers, :output_file

    # Opens file_path for writing (truncating it) and a temp file that buffers
    # the data rows until #finalize knows the complete header line.
    #
    # @param file_path [String] destination CSV file
    # @param options [Hash] see the option list above
    def initialize(file_path, options = {})
      @options = options

      @row_sep = options[:row_sep] || $/ # Defaults to system's row separator. RFC4180 "\r\n"
      @col_sep = options[:col_sep] || ','
      @quote_char = options[:quote_char] || '"'
      @force_quotes = options[:force_quotes] == true
      @discover_headers =
        if options.key?(:discover_headers)
          # passing in the option overrides the default behavior
          options[:discover_headers] == true
        else
          # disable discover_headers when headers are given explicitly
          !(options.key?(:map_headers) || options.key?(:headers))
        end
      @map_headers = options[:map_headers] || {}
      @headers =
        if options.key?(:headers)
          # copy: header discovery appends to @headers and must not mutate the caller's array
          (options[:headers] || []).dup
        elsif options.key?(:map_headers)
          @map_headers.keys
        else
          [] # start with empty headers
        end

      @output_file = File.open(file_path, 'w+')
      # hidden state:
      @temp_file = Tempfile.new('tempfile') # system temp dir instead of a hard-coded /tmp
      @quote_regex = Regexp.union(@col_sep, @row_sep, @quote_char)
    end

    # Appends data; this can be called many times in order to append lines to the csv file.
    #
    # @param data [Hash, Array, nil] a row, (nested) arrays of rows, or nil (ignored)
    # @raise [InvalidInputData] for any other type
    def <<(data)
      case data
      when Hash
        process_hash(data)
      when Array
        data.each { |item| self << item }
      when NilClass
        # ignore
      else
        raise InvalidInputData, "Invalid data type: #{data.class}. Must be a Hash or an Array."
      end
    end

    # Writes the header line followed by all buffered rows to the output file,
    # closes it, and removes the temp file. Call exactly once, after the last `<<`.
    def finalize
      # Map headers if :map_headers option is provided
      mapped_headers = @headers.map { |header| @map_headers[header] || header }

      @temp_file.rewind
      @output_file.write(mapped_headers.join(@col_sep) + @row_sep)
      @output_file.write(@temp_file.read)
      @output_file.flush
      @output_file.close
      @temp_file.close! # close AND unlink, so the file descriptor is released too
    end

    private

    # Registers newly seen keys (when discovery is on) and buffers one row.
    def process_hash(hash)
      @headers.concat(hash.keys - @headers) if @discover_headers

      # Order the values by the current headers; only missing/nil values become
      # empty fields, so a literal false is written as "false".
      row = @headers.map do |header|
        value = hash[header]
        escape_csv_field(value.nil? ? '' : value)
      end

      @temp_file.write(row.join(@col_sep) + @row_sep)
    end

    # Returns the field as a String, wrapped in the configured quote_char when
    # force_quotes is set or the text contains col_sep, row_sep or quote_char.
    def escape_csv_field(field)
      text = field.to_s
      return text unless @force_quotes || @quote_regex.match?(text)

      # block form of gsub: the replacement is taken literally (no backslash escapes)
      doubled = text.gsub(@quote_char) { @quote_char * 2 }
      "#{@quote_char}#{doubled}#{@quote_char}"
    end
  end
end
data/lib/smarter_csv.rb CHANGED
@@ -1,4 +1,92 @@
1
- require 'csv'
1
+ # frozen_string_literal: true
2
+
2
3
  require "smarter_csv/version"
3
- require "extensions/hash.rb"
4
- require "smarter_csv/smarter_csv.rb"
4
+ require "smarter_csv/errors"
5
+
6
+ require "smarter_csv/file_io"
7
+ require "smarter_csv/options"
8
+ require "smarter_csv/auto_detection"
9
+ require 'smarter_csv/header_transformations'
10
+ require 'smarter_csv/header_validations'
11
+ require "smarter_csv/headers"
12
+ require "smarter_csv/hash_transformations"
13
+
14
+ require "smarter_csv/parser"
15
+ require "smarter_csv/writer"
16
+ require "smarter_csv/reader"
17
+
18
+ # load the C-extension:
19
# Best-effort load of the native extension. When it cannot be loaded the
# library keeps working (the Reader checks at runtime whether
# SmarterCSV::Parser responds to parse_csv_line_c).
case RUBY_ENGINE
when 'ruby'
  begin
    if `uname -s`.chomp == 'Darwin'
      #
      # Please report if you see cases where the rake-compiler is building x86_64 code on arm64 cpus:
      # https://github.com/rake-compiler/rake-compiler/issues/231
      #
      require 'smarter_csv/smarter_csv.bundle'
    else
      # :nocov:
      require_relative "smarter_csv/smarter_csv"
      # :nocov:
    end
  rescue LoadError, StandardError
    # LoadError: the extension is not compiled/installed (LoadError is not a StandardError).
    # StandardError: e.g. `uname` is not available on this platform.
    # Deliberately NOT rescuing Exception, so Interrupt, SystemExit and
    # NoMemoryError raised while loading still propagate.
    #
    # require_relative 'smarter_csv/smarter_csv'
  end
  # :nocov:
  # when 'truffleruby'
  #   puts "\n\n truffleruby case in the load path | RUBY_ENGINE: #{RUBY_ENGINE} , #{RUBY_VERSION}\n\n"
  #   # this might not work - if you encounter problems, please contribute and create a PR
  #   # require 'truffleruby/smarter_csv'
else
  puts <<-BLOCK_COMMENT

  -------------------------------------------------------------------------
  RUBY_ENGINE: #{RUBY_ENGINE} , #{RUBY_VERSION}

  Acceleration via C-Extension is currently not supported for #{RUBY_ENGINE}

  Please contribute and create a pull request if you need this
  -------------------------------------------------------------------------

  BLOCK_COMMENT
end
54
+ # :nocov:
55
+
56
module SmarterCSV
  # For backwards compatibility:
  #
  # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
  # e.g. you need the instance of the Reader to access the original headers
  #
  # Please use this instead:
  #
  #   reader = SmarterCSV::Reader.new(input, options)
  #   reader.process # with or without block
  #
  # @param input a filename or readable input object, handed to Reader.new
  # @param given_options [Hash] reader options, handed to Reader.new
  # @return whatever Reader#process returns for the given block / no block
  def self.process(input, given_options = {}, &block)
    reader = Reader.new(input, given_options)
    reader.process(&block)
  end

  # Convenience method for generating CSV files:
  #
  #   SmarterCSV.generate(filename, options) do |csv_writer|
  #     MyModel.find_in_batches(batch_size: 100) do |batch|
  #       batch.pluck(:name, :description, :instructor).each do |record|
  #         csv_writer << record
  #       end
  #     end
  #   end
  #
  # The writer is finalized even when the block raises.
  #
  # @param filename [String] destination file, handed to Writer.new
  # @param options [Hash] writer options, handed to Writer.new
  # @return the value of the block
  # @raise [RuntimeError] when called without a block
  #
  # rubocop:disable Lint/UnusedMethodArgument
  def self.generate(filename, options = {}, &block)
    raise 'SmarterCSV.generate requires a block' unless block_given?

    writer = Writer.new(filename, options)
    yield writer
  ensure
    # writer is nil when no block was given or Writer.new itself raised;
    # calling finalize on nil would replace the real error with NoMethodError.
    writer&.finalize
  end
  # rubocop:enable Lint/UnusedMethodArgument
end