csv 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/csv.rb CHANGED
@@ -93,36 +93,22 @@ require "forwardable"
93
93
  require "English"
94
94
  require "date"
95
95
  require "stringio"
96
- require_relative "csv/table"
97
- require_relative "csv/row"
98
-
99
- # This provides String#match? and Regexp#match? for Ruby 2.3.
100
- unless String.method_defined?(:match?)
101
- class CSV
102
- module MatchP
103
- refine String do
104
- def match?(pattern)
105
- self =~ pattern
106
- end
107
- end
108
96
 
109
- refine Regexp do
110
- def match?(string)
111
- self =~ string
112
- end
113
- end
114
- end
115
- end
97
+ require_relative "csv/fields_converter"
98
+ require_relative "csv/match_p"
99
+ require_relative "csv/parser"
100
+ require_relative "csv/row"
101
+ require_relative "csv/table"
102
+ require_relative "csv/writer"
116
103
 
117
- using CSV::MatchP
118
- end
104
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
119
105
 
120
106
  #
121
107
  # This class provides a complete interface to CSV files and data. It offers
122
108
  # tools to enable you to read and write to and from Strings or IO objects, as
123
109
  # needed.
124
110
  #
125
- # The most generic interface of a class is:
111
+ # The most generic interface of the library is:
126
112
  #
127
113
  # csv = CSV.new(string_or_io, **options)
128
114
  #
@@ -141,7 +127,7 @@ end
141
127
  # There are several specialized class methods for one-statement reading or writing,
142
128
  # described in the Specialized Methods section.
143
129
  #
144
- # If a String passed into ::new, it is internally wrapped into a StringIO object.
130
+ # If a String is passed into ::new, it is internally wrapped into a StringIO object.
145
131
  #
146
132
  # +options+ can be used for specifying the particular CSV flavor (column
147
133
  # separators, row separators, value quoting and so on), and for data conversion,
@@ -204,18 +190,18 @@ end
204
190
  # # Headers are part of data
205
191
  # data = CSV.parse(<<~ROWS, headers: true)
206
192
  # Name,Department,Salary
207
- # Bob,Engeneering,1000
193
+ # Bob,Engineering,1000
208
194
  # Jane,Sales,2000
209
195
  # John,Management,5000
210
196
  # ROWS
211
197
  #
212
198
  # data.class #=> CSV::Table
213
- # data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engeneering" "Salary":"1000">
214
- # data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engeneering", "Salary"=>"1000"}
199
+ # data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engineering" "Salary":"1000">
200
+ # data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engineering", "Salary"=>"1000"}
215
201
  #
216
202
  # # Headers provided by developer
217
203
  # data = CSV.parse('Bob,Engineering,1000', headers: %i[name department salary])
218
- # data.first #=> #<CSV::Row name:"Bob" department:"Engeneering" salary:"1000">
204
+ # data.first #=> #<CSV::Row name:"Bob" department:"Engineering" salary:"1000">
219
205
  #
220
206
  # === Typed data reading
221
207
  #
@@ -411,6 +397,7 @@ class CSV
411
397
  # <b><tt>:force_quotes</tt></b>:: +false+
412
398
  # <b><tt>:skip_lines</tt></b>:: +nil+
413
399
  # <b><tt>:liberal_parsing</tt></b>:: +false+
400
+ # <b><tt>:quote_empty</tt></b>:: +true+
414
401
  #
415
402
  DEFAULT_OPTIONS = {
416
403
  col_sep: ",",
@@ -426,6 +413,7 @@ class CSV
426
413
  force_quotes: false,
427
414
  skip_lines: nil,
428
415
  liberal_parsing: false,
416
+ quote_empty: true,
429
417
  }.freeze
430
418
 
431
419
  #
@@ -516,9 +504,9 @@ class CSV
516
504
  # <tt>encoding: "UTF-32BE:UTF-8"</tt> would read UTF-32BE data from the file
517
505
  # but transcode it to UTF-8 before CSV parses it.
518
506
  #
519
- def self.foreach(path, **options, &block)
520
- return to_enum(__method__, path, options) unless block_given?
521
- open(path, options) do |csv|
507
+ def self.foreach(path, mode="r", **options, &block)
508
+ return to_enum(__method__, path, mode, options) unless block_given?
509
+ open(path, mode, options) do |csv|
522
510
  csv.each(&block)
523
511
  end
524
512
  end
@@ -548,7 +536,7 @@ class CSV
548
536
  str.seek(0, IO::SEEK_END)
549
537
  else
550
538
  encoding = options[:encoding]
551
- str = String.new
539
+ str = +""
552
540
  str.force_encoding(encoding) if encoding
553
541
  end
554
542
  csv = new(str, options) # wrap
@@ -571,11 +559,11 @@ class CSV
571
559
  #
572
560
  def self.generate_line(row, **options)
573
561
  options = {row_sep: $INPUT_RECORD_SEPARATOR}.merge(options)
574
- str = String.new
562
+ str = +""
575
563
  if options[:encoding]
576
564
  str.force_encoding(options[:encoding])
577
- elsif field = row.find { |f| not f.nil? }
578
- str.force_encoding(String(field).encoding)
565
+ elsif field = row.find {|f| f.is_a?(String)}
566
+ str.force_encoding(field.encoding)
579
567
  end
580
568
  (new(str, options) << row).string
581
569
  end
@@ -890,83 +878,135 @@ class CSV
890
878
  # attempt to parse input not conformant
891
879
  # with RFC 4180, such as double quotes
892
880
  # in unquoted fields.
893
- # <b><tt>:nil_value</tt></b>:: TODO: WRITE ME.
894
- # <b><tt>:empty_value</tt></b>:: TODO: WRITE ME.
881
+ # <b><tt>:nil_value</tt></b>:: When set to an object, any values of an
882
+ # empty field are replaced by the set
883
+ # object, not nil.
884
+ # <b><tt>:empty_value</tt></b>:: When set to an object, any values of a
885
+ # blank string field are replaced by
886
+ # the set object.
887
+ # <b><tt>:quote_empty</tt></b>:: TODO
888
+ # <b><tt>:write_converters</tt></b>:: TODO
889
+ # <b><tt>:write_nil_value</tt></b>:: TODO
890
+ # <b><tt>:write_empty_value</tt></b>:: TODO
891
+ # <b><tt>:strip</tt></b>:: TODO
895
892
  #
896
893
  # See CSV::DEFAULT_OPTIONS for the default settings.
897
894
  #
898
895
  # Options cannot be overridden in the instance methods for performance reasons,
899
896
  # so be sure to set what you want here.
900
897
  #
901
- def initialize(data, col_sep: ",", row_sep: :auto, quote_char: '"', field_size_limit: nil,
902
- converters: nil, unconverted_fields: nil, headers: false, return_headers: false,
903
- write_headers: nil, header_converters: nil, skip_blanks: false, force_quotes: false,
904
- skip_lines: nil, liberal_parsing: false, internal_encoding: nil, external_encoding: nil, encoding: nil,
898
+ def initialize(data,
899
+ col_sep: ",",
900
+ row_sep: :auto,
901
+ quote_char: '"',
902
+ field_size_limit: nil,
903
+ converters: nil,
904
+ unconverted_fields: nil,
905
+ headers: false,
906
+ return_headers: false,
907
+ write_headers: nil,
908
+ header_converters: nil,
909
+ skip_blanks: false,
910
+ force_quotes: false,
911
+ skip_lines: nil,
912
+ liberal_parsing: false,
913
+ internal_encoding: nil,
914
+ external_encoding: nil,
915
+ encoding: nil,
905
916
  nil_value: nil,
906
- empty_value: "")
917
+ empty_value: "",
918
+ quote_empty: true,
919
+ write_converters: nil,
920
+ write_nil_value: nil,
921
+ write_empty_value: "",
922
+ strip: false)
907
923
  raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
908
924
 
909
925
  # create the IO object we will read from
910
926
  @io = data.is_a?(String) ? StringIO.new(data) : data
911
927
  @encoding = determine_encoding(encoding, internal_encoding)
912
- #
913
- # prepare for building safe regular expressions in the target encoding,
914
- # if we can transcode the needed characters
915
- #
916
- @re_esc = "\\".encode(@encoding).freeze rescue ""
917
- @re_chars = /#{%"[-\\]\\[\\.^$?*+{}()|# \r\n\t\f\v]".encode(@encoding)}/
918
- @unconverted_fields = unconverted_fields
919
-
920
- # Stores header row settings and loads header converters, if needed.
921
- @use_headers = headers
922
- @return_headers = return_headers
923
- @write_headers = write_headers
924
-
925
- # headers must be delayed until shift(), in case they need a row of content
926
- @headers = nil
927
-
928
- @nil_value = nil_value
929
- @empty_value = empty_value
930
- @empty_value_is_empty_string = (empty_value == "")
931
-
932
- init_separators(col_sep, row_sep, quote_char, force_quotes)
933
- init_parsers(skip_blanks, field_size_limit, liberal_parsing)
934
- init_converters(converters, :@converters, :convert)
935
- init_converters(header_converters, :@header_converters, :header_convert)
936
- init_comments(skip_lines)
937
-
938
- @force_encoding = !!encoding
939
-
940
- # track our own lineno since IO gets confused about line-ends is CSV fields
941
- @lineno = 0
942
-
943
- # make sure headers have been assigned
944
- if header_row? and [Array, String].include? @use_headers.class and @write_headers
945
- parse_headers # won't read data for Array or String
946
- self << @headers
947
- end
928
+
929
+ @base_fields_converter_options = {
930
+ nil_value: nil_value,
931
+ empty_value: empty_value,
932
+ }
933
+ @write_fields_converter_options = {
934
+ nil_value: write_nil_value,
935
+ empty_value: write_empty_value,
936
+ }
937
+ @initial_converters = converters
938
+ @initial_header_converters = header_converters
939
+ @initial_write_converters = write_converters
940
+
941
+ @parser_options = {
942
+ column_separator: col_sep,
943
+ row_separator: row_sep,
944
+ quote_character: quote_char,
945
+ field_size_limit: field_size_limit,
946
+ unconverted_fields: unconverted_fields,
947
+ headers: headers,
948
+ return_headers: return_headers,
949
+ skip_blanks: skip_blanks,
950
+ skip_lines: skip_lines,
951
+ liberal_parsing: liberal_parsing,
952
+ encoding: @encoding,
953
+ nil_value: nil_value,
954
+ empty_value: empty_value,
955
+ strip: strip,
956
+ }
957
+ @parser = nil
958
+ @parser_enumerator = nil
959
+ @eof_error = nil
960
+
961
+ @writer_options = {
962
+ encoding: @encoding,
963
+ force_encoding: (not encoding.nil?),
964
+ force_quotes: force_quotes,
965
+ headers: headers,
966
+ write_headers: write_headers,
967
+ column_separator: col_sep,
968
+ row_separator: row_sep,
969
+ quote_character: quote_char,
970
+ quote_empty: quote_empty,
971
+ }
972
+
973
+ @writer = nil
974
+ writer if @writer_options[:write_headers]
948
975
  end
949
976
 
950
977
  #
951
978
  # The encoded <tt>:col_sep</tt> used in parsing and writing. See CSV::new
952
979
  # for details.
953
980
  #
954
- attr_reader :col_sep
981
+ def col_sep
982
+ parser.column_separator
983
+ end
984
+
955
985
  #
956
986
  # The encoded <tt>:row_sep</tt> used in parsing and writing. See CSV::new
957
987
  # for details.
958
988
  #
959
- attr_reader :row_sep
989
+ def row_sep
990
+ parser.row_separator
991
+ end
992
+
960
993
  #
961
994
  # The encoded <tt>:quote_char</tt> used in parsing and writing. See CSV::new
962
995
  # for details.
963
996
  #
964
- attr_reader :quote_char
997
+ def quote_char
998
+ parser.quote_character
999
+ end
1000
+
965
1001
  # The limit for field size, if any. See CSV::new for details.
966
- attr_reader :field_size_limit
1002
+ def field_size_limit
1003
+ parser.field_size_limit
1004
+ end
967
1005
 
968
1006
  # The regex marking a line as a comment. See CSV::new for details
969
- attr_reader :skip_lines
1007
+ def skip_lines
1008
+ parser.skip_lines
1009
+ end
970
1010
 
971
1011
  #
972
1012
  # Returns the current list of converters in effect. See CSV::new for details.
@@ -974,7 +1014,7 @@ class CSV
974
1014
  # as is.
975
1015
  #
976
1016
  def converters
977
- @converters.map do |converter|
1017
+ parser_fields_converter.map do |converter|
978
1018
  name = Converters.rassoc(converter)
979
1019
  name ? name.first : converter
980
1020
  end
@@ -983,42 +1023,68 @@ class CSV
983
1023
  # Returns +true+ if unconverted_fields() is added to parsed results. See CSV::new
984
1024
  # for details.
985
1025
  #
986
- def unconverted_fields?() @unconverted_fields end
1026
+ def unconverted_fields?
1027
+ parser.unconverted_fields?
1028
+ end
1029
+
987
1030
  #
988
1031
  # Returns +nil+ if headers will not be used, +true+ if they will but have not
989
1032
  # yet been read, or the actual headers after they have been read. See
990
1033
  # CSV::new for details.
991
1034
  #
992
1035
  def headers
993
- @headers || true if @use_headers
1036
+ if @writer
1037
+ @writer.headers
1038
+ else
1039
+ parsed_headers = parser.headers
1040
+ return parsed_headers if parsed_headers
1041
+ raw_headers = @parser_options[:headers]
1042
+ raw_headers = nil if raw_headers == false
1043
+ raw_headers
1044
+ end
994
1045
  end
995
1046
  #
996
1047
  # Returns +true+ if headers will be returned as a row of results.
997
1048
  # See CSV::new for details.
998
1049
  #
999
- def return_headers?() @return_headers end
1050
+ def return_headers?
1051
+ parser.return_headers?
1052
+ end
1053
+
1000
1054
  # Returns +true+ if headers are written in output. See CSV::new for details.
1001
- def write_headers?() @write_headers end
1055
+ def write_headers?
1056
+ @writer_options[:write_headers]
1057
+ end
1058
+
1002
1059
  #
1003
1060
  # Returns the current list of converters in effect for headers. See CSV::new
1004
1061
  # for details. Built-in converters will be returned by name, while others
1005
1062
  # will be returned as is.
1006
1063
  #
1007
1064
  def header_converters
1008
- @header_converters.map do |converter|
1065
+ header_fields_converter.map do |converter|
1009
1066
  name = HeaderConverters.rassoc(converter)
1010
1067
  name ? name.first : converter
1011
1068
  end
1012
1069
  end
1070
+
1013
1071
  #
1014
1072
  # Returns +true+ if blank lines are skipped by the parser. See CSV::new
1015
1073
  # for details.
1016
1074
  #
1017
- def skip_blanks?() @skip_blanks end
1075
+ def skip_blanks?
1076
+ parser.skip_blanks?
1077
+ end
1078
+
1018
1079
  # Returns +true+ if all output fields are quoted. See CSV::new for details.
1019
- def force_quotes?() @force_quotes end
1080
+ def force_quotes?
1081
+ @writer_options[:force_quotes]
1082
+ end
1083
+
1020
1084
  # Returns +true+ if illegal input is handled. See CSV::new for details.
1021
- def liberal_parsing?() @liberal_parsing end
1085
+ def liberal_parsing?
1086
+ parser.liberal_parsing?
1087
+ end
1022
1088
 
1023
1089
  #
1024
1090
  # The Encoding CSV is parsing or writing in. This will be the Encoding you
@@ -1027,26 +1093,90 @@ class CSV
1027
1093
  attr_reader :encoding
1028
1094
 
1029
1095
  #
1030
- # The line number of the last row read from this file. Fields with nested
1096
+ # The line number of the last row read from this file. Fields with nested
1031
1097
  # line-end characters will not affect this count.
1032
1098
  #
1033
- attr_reader :lineno, :line
1099
+ def lineno
1100
+ if @writer
1101
+ @writer.lineno
1102
+ else
1103
+ parser.lineno
1104
+ end
1105
+ end
1106
+
1107
+ #
1108
+ # The last row read from this file.
1109
+ #
1110
+ def line
1111
+ parser.line
1112
+ end
1034
1113
 
1035
1114
  ### IO and StringIO Delegation ###
1036
1115
 
1037
1116
  extend Forwardable
1038
- def_delegators :@io, :binmode, :binmode?, :close, :close_read, :close_write,
1039
- :closed?, :eof, :eof?, :external_encoding, :fcntl,
1040
- :fileno, :flock, :flush, :fsync, :internal_encoding,
1041
- :ioctl, :isatty, :path, :pid, :pos, :pos=, :reopen,
1042
- :seek, :stat, :string, :sync, :sync=, :tell, :to_i,
1043
- :to_io, :truncate, :tty?
1117
+ def_delegators :@io, :binmode, :close, :close_read, :close_write,
1118
+ :closed?, :external_encoding, :fcntl,
1119
+ :fileno, :flush, :fsync, :internal_encoding,
1120
+ :isatty, :pid, :pos, :pos=, :reopen,
1121
+ :seek, :string, :sync, :sync=, :tell,
1122
+ :truncate, :tty?
1123
+
1124
+ def binmode?
1125
+ if @io.respond_to?(:binmode?)
1126
+ @io.binmode?
1127
+ else
1128
+ false
1129
+ end
1130
+ end
1131
+
1132
+ def flock(*args)
1133
+ raise NotImplementedError unless @io.respond_to?(:flock)
1134
+ @io.flock(*args)
1135
+ end
1136
+
1137
+ def ioctl(*args)
1138
+ raise NotImplementedError unless @io.respond_to?(:ioctl)
1139
+ @io.ioctl(*args)
1140
+ end
1141
+
1142
+ def path
1143
+ @io.path if @io.respond_to?(:path)
1144
+ end
1145
+
1146
+ def stat(*args)
1147
+ raise NotImplementedError unless @io.respond_to?(:stat)
1148
+ @io.stat(*args)
1149
+ end
1150
+
1151
+ def to_i
1152
+ raise NotImplementedError unless @io.respond_to?(:to_i)
1153
+ @io.to_i
1154
+ end
1155
+
1156
+ def to_io
1157
+ @io.respond_to?(:to_io) ? @io.to_io : @io
1158
+ end
1159
+
1160
+ def eof?
1161
+ return false if @eof_error
1162
+ begin
1163
+ parser_enumerator.peek
1164
+ false
1165
+ rescue MalformedCSVError => error
1166
+ @eof_error = error
1167
+ false
1168
+ rescue StopIteration
1169
+ true
1170
+ end
1171
+ end
1172
+ alias_method :eof, :eof?
1044
1173
 
1045
1174
  # Rewinds the underlying IO object and resets CSV's lineno() counter.
1046
1175
  def rewind
1047
- @headers = nil
1048
- @lineno = 0
1049
-
1176
+ @parser = nil
1177
+ @parser_enumerator = nil
1178
+ @eof_error = nil
1179
+ @writer.rewind if @writer
1050
1180
  @io.rewind
1051
1181
  end
1052
1182
 
@@ -1060,34 +1190,8 @@ class CSV
1060
1190
  # The data source must be open for writing.
1061
1191
  #
1062
1192
  def <<(row)
1063
- # make sure headers have been assigned
1064
- if header_row? and [Array, String].include? @use_headers.class and !@write_headers
1065
- parse_headers # won't read data for Array or String
1066
- end
1067
-
1068
- # handle CSV::Row objects and Hashes
1069
- row = case row
1070
- when self.class::Row then row.fields
1071
- when Hash then @headers.map { |header| row[header] }
1072
- else row
1073
- end
1074
-
1075
- @headers = row if header_row?
1076
- @lineno += 1
1077
-
1078
- output = row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
1079
- if @io.is_a?(StringIO) and
1080
- output.encoding != (encoding = raw_encoding)
1081
- if @force_encoding
1082
- output = output.encode(encoding)
1083
- elsif (compatible_encoding = Encoding.compatible?(@io.string, output))
1084
- @io.set_encoding(compatible_encoding)
1085
- @io.seek(0, IO::SEEK_END)
1086
- end
1087
- end
1088
- @io << output
1089
-
1090
- self # for chaining
1193
+ writer << row
1194
+ self
1091
1195
  end
1092
1196
  alias_method :add_row, :<<
1093
1197
  alias_method :puts, :<<
@@ -1108,7 +1212,7 @@ class CSV
1108
1212
  # converted field or the field itself.
1109
1213
  #
1110
1214
  def convert(name = nil, &converter)
1111
- add_converter(:@converters, self.class::Converters, name, &converter)
1215
+ parser_fields_converter.add_converter(name, &converter)
1112
1216
  end
1113
1217
 
1114
1218
  #
@@ -1123,10 +1227,7 @@ class CSV
1123
1227
  # effect.
1124
1228
  #
1125
1229
  def header_convert(name = nil, &converter)
1126
- add_converter( :@header_converters,
1127
- self.class::HeaderConverters,
1128
- name,
1129
- &converter )
1230
+ header_fields_converter.add_converter(name, &converter)
1130
1231
  end
1131
1232
 
1132
1233
  include Enumerable
@@ -1138,14 +1239,8 @@ class CSV
1138
1239
  #
1139
1240
  # The data source must be open for reading.
1140
1241
  #
1141
- def each
1142
- if block_given?
1143
- while row = shift
1144
- yield row
1145
- end
1146
- else
1147
- to_enum
1148
- end
1242
+ def each(&block)
1243
+ parser_enumerator.each(&block)
1149
1244
  end
1150
1245
 
1151
1246
  #
@@ -1155,8 +1250,8 @@ class CSV
1155
1250
  #
1156
1251
  def read
1157
1252
  rows = to_a
1158
- if @use_headers
1159
- Table.new(rows)
1253
+ if parser.use_headers?
1254
+ Table.new(rows, headers: parser.headers)
1160
1255
  else
1161
1256
  rows
1162
1257
  end
@@ -1165,7 +1260,7 @@ class CSV
1165
1260
 
1166
1261
  # Returns +true+ if the next row read will be a header row.
1167
1262
  def header_row?
1168
- @use_headers and @headers.nil?
1263
+ parser.header_row?
1169
1264
  end
1170
1265
 
1171
1266
  #
@@ -1176,171 +1271,14 @@ class CSV
1176
1271
  # The data source must be open for reading.
1177
1272
  #
1178
1273
  def shift
1179
- #########################################################################
1180
- ### This method is purposefully kept a bit long as simple conditional ###
1181
- ### checks are faster than numerous (expensive) method calls. ###
1182
- #########################################################################
1183
-
1184
- # handle headers not based on document content
1185
- if header_row? and @return_headers and
1186
- [Array, String].include? @use_headers.class
1187
- if @unconverted_fields
1188
- return add_unconverted_fields(parse_headers, Array.new)
1189
- else
1190
- return parse_headers
1191
- end
1274
+ if @eof_error
1275
+ eof_error, @eof_error = @eof_error, nil
1276
+ raise eof_error
1192
1277
  end
1193
-
1194
- #
1195
- # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
1196
- # because of \r and/or \n characters embedded in quoted fields
1197
- #
1198
- in_extended_col = false
1199
- csv = Array.new
1200
-
1201
- loop do
1202
- # add another read to the line
1203
- unless parse = @io.gets(@row_sep)
1204
- return nil
1205
- end
1206
-
1207
- if in_extended_col
1208
- @line.concat(parse)
1209
- else
1210
- @line = parse.clone
1211
- end
1212
-
1213
- begin
1214
- parse.sub!(@parsers[:line_end], "")
1215
- rescue ArgumentError
1216
- unless parse.valid_encoding?
1217
- message = "Invalid byte sequence in #{parse.encoding}"
1218
- raise MalformedCSVError.new(message, lineno + 1)
1219
- end
1220
- raise
1221
- end
1222
-
1223
- if csv.empty?
1224
- #
1225
- # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
1226
- # CSV's <tt>[nil]</tt>
1227
- #
1228
- if parse.empty?
1229
- @lineno += 1
1230
- if @skip_blanks
1231
- next
1232
- elsif @unconverted_fields
1233
- return add_unconverted_fields(Array.new, Array.new)
1234
- elsif @use_headers
1235
- return self.class::Row.new(@headers, Array.new)
1236
- else
1237
- return Array.new
1238
- end
1239
- end
1240
- end
1241
-
1242
- next if @skip_lines and @skip_lines.match parse
1243
-
1244
- parts = parse.split(@col_sep_split_separator, -1)
1245
- if parts.empty?
1246
- if in_extended_col
1247
- csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop
1248
- else
1249
- csv << nil
1250
- end
1251
- end
1252
-
1253
- # This loop is the hot path of csv parsing. Some things may be non-dry
1254
- # for a reason. Make sure to benchmark when refactoring.
1255
- parts.each do |part|
1256
- if in_extended_col
1257
- # If we are continuing a previous column
1258
- if part.end_with?(@quote_char) && part.count(@quote_char) % 2 != 0
1259
- # extended column ends
1260
- csv.last << part[0..-2]
1261
- if csv.last.match?(@parsers[:stray_quote])
1262
- raise MalformedCSVError.new("Missing or stray quote",
1263
- lineno + 1)
1264
- end
1265
- csv.last.gsub!(@double_quote_char, @quote_char)
1266
- in_extended_col = false
1267
- else
1268
- csv.last << part << @col_sep
1269
- end
1270
- elsif part.start_with?(@quote_char)
1271
- # If we are starting a new quoted column
1272
- if part.count(@quote_char) % 2 != 0
1273
- # start an extended column
1274
- csv << (part[1..-1] << @col_sep)
1275
- in_extended_col = true
1276
- elsif part.end_with?(@quote_char)
1277
- # regular quoted column
1278
- csv << part[1..-2]
1279
- if csv.last.match?(@parsers[:stray_quote])
1280
- raise MalformedCSVError.new("Missing or stray quote",
1281
- lineno + 1)
1282
- end
1283
- csv.last.gsub!(@double_quote_char, @quote_char)
1284
- elsif @liberal_parsing
1285
- csv << part
1286
- else
1287
- raise MalformedCSVError.new("Missing or stray quote",
1288
- lineno + 1)
1289
- end
1290
- elsif part.match?(@parsers[:quote_or_nl])
1291
- # Unquoted field with bad characters.
1292
- if part.match?(@parsers[:nl_or_lf])
1293
- message = "Unquoted fields do not allow \\r or \\n"
1294
- raise MalformedCSVError.new(message, lineno + 1)
1295
- else
1296
- if @liberal_parsing
1297
- csv << part
1298
- else
1299
- raise MalformedCSVError.new("Illegal quoting", lineno + 1)
1300
- end
1301
- end
1302
- else
1303
- # Regular ole unquoted field.
1304
- csv << (part.empty? ? nil : part)
1305
- end
1306
- end
1307
-
1308
- # Replace tacked on @col_sep with @row_sep if we are still in an extended
1309
- # column.
1310
- csv[-1][-1] = @row_sep if in_extended_col
1311
-
1312
- if in_extended_col
1313
- # if we're at eof?(), a quoted field wasn't closed...
1314
- if @io.eof?
1315
- raise MalformedCSVError.new("Unclosed quoted field",
1316
- lineno + 1)
1317
- elsif @field_size_limit and csv.last.size >= @field_size_limit
1318
- raise MalformedCSVError.new("Field size exceeded",
1319
- lineno + 1)
1320
- end
1321
- # otherwise, we need to loop and pull some more data to complete the row
1322
- else
1323
- @lineno += 1
1324
-
1325
- # save fields unconverted fields, if needed...
1326
- unconverted = csv.dup if @unconverted_fields
1327
-
1328
- if @use_headers
1329
- # parse out header rows and handle CSV::Row conversions...
1330
- csv = parse_headers(csv)
1331
- else
1332
- # convert fields, if needed...
1333
- csv = convert_fields(csv)
1334
- end
1335
-
1336
- # inject unconverted fields and accessor, if requested...
1337
- if @unconverted_fields and not csv.respond_to? :unconverted_fields
1338
- add_unconverted_fields(csv, unconverted)
1339
- end
1340
-
1341
- # return the results
1342
- break csv
1343
- end
1278
+ begin
1279
+ parser_enumerator.next
1280
+ rescue StopIteration
1281
+ nil
1344
1282
  end
1345
1283
  end
1346
1284
  alias_method :gets, :shift
@@ -1365,15 +1303,18 @@ class CSV
1365
1303
  # show encoding
1366
1304
  str << " encoding:" << @encoding.name
1367
1305
  # show other attributes
1368
- %w[ lineno col_sep row_sep
1369
- quote_char skip_blanks liberal_parsing ].each do |attr_name|
1370
- if a = instance_variable_get("@#{attr_name}")
1306
+ ["lineno", "col_sep", "row_sep", "quote_char"].each do |attr_name|
1307
+ if a = __send__(attr_name)
1371
1308
  str << " " << attr_name << ":" << a.inspect
1372
1309
  end
1373
1310
  end
1374
- if @use_headers
1375
- str << " headers:" << headers.inspect
1311
+ ["skip_blanks", "liberal_parsing"].each do |attr_name|
1312
+ if a = __send__("#{attr_name}?")
1313
+ str << " " << attr_name << ":" << a.inspect
1314
+ end
1376
1315
  end
1316
+ _headers = headers
1317
+ str << " headers:" << _headers.inspect if _headers
1377
1318
  str << ">"
1378
1319
  begin
1379
1320
  str.join('')
@@ -1389,7 +1330,7 @@ class CSV
1389
1330
 
1390
1331
  def determine_encoding(encoding, internal_encoding)
1391
1332
  # honor the IO encoding if we can, otherwise default to ASCII-8BIT
1392
- io_encoding = raw_encoding(nil)
1333
+ io_encoding = raw_encoding
1393
1334
  return io_encoding if io_encoding
1394
1335
 
1395
1336
  return Encoding.find(internal_encoding) if internal_encoding
@@ -1402,216 +1343,17 @@ class CSV
1402
1343
  Encoding.default_internal || Encoding.default_external
1403
1344
  end
1404
1345
 
1405
- #
1406
- # Stores the indicated separators for later use.
1407
- #
1408
- # If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
1409
- # ahead in the <tt>@io</tt> and try to find one. +ARGF+, +STDIN+, +STDOUT+,
1410
- # +STDERR+ and any stream open for output only with a default
1411
- # <tt>@row_sep</tt> of <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
1412
- #
1413
- # This method also establishes the quoting rules used for CSV output.
1414
- #
1415
- def init_separators(col_sep, row_sep, quote_char, force_quotes)
1416
- # store the selected separators
1417
- @col_sep = col_sep.to_s.encode(@encoding)
1418
- if @col_sep == " "
1419
- @col_sep_split_separator = Regexp.new(/#{Regexp.escape(@col_sep)}/)
1420
- else
1421
- @col_sep_split_separator = @col_sep
1422
- end
1423
- @row_sep = row_sep # encode after resolving :auto
1424
- @quote_char = quote_char.to_s.encode(@encoding)
1425
- @double_quote_char = @quote_char * 2
1426
-
1427
- if @quote_char.length != 1
1428
- raise ArgumentError, ":quote_char has to be a single character String"
1429
- end
1430
-
1431
- #
1432
- # automatically discover row separator when requested
1433
- # (not fully encoding safe)
1434
- #
1435
- if @row_sep == :auto
1436
- if [ARGF, STDIN, STDOUT, STDERR].include?(@io) or
1437
- (defined?(Zlib) and @io.class == Zlib::GzipWriter)
1438
- @row_sep = $INPUT_RECORD_SEPARATOR
1439
- else
1440
- begin
1441
- #
1442
- # remember where we were (pos() will raise an exception if @io is pipe
1443
- # or not opened for reading)
1444
- #
1445
- saved_pos = @io.pos
1446
- while @row_sep == :auto
1447
- #
1448
- # if we run out of data, it's probably a single line
1449
- # (ensure will set default value)
1450
- #
1451
- break unless sample = @io.gets(nil, 1024)
1452
-
1453
- cr = encode_str("\r")
1454
- lf = encode_str("\n")
1455
- # extend sample if we're unsure of the line ending
1456
- if sample.end_with?(cr)
1457
- sample << (@io.gets(nil, 1) || "")
1458
- end
1459
-
1460
- # try to find a standard separator
1461
- sample.each_char.each_cons(2) do |char, next_char|
1462
- case char
1463
- when cr
1464
- if next_char == lf
1465
- @row_sep = encode_str("\r\n")
1466
- else
1467
- @row_sep = cr
1468
- end
1469
- break
1470
- when lf
1471
- @row_sep = lf
1472
- break
1473
- end
1474
- end
1475
- end
1476
-
1477
- # tricky seek() clone to work around GzipReader's lack of seek()
1478
- @io.rewind
1479
- # reset back to the remembered position
1480
- while saved_pos > 1024 # avoid loading a lot of data into memory
1481
- @io.read(1024)
1482
- saved_pos -= 1024
1483
- end
1484
- @io.read(saved_pos) if saved_pos.nonzero?
1485
- rescue IOError # not opened for reading
1486
- # do nothing: ensure will set default
1487
- rescue NoMethodError # Zlib::GzipWriter doesn't have some IO methods
1488
- # do nothing: ensure will set default
1489
- rescue SystemCallError # pipe
1490
- # do nothing: ensure will set default
1491
- ensure
1492
- #
1493
- # set default if we failed to detect
1494
- # (stream not opened for reading, a pipe, or a single line of data)
1495
- #
1496
- @row_sep = $INPUT_RECORD_SEPARATOR if @row_sep == :auto
1497
- end
1498
- end
1499
- end
1500
- @row_sep = @row_sep.to_s.encode(@encoding)
1501
-
1502
- # establish quoting rules
1503
- @force_quotes = force_quotes
1504
- do_quote = lambda do |field|
1505
- field = String(field)
1506
- encoded_quote = @quote_char.encode(field.encoding)
1507
- encoded_quote + field.gsub(encoded_quote, encoded_quote * 2) + encoded_quote
1508
- end
1509
- quotable_chars = encode_str("\r\n", @col_sep, @quote_char)
1510
- @quote = if @force_quotes
1511
- do_quote
1512
- else
1513
- lambda do |field|
1514
- if field.nil? # represent +nil+ fields as empty unquoted fields
1515
- ""
1516
- else
1517
- field = String(field) # Stringify fields
1518
- # represent empty fields as empty quoted fields
1519
- if field.empty? or
1520
- field.count(quotable_chars).nonzero?
1521
- do_quote.call(field)
1522
- else
1523
- field # unquoted field
1524
- end
1525
- end
1526
- end
1527
- end
1528
- end
1529
-
1530
- # Pre-compiles parsers and stores them by name for access during reads.
1531
- def init_parsers(skip_blanks, field_size_limit, liberal_parsing)
1532
- # store the parser behaviors
1533
- @skip_blanks = skip_blanks
1534
- @field_size_limit = field_size_limit
1535
- @liberal_parsing = liberal_parsing
1536
-
1537
- # prebuild Regexps for faster parsing
1538
- esc_row_sep = escape_re(@row_sep)
1539
- esc_quote = escape_re(@quote_char)
1540
- @parsers = {
1541
- # for detecting parse errors
1542
- quote_or_nl: encode_re("[", esc_quote, "\r\n]"),
1543
- nl_or_lf: encode_re("[\r\n]"),
1544
- stray_quote: encode_re( "[^", esc_quote, "]", esc_quote,
1545
- "[^", esc_quote, "]" ),
1546
- # safer than chomp!()
1547
- line_end: encode_re(esc_row_sep, "\\z"),
1548
- # illegal unquoted characters
1549
- return_newline: encode_str("\r\n")
1550
- }
1551
- end
1552
-
1553
- #
1554
- # Loads any converters requested during construction.
1555
- #
1556
- # If +field_name+ is set <tt>:converters</tt> (the default) field converters
1557
- # are set. When +field_name+ is <tt>:header_converters</tt> header converters
1558
- # are added instead.
1559
- #
1560
- # The <tt>:unconverted_fields</tt> option is also activated for
1561
- # <tt>:converters</tt> calls, if requested.
1562
- #
1563
- def init_converters(converters, ivar_name, convert_method)
1564
- converters = case converters
1565
- when nil then []
1566
- when Array then converters
1567
- else [converters]
1568
- end
1569
- instance_variable_set(ivar_name, [])
1570
- convert = method(convert_method)
1571
-
1572
- # load converters
1573
- converters.each do |converter|
1574
- if converter.is_a? Proc # custom code block
1575
- convert.call(&converter)
1576
- else # by name
1577
- convert.call(converter)
1578
- end
1579
- end
1580
- end
1581
-
1582
- # Stores the pattern of comments to skip from the provided options.
1583
- #
1584
- # The pattern must respond to +.match+, else ArgumentError is raised.
1585
- # Strings are converted to a Regexp.
1586
- #
1587
- # See also CSV.new
1588
- def init_comments(skip_lines)
1589
- @skip_lines = skip_lines
1590
- @skip_lines = Regexp.new(Regexp.escape(@skip_lines)) if @skip_lines.is_a? String
1591
- if @skip_lines and not @skip_lines.respond_to?(:match)
1592
- raise ArgumentError, ":skip_lines has to respond to matches"
1346
+ def normalize_converters(converters)
1347
+ converters ||= []
1348
+ unless converters.is_a?(Array)
1349
+ converters = [converters]
1593
1350
  end
1594
- end
1595
- #
1596
- # The actual work method for adding converters, used by both CSV.convert() and
1597
- # CSV.header_convert().
1598
- #
1599
- # This method requires the +var_name+ of the instance variable to place the
1600
- # converters in, the +const+ Hash to lookup named converters in, and the
1601
- # normal parameters of the CSV.convert() and CSV.header_convert() methods.
1602
- #
1603
- def add_converter(var_name, const, name = nil, &converter)
1604
- if name.nil? # custom converter
1605
- instance_variable_get(var_name) << converter
1606
- else # named converter
1607
- combo = const[name]
1608
- case combo
1609
- when Array # combo converter
1610
- combo.each do |converter_name|
1611
- add_converter(var_name, const, converter_name)
1612
- end
1613
- else # individual named converter
1614
- instance_variable_get(var_name) << combo
1351
+ converters.collect do |converter|
1352
+ case converter
1353
+ when Proc # custom code block
1354
+ [nil, converter]
1355
+ else # by name
1356
+ [converter, nil]
1615
1357
  end
1616
1358
  end
1617
1359
  end
@@ -1625,131 +1367,87 @@ class CSV
1625
1367
  #
1626
1368
  def convert_fields(fields, headers = false)
1627
1369
  if headers
1628
- converters = @header_converters
1370
+ header_fields_converter.convert(fields, nil, 0)
1629
1371
  else
1630
- converters = @converters
1631
- if !@use_headers and
1632
- converters.empty? and
1633
- @nil_value.nil? and
1634
- @empty_value_is_empty_string
1635
- return fields
1636
- end
1637
- end
1638
-
1639
- fields.map.with_index do |field, index|
1640
- if field.nil?
1641
- field = @nil_value
1642
- elsif field.empty?
1643
- field = @empty_value unless @empty_value_is_empty_string
1644
- end
1645
- converters.each do |converter|
1646
- break if headers && field.nil?
1647
- field = if converter.arity == 1 # straight field converter
1648
- converter[field]
1649
- else # FieldInfo converter
1650
- header = @use_headers && !headers ? @headers[index] : nil
1651
- converter[field, FieldInfo.new(index, lineno, header)]
1652
- end
1653
- break unless field.is_a? String # short-circuit pipeline for speed
1654
- end
1655
- field # final state of each field, converted or original
1372
+ parser_fields_converter.convert(fields, @headers, lineno)
1656
1373
  end
1657
1374
  end
1658
1375
 
1659
1376
  #
1660
- # This method is used to turn a finished +row+ into a CSV::Row. Header rows
1661
- # are also dealt with here, either by returning a CSV::Row with identical
1662
- # headers and fields (save that the fields do not go through the converters)
1663
- # or by reading past them to return a field row. Headers are also saved in
1664
- # <tt>@headers</tt> for use in future rows.
1665
- #
1666
- # When +nil+, +row+ is assumed to be a header row not based on an actual row
1667
- # of the stream.
1668
- #
1669
- def parse_headers(row = nil)
1670
- if @headers.nil? # header row
1671
- @headers = case @use_headers # save headers
1672
- # Array of headers
1673
- when Array then @use_headers
1674
- # CSV header String
1675
- when String
1676
- self.class.parse_line( @use_headers,
1677
- col_sep: @col_sep,
1678
- row_sep: @row_sep,
1679
- quote_char: @quote_char )
1680
- # first row is headers
1681
- else row
1682
- end
1683
-
1684
- # prepare converted and unconverted copies
1685
- row = @headers if row.nil?
1686
- @headers = convert_fields(@headers, true)
1687
- @headers.each { |h| h.freeze if h.is_a? String }
1688
-
1689
- if @return_headers # return headers
1690
- return self.class::Row.new(@headers, row, true)
1691
- elsif not [Array, String].include? @use_headers.class # skip to field row
1692
- return shift
1693
- end
1377
+ # Returns the encoding of the internal IO object.
1378
+ #
1379
+ def raw_encoding
1380
+ if @io.respond_to? :internal_encoding
1381
+ @io.internal_encoding || @io.external_encoding
1382
+ elsif @io.respond_to? :encoding
1383
+ @io.encoding
1384
+ else
1385
+ nil
1694
1386
  end
1387
+ end
1695
1388
 
1696
- self.class::Row.new(@headers, convert_fields(row)) # field row
1389
+ def parser_fields_converter
1390
+ @parser_fields_converter ||= build_parser_fields_converter
1697
1391
  end
1698
1392
 
1699
- #
1700
- # This method injects an instance variable <tt>unconverted_fields</tt> into
1701
- # +row+ and an accessor method for +row+ called unconverted_fields(). The
1702
- # variable is set to the contents of +fields+.
1703
- #
1704
- def add_unconverted_fields(row, fields)
1705
- class << row
1706
- attr_reader :unconverted_fields
1707
- end
1708
- row.instance_variable_set(:@unconverted_fields, fields)
1709
- row
1393
+ def build_parser_fields_converter
1394
+ specific_options = {
1395
+ builtin_converters: Converters,
1396
+ }
1397
+ options = @base_fields_converter_options.merge(specific_options)
1398
+ build_fields_converter(@initial_converters, options)
1710
1399
  end
1711
1400
 
1712
- #
1713
- # This method is an encoding safe version of Regexp::escape(). It will escape
1714
- # any characters that would change the meaning of a regular expression in the
1715
- # encoding of +str+. Regular expression characters that cannot be transcoded
1716
- # to the target encoding will be skipped and no escaping will be performed if
1717
- # a backslash cannot be transcoded.
1718
- #
1719
- def escape_re(str)
1720
- str.gsub(@re_chars) {|c| @re_esc + c}
1401
+ def header_fields_converter
1402
+ @header_fields_converter ||= build_header_fields_converter
1721
1403
  end
1722
1404
 
1723
- #
1724
- # Builds a regular expression in <tt>@encoding</tt>. All +chunks+ will be
1725
- # transcoded to that encoding.
1726
- #
1727
- def encode_re(*chunks)
1728
- Regexp.new(encode_str(*chunks))
1405
+ def build_header_fields_converter
1406
+ specific_options = {
1407
+ builtin_converters: HeaderConverters,
1408
+ accept_nil: true,
1409
+ }
1410
+ options = @base_fields_converter_options.merge(specific_options)
1411
+ build_fields_converter(@initial_header_converters, options)
1729
1412
  end
1730
1413
 
1731
- #
1732
- # Builds a String in <tt>@encoding</tt>. All +chunks+ will be transcoded to
1733
- # that encoding.
1734
- #
1735
- def encode_str(*chunks)
1736
- chunks.map { |chunk| chunk.encode(@encoding.name) }.join('')
1414
+ def writer_fields_converter
1415
+ @writer_fields_converter ||= build_writer_fields_converter
1737
1416
  end
1738
1417
 
1739
- #
1740
- # Returns the encoding of the internal IO object or the +default+ if the
1741
- # encoding cannot be determined.
1742
- #
1743
- def raw_encoding(default = Encoding::ASCII_8BIT)
1744
- if @io.respond_to? :internal_encoding
1745
- @io.internal_encoding || @io.external_encoding
1746
- elsif @io.is_a? StringIO
1747
- @io.string.encoding
1748
- elsif @io.respond_to? :encoding
1749
- @io.encoding
1750
- else
1751
- default
1418
+ def build_writer_fields_converter
1419
+ build_fields_converter(@initial_write_converters,
1420
+ @write_fields_converter_options)
1421
+ end
1422
+
1423
+ def build_fields_converter(initial_converters, options)
1424
+ fields_converter = FieldsConverter.new(options)
1425
+ normalize_converters(initial_converters).each do |name, converter|
1426
+ fields_converter.add_converter(name, &converter)
1752
1427
  end
1428
+ fields_converter
1429
+ end
1430
+
1431
+ def parser
1432
+ @parser ||= Parser.new(@io, parser_options)
1433
+ end
1434
+
1435
+ def parser_options
1436
+ @parser_options.merge(header_fields_converter: header_fields_converter,
1437
+ fields_converter: parser_fields_converter)
1438
+ end
1439
+
1440
+ def parser_enumerator
1441
+ @parser_enumerator ||= parser.parse
1442
+ end
1443
+
1444
+ def writer
1445
+ @writer ||= Writer.new(@io, writer_options)
1446
+ end
1447
+
1448
+ def writer_options
1449
+ @writer_options.merge(header_fields_converter: header_fields_converter,
1450
+ fields_converter: writer_fields_converter)
1753
1451
  end
1754
1452
  end
1755
1453