csv 3.0.1 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f6ad673a6db13541d439c4798f677ae19e118fb031411b8434ec4534bafc47a9
4
- data.tar.gz: 427070352e63b901d410a70eba6f7073103bc6cbe8e57f7e161e01003ae1598c
3
+ metadata.gz: 25367f06751ab916228ddcffcbc857bc13ca1e7fcc65a908fcfc7c974e5473f6
4
+ data.tar.gz: 92ff4c8f3b96219b9d74fc849311afd0cb97f3d124c77250878402fc006ce2ac
5
5
  SHA512:
6
- metadata.gz: 3d2c69c8b784d79149dfd5999d2c937e037f83ccc806bd4fe5a674b913518ab4be4a13911e2ec3f5590c45dc6ebdd346eed6f85be253fad6dae40cd9eb0cb704
7
- data.tar.gz: 25a0fee13c07fb870831c0b59ea88c9df6844ccf63810925d7ed437000c220b83a212781d5d615529ef487bd3ffdbd90a13b293475e8a4074a2feb249258b156
6
+ metadata.gz: 25584f3c7ccf6ffa990dfa5a58cb4564092f51c3145fccb0b54252210096d8d322044c72647be36d7151362fe02e76c3ab103f99615ac42e9f1621ba6f2e9aa4
7
+ data.tar.gz: 2e0bb6973a005ae822b08bf6c13ff1681bf7c599e2c966f8bfd19ecffc6083fb09a03b74bbb47019a44bda729c0d173bed864815bffa670d19684092da0f128d
@@ -1,5 +1,34 @@
1
1
  # News
2
2
 
3
+ ## 3.0.2 - 2018-12-23
4
+
5
+ ### Improvements
6
+
7
+ * Changed to use strscan in parser.
8
+ [GitHub#52][Patch by 284km]
9
+
10
+ * Improves CSV write performance.
11
+ 3.0.2 will be about 2 times faster than 3.0.1.
12
+
13
+ * Improves CSV parse performance for complex case.
14
+ 3.0.2 will be about 2 times faster than 3.0.1.
15
+
16
+ ### Fixes
17
+
18
+ * Fixed a parse error bug for new line only input with `headers` option.
19
+ [GitHub#53][Reported by Chris Beer]
20
+
21
+ * Fixed some typos in document.
22
+ [GitHub#54][Patch by Victor Shepelev]
23
+
24
+ ### Thanks
25
+
26
+ * 284km
27
+
28
+ * Chris Beer
29
+
30
+ * Victor Shepelev
31
+
3
32
  ## 3.0.1 - 2018-12-07
4
33
 
5
34
  ### Improvements
data/lib/csv.rb CHANGED
@@ -93,36 +93,22 @@ require "forwardable"
93
93
  require "English"
94
94
  require "date"
95
95
  require "stringio"
96
- require_relative "csv/table"
97
- require_relative "csv/row"
98
96
 
99
- # This provides String#match? and Regexp#match? for Ruby 2.3.
100
- unless String.method_defined?(:match?)
101
- class CSV
102
- module MatchP
103
- refine String do
104
- def match?(pattern)
105
- self =~ pattern
106
- end
107
- end
108
-
109
- refine Regexp do
110
- def match?(string)
111
- self =~ string
112
- end
113
- end
114
- end
115
- end
97
+ require_relative "csv/fields_converter"
98
+ require_relative "csv/match_p"
99
+ require_relative "csv/parser"
100
+ require_relative "csv/row"
101
+ require_relative "csv/table"
102
+ require_relative "csv/writer"
116
103
 
117
- using CSV::MatchP
118
- end
104
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
119
105
 
120
106
  #
121
107
  # This class provides a complete interface to CSV files and data. It offers
122
108
  # tools to enable you to read and write to and from Strings or IO objects, as
123
109
  # needed.
124
110
  #
125
- # The most generic interface of a class is:
111
+ # The most generic interface of the library is:
126
112
  #
127
113
  # csv = CSV.new(string_or_io, **options)
128
114
  #
@@ -204,18 +190,18 @@ end
204
190
  # # Headers are part of data
205
191
  # data = CSV.parse(<<~ROWS, headers: true)
206
192
  # Name,Department,Salary
207
- # Bob,Engeneering,1000
193
+ # Bob,Engineering,1000
208
194
  # Jane,Sales,2000
209
195
  # John,Management,5000
210
196
  # ROWS
211
197
  #
212
198
  # data.class #=> CSV::Table
213
- # data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engeneering" "Salary":"1000">
214
- # data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engeneering", "Salary"=>"1000"}
199
+ # data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engineering" "Salary":"1000">
200
+ # data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engineering", "Salary"=>"1000"}
215
201
  #
216
202
  # # Headers provided by developer
217
203
  # data = CSV.parse('Bob,Engineering,1000', headers: %i[name department salary])
218
- # data.first #=> #<CSV::Row name:"Bob" department:"Engeneering" salary:"1000">
204
+ # data.first #=> #<CSV::Row name:"Bob" department:"Engineering" salary:"1000">
219
205
  #
220
206
  # === Typed data reading
221
207
  #
@@ -902,76 +888,104 @@ class CSV
902
888
  # Options cannot be overridden in the instance methods for performance reasons,
903
889
  # so be sure to set what you want here.
904
890
  #
905
- def initialize(data, col_sep: ",", row_sep: :auto, quote_char: '"', field_size_limit: nil,
906
- converters: nil, unconverted_fields: nil, headers: false, return_headers: false,
907
- write_headers: nil, header_converters: nil, skip_blanks: false, force_quotes: false,
908
- skip_lines: nil, liberal_parsing: false, internal_encoding: nil, external_encoding: nil, encoding: nil,
891
+ def initialize(data,
892
+ col_sep: ",",
893
+ row_sep: :auto,
894
+ quote_char: '"',
895
+ field_size_limit: nil,
896
+ converters: nil,
897
+ unconverted_fields: nil,
898
+ headers: false,
899
+ return_headers: false,
900
+ write_headers: nil,
901
+ header_converters: nil,
902
+ skip_blanks: false,
903
+ force_quotes: false,
904
+ skip_lines: nil,
905
+ liberal_parsing: false,
906
+ internal_encoding: nil,
907
+ external_encoding: nil,
908
+ encoding: nil,
909
909
  nil_value: nil,
910
910
  empty_value: "")
911
911
  raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
912
912
 
913
913
  # create the IO object we will read from
914
914
  @io = data.is_a?(String) ? StringIO.new(data) : data
915
- @prefix_io = nil # cache for input data possibly read by init_separators
916
915
  @encoding = determine_encoding(encoding, internal_encoding)
917
- #
918
- # prepare for building safe regular expressions in the target encoding,
919
- # if we can transcode the needed characters
920
- #
921
- @re_esc = "\\".encode(@encoding).freeze rescue ""
922
- @re_chars = /#{%"[-\\]\\[\\.^$?*+{}()|# \r\n\t\f\v]".encode(@encoding)}/
923
- @unconverted_fields = unconverted_fields
924
-
925
- # Stores header row settings and loads header converters, if needed.
926
- @use_headers = headers
927
- @return_headers = return_headers
928
- @write_headers = write_headers
929
-
930
- # headers must be delayed until shift(), in case they need a row of content
931
- @headers = nil
932
-
933
- @nil_value = nil_value
934
- @empty_value = empty_value
935
- @empty_value_is_empty_string = (empty_value == "")
936
-
937
- init_separators(col_sep, row_sep, quote_char, force_quotes)
938
- init_parsers(skip_blanks, field_size_limit, liberal_parsing)
939
- init_converters(converters, :@converters, :convert)
940
- init_converters(header_converters, :@header_converters, :header_convert)
941
- init_comments(skip_lines)
942
-
943
- @force_encoding = !!encoding
944
-
945
- # track our own lineno since IO gets confused about line-ends is CSV fields
946
- @lineno = 0
947
-
948
- # make sure headers have been assigned
949
- if header_row? and [Array, String].include? @use_headers.class and @write_headers
950
- parse_headers # won't read data for Array or String
951
- self << @headers
952
- end
916
+
917
+ @base_fields_converter_options = {
918
+ nil_value: nil_value,
919
+ empty_value: empty_value,
920
+ }
921
+ @initial_converters = converters
922
+ @initial_header_converters = header_converters
923
+
924
+ @parser_options = {
925
+ column_separator: col_sep,
926
+ row_separator: row_sep,
927
+ quote_character: quote_char,
928
+ field_size_limit: field_size_limit,
929
+ unconverted_fields: unconverted_fields,
930
+ headers: headers,
931
+ return_headers: return_headers,
932
+ skip_blanks: skip_blanks,
933
+ skip_lines: skip_lines,
934
+ liberal_parsing: liberal_parsing,
935
+ encoding: @encoding,
936
+ nil_value: nil_value,
937
+ empty_value: empty_value,
938
+ }
939
+ @parser = nil
940
+
941
+ @writer_options = {
942
+ encoding: @encoding,
943
+ force_encoding: (not encoding.nil?),
944
+ force_quotes: force_quotes,
945
+ headers: headers,
946
+ write_headers: write_headers,
947
+ column_separator: col_sep,
948
+ row_separator: row_sep,
949
+ quote_character: quote_char,
950
+ }
951
+
952
+ @writer = nil
953
+ writer if @writer_options[:write_headers]
953
954
  end
954
955
 
955
956
  #
956
957
  # The encoded <tt>:col_sep</tt> used in parsing and writing. See CSV::new
957
958
  # for details.
958
959
  #
959
- attr_reader :col_sep
960
+ def col_sep
961
+ parser.column_separator
962
+ end
963
+
960
964
  #
961
965
  # The encoded <tt>:row_sep</tt> used in parsing and writing. See CSV::new
962
966
  # for details.
963
967
  #
964
- attr_reader :row_sep
968
+ def row_sep
969
+ parser.row_separator
970
+ end
971
+
965
972
  #
966
973
  # The encoded <tt>:quote_char</tt> used in parsing and writing. See CSV::new
967
974
  # for details.
968
975
  #
969
- attr_reader :quote_char
976
+ def quote_char
977
+ parser.quote_character
978
+ end
979
+
970
980
  # The limit for field size, if any. See CSV::new for details.
971
- attr_reader :field_size_limit
981
+ def field_size_limit
982
+ parser.field_size_limit
983
+ end
972
984
 
973
985
  # The regex marking a line as a comment. See CSV::new for details
974
- attr_reader :skip_lines
986
+ def skip_lines
987
+ parser.skip_lines
988
+ end
975
989
 
976
990
  #
977
991
  # Returns the current list of converters in effect. See CSV::new for details.
@@ -979,7 +993,7 @@ class CSV
979
993
  # as is.
980
994
  #
981
995
  def converters
982
- @converters.map do |converter|
996
+ fields_converter.map do |converter|
983
997
  name = Converters.rassoc(converter)
984
998
  name ? name.first : converter
985
999
  end
@@ -988,42 +1002,68 @@ class CSV
988
1002
  # Returns +true+ if unconverted_fields() will be added to parsed results. See CSV::new
989
1003
  # for details.
990
1004
  #
991
- def unconverted_fields?() @unconverted_fields end
1005
+ def unconverted_fields?
1006
+ parser.unconverted_fields?
1007
+ end
1008
+
992
1009
  #
993
1010
  # Returns +nil+ if headers will not be used, +true+ if they will but have not
994
1011
  # yet been read, or the actual headers after they have been read. See
995
1012
  # CSV::new for details.
996
1013
  #
997
1014
  def headers
998
- @headers || true if @use_headers
1015
+ if @writer
1016
+ @writer.headers
1017
+ else
1018
+ parsed_headers = parser.headers
1019
+ return parsed_headers if parsed_headers
1020
+ raw_headers = @parser_options[:headers]
1021
+ raw_headers = nil if raw_headers == false
1022
+ raw_headers
1023
+ end
999
1024
  end
1000
1025
  #
1001
1026
  # Returns +true+ if headers will be returned as a row of results.
1002
1027
  # See CSV::new for details.
1003
1028
  #
1004
- def return_headers?() @return_headers end
1029
+ def return_headers?
1030
+ parser.return_headers?
1031
+ end
1032
+
1005
1033
  # Returns +true+ if headers are written in output. See CSV::new for details.
1006
- def write_headers?() @write_headers end
1034
+ def write_headers?
1035
+ @writer_options[:write_headers]
1036
+ end
1037
+
1007
1038
  #
1008
1039
  # Returns the current list of converters in effect for headers. See CSV::new
1009
1040
  # for details. Built-in converters will be returned by name, while others
1010
1041
  # will be returned as is.
1011
1042
  #
1012
1043
  def header_converters
1013
- @header_converters.map do |converter|
1044
+ header_fields_converter.map do |converter|
1014
1045
  name = HeaderConverters.rassoc(converter)
1015
1046
  name ? name.first : converter
1016
1047
  end
1017
1048
  end
1049
+
1018
1050
  #
1019
1051
  # Returns +true+ if blank lines are skipped by the parser. See CSV::new
1020
1052
  # for details.
1021
1053
  #
1022
- def skip_blanks?() @skip_blanks end
1054
+ def skip_blanks?
1055
+ parser.skip_blanks?
1056
+ end
1057
+
1023
1058
  # Returns +true+ if all output fields are quoted. See CSV::new for details.
1024
- def force_quotes?() @force_quotes end
1059
+ def force_quotes?
1060
+ @writer_options[:force_quotes]
1061
+ end
1062
+
1025
1063
  # Returns +true+ if illegal input is handled. See CSV::new for details.
1026
- def liberal_parsing?() @liberal_parsing end
1064
+ def liberal_parsing?
1065
+ parser.liberal_parsing?
1066
+ end
1027
1067
 
1028
1068
  #
1029
1069
  # The Encoding CSV is parsing or writing in. This will be the Encoding you
@@ -1032,10 +1072,23 @@ class CSV
1032
1072
  attr_reader :encoding
1033
1073
 
1034
1074
  #
1035
- # The line number of the last row read from this file. Fields with nested
1075
+ # The line number of the last row read from this file. Fields with nested
1036
1076
  # line-end characters will not affect this count.
1037
1077
  #
1038
- attr_reader :lineno, :line
1078
+ def lineno
1079
+ if @writer
1080
+ @writer.lineno
1081
+ else
1082
+ parser.lineno
1083
+ end
1084
+ end
1085
+
1086
+ #
1087
+ # The last row read from this file.
1088
+ #
1089
+ def line
1090
+ parser.line
1091
+ end
1039
1092
 
1040
1093
  ### IO and StringIO Delegation ###
1041
1094
 
@@ -1049,9 +1102,9 @@ class CSV
1049
1102
 
1050
1103
  # Rewinds the underlying IO object and resets CSV's lineno() counter.
1051
1104
  def rewind
1052
- @headers = nil
1053
- @lineno = 0
1054
-
1105
+ @parser = nil
1106
+ @parser_enumerator = nil
1107
+ @writer.rewind if @writer
1055
1108
  @io.rewind
1056
1109
  end
1057
1110
 
@@ -1065,34 +1118,8 @@ class CSV
1065
1118
  # The data source must be open for writing.
1066
1119
  #
1067
1120
  def <<(row)
1068
- # make sure headers have been assigned
1069
- if header_row? and [Array, String].include? @use_headers.class and !@write_headers
1070
- parse_headers # won't read data for Array or String
1071
- end
1072
-
1073
- # handle CSV::Row objects and Hashes
1074
- row = case row
1075
- when self.class::Row then row.fields
1076
- when Hash then @headers.map { |header| row[header] }
1077
- else row
1078
- end
1079
-
1080
- @headers = row if header_row?
1081
- @lineno += 1
1082
-
1083
- output = row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
1084
- if @io.is_a?(StringIO) and
1085
- output.encoding != (encoding = raw_encoding)
1086
- if @force_encoding
1087
- output = output.encode(encoding)
1088
- elsif (compatible_encoding = Encoding.compatible?(@io.string, output))
1089
- @io.set_encoding(compatible_encoding)
1090
- @io.seek(0, IO::SEEK_END)
1091
- end
1092
- end
1093
- @io << output
1094
-
1095
- self # for chaining
1121
+ writer << row
1122
+ self
1096
1123
  end
1097
1124
  alias_method :add_row, :<<
1098
1125
  alias_method :puts, :<<
@@ -1113,7 +1140,7 @@ class CSV
1113
1140
  # converted field or the field itself.
1114
1141
  #
1115
1142
  def convert(name = nil, &converter)
1116
- add_converter(:@converters, self.class::Converters, name, &converter)
1143
+ fields_converter.add_converter(name, &converter)
1117
1144
  end
1118
1145
 
1119
1146
  #
@@ -1128,10 +1155,7 @@ class CSV
1128
1155
  # effect.
1129
1156
  #
1130
1157
  def header_convert(name = nil, &converter)
1131
- add_converter( :@header_converters,
1132
- self.class::HeaderConverters,
1133
- name,
1134
- &converter )
1158
+ header_fields_converter.add_converter(name, &converter)
1135
1159
  end
1136
1160
 
1137
1161
  include Enumerable
@@ -1143,14 +1167,8 @@ class CSV
1143
1167
  #
1144
1168
  # The data source must be open for reading.
1145
1169
  #
1146
- def each
1147
- if block_given?
1148
- while row = shift
1149
- yield row
1150
- end
1151
- else
1152
- to_enum
1153
- end
1170
+ def each(&block)
1171
+ parser.parse(&block)
1154
1172
  end
1155
1173
 
1156
1174
  #
@@ -1160,8 +1178,9 @@ class CSV
1160
1178
  #
1161
1179
  def read
1162
1180
  rows = to_a
1163
- if @use_headers
1164
- Table.new(rows, headers: @headers)
1181
+ headers = parser.headers
1182
+ if headers
1183
+ Table.new(rows, headers: headers)
1165
1184
  else
1166
1185
  rows
1167
1186
  end
@@ -1170,7 +1189,7 @@ class CSV
1170
1189
 
1171
1190
  # Returns +true+ if the next row read will be a header row.
1172
1191
  def header_row?
1173
- @use_headers and @headers.nil?
1192
+ parser.header_row?
1174
1193
  end
1175
1194
 
1176
1195
  #
@@ -1181,177 +1200,11 @@ class CSV
1181
1200
  # The data source must be open for reading.
1182
1201
  #
1183
1202
  def shift
1184
- #########################################################################
1185
- ### This method is purposefully kept a bit long as simple conditional ###
1186
- ### checks are faster than numerous (expensive) method calls. ###
1187
- #########################################################################
1188
-
1189
- # handle headers not based on document content
1190
- if header_row? and @return_headers and
1191
- [Array, String].include? @use_headers.class
1192
- if @unconverted_fields
1193
- return add_unconverted_fields(parse_headers, Array.new)
1194
- else
1195
- return parse_headers
1196
- end
1197
- end
1198
-
1199
- #
1200
- # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
1201
- # because of \r and/or \n characters embedded in quoted fields
1202
- #
1203
- in_extended_col = false
1204
- csv = Array.new
1205
-
1206
- loop do
1207
- # add another read to the line
1208
- if @prefix_io
1209
- parse = @prefix_io.gets(@row_sep)
1210
- if @prefix_io.eof?
1211
- parse << (@io.gets(@row_sep) || "") unless parse.end_with?(@row_sep)
1212
- @prefix_io = nil # avoid having to test @prefix_io.eof? in main code path
1213
- end
1214
- else
1215
- return nil unless parse = @io.gets(@row_sep)
1216
- end
1217
-
1218
- if in_extended_col
1219
- @line.concat(parse)
1220
- else
1221
- @line = parse.clone
1222
- end
1223
-
1224
- begin
1225
- parse.sub!(@parsers[:line_end], "")
1226
- rescue ArgumentError
1227
- unless parse.valid_encoding?
1228
- message = "Invalid byte sequence in #{parse.encoding}"
1229
- raise MalformedCSVError.new(message, lineno + 1)
1230
- end
1231
- raise
1232
- end
1233
-
1234
- if csv.empty?
1235
- #
1236
- # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
1237
- # CSV's <tt>[nil]</tt>
1238
- #
1239
- if parse.empty?
1240
- @lineno += 1
1241
- if @skip_blanks
1242
- next
1243
- elsif @unconverted_fields
1244
- return add_unconverted_fields(Array.new, Array.new)
1245
- elsif @use_headers
1246
- return self.class::Row.new(@headers, Array.new)
1247
- else
1248
- return Array.new
1249
- end
1250
- end
1251
- end
1252
-
1253
- next if @skip_lines and @skip_lines.match parse
1254
-
1255
- parts = parse.split(@col_sep_split_separator, -1)
1256
- if parts.empty?
1257
- if in_extended_col
1258
- csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop
1259
- else
1260
- csv << nil
1261
- end
1262
- end
1263
-
1264
- # This loop is the hot path of csv parsing. Some things may be non-dry
1265
- # for a reason. Make sure to benchmark when refactoring.
1266
- parts.each do |part|
1267
- if in_extended_col
1268
- # If we are continuing a previous column
1269
- if part.end_with?(@quote_char) && part.count(@quote_char) % 2 != 0
1270
- # extended column ends
1271
- csv.last << part[0..-2]
1272
- if csv.last.match?(@parsers[:stray_quote])
1273
- raise MalformedCSVError.new("Missing or stray quote",
1274
- lineno + 1)
1275
- end
1276
- csv.last.gsub!(@double_quote_char, @quote_char)
1277
- in_extended_col = false
1278
- else
1279
- csv.last << part << @col_sep
1280
- end
1281
- elsif part.start_with?(@quote_char)
1282
- # If we are starting a new quoted column
1283
- if part.count(@quote_char) % 2 != 0
1284
- # start an extended column
1285
- csv << (part[1..-1] << @col_sep)
1286
- in_extended_col = true
1287
- elsif part.end_with?(@quote_char)
1288
- # regular quoted column
1289
- csv << part[1..-2]
1290
- if csv.last.match?(@parsers[:stray_quote])
1291
- raise MalformedCSVError.new("Missing or stray quote",
1292
- lineno + 1)
1293
- end
1294
- csv.last.gsub!(@double_quote_char, @quote_char)
1295
- elsif @liberal_parsing
1296
- csv << part
1297
- else
1298
- raise MalformedCSVError.new("Missing or stray quote",
1299
- lineno + 1)
1300
- end
1301
- elsif part.match?(@parsers[:quote_or_nl])
1302
- # Unquoted field with bad characters.
1303
- if part.match?(@parsers[:nl_or_lf])
1304
- message = "Unquoted fields do not allow \\r or \\n"
1305
- raise MalformedCSVError.new(message, lineno + 1)
1306
- else
1307
- if @liberal_parsing
1308
- csv << part
1309
- else
1310
- raise MalformedCSVError.new("Illegal quoting", lineno + 1)
1311
- end
1312
- end
1313
- else
1314
- # Regular ole unquoted field.
1315
- csv << (part.empty? ? nil : part)
1316
- end
1317
- end
1318
-
1319
- # Replace tacked on @col_sep with @row_sep if we are still in an extended
1320
- # column.
1321
- csv[-1][-1] = @row_sep if in_extended_col
1322
-
1323
- if in_extended_col
1324
- # if we're at eof?(), a quoted field wasn't closed...
1325
- if @io.eof? and !@prefix_io
1326
- raise MalformedCSVError.new("Unclosed quoted field",
1327
- lineno + 1)
1328
- elsif @field_size_limit and csv.last.size >= @field_size_limit
1329
- raise MalformedCSVError.new("Field size exceeded",
1330
- lineno + 1)
1331
- end
1332
- # otherwise, we need to loop and pull some more data to complete the row
1333
- else
1334
- @lineno += 1
1335
-
1336
- # save fields unconverted fields, if needed...
1337
- unconverted = csv.dup if @unconverted_fields
1338
-
1339
- if @use_headers
1340
- # parse out header rows and handle CSV::Row conversions...
1341
- csv = parse_headers(csv)
1342
- else
1343
- # convert fields, if needed...
1344
- csv = convert_fields(csv)
1345
- end
1346
-
1347
- # inject unconverted fields and accessor, if requested...
1348
- if @unconverted_fields and not csv.respond_to? :unconverted_fields
1349
- add_unconverted_fields(csv, unconverted)
1350
- end
1351
-
1352
- # return the results
1353
- break csv
1354
- end
1203
+ @parser_enumerator ||= parser.parse
1204
+ begin
1205
+ @parser_enumerator.next
1206
+ rescue StopIteration
1207
+ nil
1355
1208
  end
1356
1209
  end
1357
1210
  alias_method :gets, :shift
@@ -1376,15 +1229,19 @@ class CSV
1376
1229
  # show encoding
1377
1230
  str << " encoding:" << @encoding.name
1378
1231
  # show other attributes
1379
- %w[ lineno col_sep row_sep
1380
- quote_char skip_blanks liberal_parsing ].each do |attr_name|
1381
- if a = instance_variable_get("@#{attr_name}")
1232
+ ["lineno", "col_sep", "row_sep", "quote_char"].each do |attr_name|
1233
+ if a = __send__(attr_name)
1382
1234
  str << " " << attr_name << ":" << a.inspect
1383
1235
  end
1384
1236
  end
1385
- if @use_headers
1386
- str << " headers:" << headers.inspect
1237
+ ["skip_blanks", "liberal_parsing"].each do |attr_name|
1238
+ if a = __send__("#{attr_name}?")
1239
+ str << " " << attr_name << ":" << a.inspect
1240
+ end
1387
1241
  end
1242
+ _headers = headers
1243
+ _headers = headers
1244
+ str << " headers:" << _headers.inspect if _headers
1388
1245
  str << ">"
1389
1246
  begin
1390
1247
  str.join('')
@@ -1400,7 +1257,7 @@ class CSV
1400
1257
 
1401
1258
  def determine_encoding(encoding, internal_encoding)
1402
1259
  # honor the IO encoding if we can, otherwise default to ASCII-8BIT
1403
- io_encoding = raw_encoding(nil)
1260
+ io_encoding = raw_encoding
1404
1261
  return io_encoding if io_encoding
1405
1262
 
1406
1263
  return Encoding.find(internal_encoding) if internal_encoding
@@ -1413,210 +1270,17 @@ class CSV
1413
1270
  Encoding.default_internal || Encoding.default_external
1414
1271
  end
1415
1272
 
1416
- #
1417
- # Stores the indicated separators for later use.
1418
- #
1419
- # If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
1420
- # ahead in the <tt>@io</tt> and try to find one. +ARGF+, +STDIN+, +STDOUT+,
1421
- # +STDERR+ and any stream open for output only with a default
1422
- # <tt>@row_sep</tt> of <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
1423
- #
1424
- # This method also establishes the quoting rules used for CSV output.
1425
- #
1426
- def init_separators(col_sep, row_sep, quote_char, force_quotes)
1427
- # store the selected separators
1428
- @col_sep = col_sep.to_s.encode(@encoding)
1429
- if @col_sep == " "
1430
- @col_sep_split_separator = Regexp.new(/#{Regexp.escape(@col_sep)}/)
1431
- else
1432
- @col_sep_split_separator = @col_sep
1433
- end
1434
- @row_sep = row_sep # encode after resolving :auto
1435
- @quote_char = quote_char.to_s.encode(@encoding)
1436
- @double_quote_char = @quote_char * 2
1437
-
1438
- if @quote_char.length != 1
1439
- raise ArgumentError, ":quote_char has to be a single character String"
1440
- end
1441
-
1442
- #
1443
- # automatically discover row separator when requested
1444
- # (not fully encoding safe)
1445
- #
1446
- if @row_sep == :auto
1447
- saved_prefix = [] # sample chunks to be reprocessed later
1448
- begin
1449
- while @row_sep == :auto && @io.respond_to?(:gets)
1450
- #
1451
- # if we run out of data, it's probably a single line
1452
- # (ensure will set default value)
1453
- #
1454
- break unless sample = @io.gets(nil, 1024)
1455
-
1456
- cr = encode_str("\r")
1457
- lf = encode_str("\n")
1458
- # extend sample if we're unsure of the line ending
1459
- if sample.end_with?(cr)
1460
- sample << (@io.gets(nil, 1) || "")
1461
- end
1462
-
1463
- saved_prefix << sample
1464
-
1465
- # try to find a standard separator
1466
- last_char = nil
1467
- sample.each_char.each_cons(2) do |char, next_char|
1468
- last_char = next_char
1469
- case char
1470
- when cr
1471
- if next_char == lf
1472
- @row_sep = encode_str("\r\n")
1473
- else
1474
- @row_sep = cr
1475
- end
1476
- break
1477
- when lf
1478
- @row_sep = lf
1479
- break
1480
- end
1481
- end
1482
- if @row_sep == :auto
1483
- case last_char
1484
- when cr
1485
- @row_sep = cr
1486
- when lf
1487
- @row_sep = lf
1488
- end
1489
- end
1490
- end
1491
- rescue IOError
1492
- # do nothing: ensure will set default
1493
- ensure
1494
- #
1495
- # set default if we failed to detect
1496
- # (stream not opened for reading or a single line of data)
1497
- #
1498
- @row_sep = $INPUT_RECORD_SEPARATOR if @row_sep == :auto
1499
-
1500
- # save sampled input for later parsing (but only if there is some!)
1501
- saved_prefix = saved_prefix.join('')
1502
- @prefix_io = StringIO.new(saved_prefix) unless saved_prefix.empty?
1503
- end
1504
- end
1505
- @row_sep = @row_sep.to_s.encode(@encoding)
1506
-
1507
- # establish quoting rules
1508
- @force_quotes = force_quotes
1509
- do_quote = lambda do |field|
1510
- field = String(field)
1511
- encoded_quote = @quote_char.encode(field.encoding)
1512
- encoded_quote + field.gsub(encoded_quote, encoded_quote * 2) + encoded_quote
1273
+ def normalize_converters(converters)
1274
+ converters ||= []
1275
+ unless converters.is_a?(Array)
1276
+ converters = [converters]
1513
1277
  end
1514
- quotable_chars = encode_str("\r\n", @col_sep, @quote_char)
1515
- @quote = if @force_quotes
1516
- do_quote
1517
- else
1518
- lambda do |field|
1519
- if field.nil? # represent +nil+ fields as empty unquoted fields
1520
- ""
1521
- else
1522
- field = String(field) # Stringify fields
1523
- # represent empty fields as empty quoted fields
1524
- if field.empty? or
1525
- field.count(quotable_chars).nonzero?
1526
- do_quote.call(field)
1527
- else
1528
- field # unquoted field
1529
- end
1530
- end
1531
- end
1532
- end
1533
- end
1534
-
1535
- # Pre-compiles parsers and stores them by name for access during reads.
1536
- def init_parsers(skip_blanks, field_size_limit, liberal_parsing)
1537
- # store the parser behaviors
1538
- @skip_blanks = skip_blanks
1539
- @field_size_limit = field_size_limit
1540
- @liberal_parsing = liberal_parsing
1541
-
1542
- # prebuild Regexps for faster parsing
1543
- esc_row_sep = escape_re(@row_sep)
1544
- esc_quote = escape_re(@quote_char)
1545
- @parsers = {
1546
- # for detecting parse errors
1547
- quote_or_nl: encode_re("[", esc_quote, "\r\n]"),
1548
- nl_or_lf: encode_re("[\r\n]"),
1549
- stray_quote: encode_re( "[^", esc_quote, "]", esc_quote,
1550
- "[^", esc_quote, "]" ),
1551
- # safer than chomp!()
1552
- line_end: encode_re(esc_row_sep, "\\z"),
1553
- # illegal unquoted characters
1554
- return_newline: encode_str("\r\n")
1555
- }
1556
- end
1557
-
1558
- #
1559
- # Loads any converters requested during construction.
1560
- #
1561
- # If +field_name+ is set <tt>:converters</tt> (the default) field converters
1562
- # are set. When +field_name+ is <tt>:header_converters</tt> header converters
1563
- # are added instead.
1564
- #
1565
- # The <tt>:unconverted_fields</tt> option is also activated for
1566
- # <tt>:converters</tt> calls, if requested.
1567
- #
1568
- def init_converters(converters, ivar_name, convert_method)
1569
- converters = case converters
1570
- when nil then []
1571
- when Array then converters
1572
- else [converters]
1573
- end
1574
- instance_variable_set(ivar_name, [])
1575
- convert = method(convert_method)
1576
-
1577
- # load converters
1578
- converters.each do |converter|
1579
- if converter.is_a? Proc # custom code block
1580
- convert.call(&converter)
1581
- else # by name
1582
- convert.call(converter)
1583
- end
1584
- end
1585
- end
1586
-
1587
- # Stores the pattern of comments to skip from the provided options.
1588
- #
1589
- # The pattern must respond to +.match+, else ArgumentError is raised.
1590
- # Strings are converted to a Regexp.
1591
- #
1592
- # See also CSV.new
1593
- def init_comments(skip_lines)
1594
- @skip_lines = skip_lines
1595
- @skip_lines = Regexp.new(Regexp.escape(@skip_lines)) if @skip_lines.is_a? String
1596
- if @skip_lines and not @skip_lines.respond_to?(:match)
1597
- raise ArgumentError, ":skip_lines has to respond to matches"
1598
- end
1599
- end
1600
- #
1601
- # The actual work method for adding converters, used by both CSV.convert() and
1602
- # CSV.header_convert().
1603
- #
1604
- # This method requires the +var_name+ of the instance variable to place the
1605
- # converters in, the +const+ Hash to lookup named converters in, and the
1606
- # normal parameters of the CSV.convert() and CSV.header_convert() methods.
1607
- #
1608
- def add_converter(var_name, const, name = nil, &converter)
1609
- if name.nil? # custom converter
1610
- instance_variable_get(var_name) << converter
1611
- else # named converter
1612
- combo = const[name]
1613
- case combo
1614
- when Array # combo converter
1615
- combo.each do |converter_name|
1616
- add_converter(var_name, const, converter_name)
1617
- end
1618
- else # individual named converter
1619
- instance_variable_get(var_name) << combo
1278
+ converters.collect do |converter|
1279
+ case converter
1280
+ when Proc # custom code block
1281
+ [nil, converter]
1282
+ else # by name
1283
+ [converter, nil]
1620
1284
  end
1621
1285
  end
1622
1286
  end
@@ -1630,129 +1294,73 @@ class CSV
1630
1294
  #
1631
1295
  def convert_fields(fields, headers = false)
1632
1296
  if headers
1633
- converters = @header_converters
1297
+ header_fields_converter.convert(fields, nil, 0)
1634
1298
  else
1635
- converters = @converters
1636
- if !@use_headers and
1637
- converters.empty? and
1638
- @nil_value.nil? and
1639
- @empty_value_is_empty_string
1640
- return fields
1641
- end
1299
+ fields_converter.convert(fields, @headers, lineno)
1642
1300
  end
1301
+ end
1643
1302
 
1644
- fields.map.with_index do |field, index|
1645
- if field.nil?
1646
- field = @nil_value
1647
- elsif field.empty?
1648
- field = @empty_value unless @empty_value_is_empty_string
1649
- end
1650
- converters.each do |converter|
1651
- break if headers && field.nil?
1652
- field = if converter.arity == 1 # straight field converter
1653
- converter[field]
1654
- else # FieldInfo converter
1655
- header = @use_headers && !headers ? @headers[index] : nil
1656
- converter[field, FieldInfo.new(index, lineno, header)]
1657
- end
1658
- break unless field.is_a? String # short-circuit pipeline for speed
1659
- end
1660
- field # final state of each field, converted or original
1303
+ #
1304
+ # Returns the encoding of the internal IO object.
1305
+ #
1306
+ def raw_encoding
1307
+ if @io.respond_to? :internal_encoding
1308
+ @io.internal_encoding || @io.external_encoding
1309
+ elsif @io.respond_to? :encoding
1310
+ @io.encoding
1311
+ else
1312
+ nil
1661
1313
  end
1662
1314
  end
1663
1315
 
1664
- #
1665
- # This method is used to turn a finished +row+ into a CSV::Row. Header rows
1666
- # are also dealt with here, either by returning a CSV::Row with identical
1667
- # headers and fields (save that the fields do not go through the converters)
1668
- # or by reading past them to return a field row. Headers are also saved in
1669
- # <tt>@headers</tt> for use in future rows.
1670
- #
1671
- # When +nil+, +row+ is assumed to be a header row not based on an actual row
1672
- # of the stream.
1673
- #
1674
- def parse_headers(row = nil)
1675
- if @headers.nil? # header row
1676
- @headers = case @use_headers # save headers
1677
- # Array of headers
1678
- when Array then @use_headers
1679
- # CSV header String
1680
- when String
1681
- self.class.parse_line( @use_headers,
1682
- col_sep: @col_sep,
1683
- row_sep: @row_sep,
1684
- quote_char: @quote_char )
1685
- # first row is headers
1686
- else row
1687
- end
1688
-
1689
- # prepare converted and unconverted copies
1690
- row = @headers if row.nil?
1691
- @headers = convert_fields(@headers, true)
1692
- @headers.each { |h| h.freeze if h.is_a? String }
1693
-
1694
- if @return_headers # return headers
1695
- return self.class::Row.new(@headers, row, true)
1696
- elsif not [Array, String].include? @use_headers.class # skip to field row
1697
- return shift
1698
- end
1316
+ def fields_converter
1317
+ @fields_converter ||= build_fields_converter
1318
+ end
1319
+
1320
+ def build_fields_converter
1321
+ specific_options = {
1322
+ builtin_converters: Converters,
1323
+ }
1324
+ options = @base_fields_converter_options.merge(specific_options)
1325
+ fields_converter = FieldsConverter.new(options)
1326
+ normalize_converters(@initial_converters).each do |name, converter|
1327
+ fields_converter.add_converter(name, &converter)
1699
1328
  end
1329
+ fields_converter
1330
+ end
1700
1331
 
1701
- self.class::Row.new(@headers, convert_fields(row)) # field row
1332
+ def header_fields_converter
1333
+ @header_fields_converter ||= build_header_fields_converter
1702
1334
  end
1703
1335
 
1704
- #
1705
- # This method injects an instance variable <tt>unconverted_fields</tt> into
1706
- # +row+ and an accessor method for +row+ called unconverted_fields(). The
1707
- # variable is set to the contents of +fields+.
1708
- #
1709
- def add_unconverted_fields(row, fields)
1710
- class << row
1711
- attr_reader :unconverted_fields
1336
+ def build_header_fields_converter
1337
+ specific_options = {
1338
+ builtin_converters: HeaderConverters,
1339
+ accept_nil: true,
1340
+ }
1341
+ options = @base_fields_converter_options.merge(specific_options)
1342
+ fields_converter = FieldsConverter.new(options)
1343
+ normalize_converters(@initial_header_converters).each do |name, converter|
1344
+ fields_converter.add_converter(name, &converter)
1712
1345
  end
1713
- row.instance_variable_set(:@unconverted_fields, fields)
1714
- row
1346
+ fields_converter
1715
1347
  end
1716
1348
 
1717
- #
1718
- # This method is an encoding safe version of Regexp::escape(). It will escape
1719
- # any characters that would change the meaning of a regular expression in the
1720
- # encoding of +str+. Regular expression characters that cannot be transcoded
1721
- # to the target encoding will be skipped and no escaping will be performed if
1722
- # a backslash cannot be transcoded.
1723
- #
1724
- def escape_re(str)
1725
- str.gsub(@re_chars) {|c| @re_esc + c}
1349
+ def parser
1350
+ @parser ||= Parser.new(@io, parser_options)
1726
1351
  end
1727
1352
 
1728
- #
1729
- # Builds a regular expression in <tt>@encoding</tt>. All +chunks+ will be
1730
- # transcoded to that encoding.
1731
- #
1732
- def encode_re(*chunks)
1733
- Regexp.new(encode_str(*chunks))
1353
+ def parser_options
1354
+ @parser_options.merge(fields_converter: fields_converter,
1355
+ header_fields_converter: header_fields_converter)
1734
1356
  end
1735
1357
 
1736
- #
1737
- # Builds a String in <tt>@encoding</tt>. All +chunks+ will be transcoded to
1738
- # that encoding.
1739
- #
1740
- def encode_str(*chunks)
1741
- chunks.map { |chunk| chunk.encode(@encoding.name) }.join('')
1358
+ def writer
1359
+ @writer ||= Writer.new(@io, writer_options)
1742
1360
  end
1743
1361
 
1744
- #
1745
- # Returns the encoding of the internal IO object or the +default+ if the
1746
- # encoding cannot be determined.
1747
- #
1748
- def raw_encoding(default = Encoding::ASCII_8BIT)
1749
- if @io.respond_to? :internal_encoding
1750
- @io.internal_encoding || @io.external_encoding
1751
- elsif @io.respond_to? :encoding
1752
- @io.encoding
1753
- else
1754
- default
1755
- end
1362
+ def writer_options
1363
+ @writer_options.merge(header_fields_converter: header_fields_converter)
1756
1364
  end
1757
1365
  end
1758
1366