csvreader 0.7.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ # encoding: utf-8
2
+
3
class CsvReader

  # A pipeline of value converters.
  #
  # A Converter holds an ordered list of callables. #convert feeds a value
  # through them one after another and stops as soon as the value is no
  # longer a String (that is, as soon as one converter "succeeded").
  #
  # Converters are selected either by name from a registry hash
  # (CONVERTERS for field values, HEADER_CONVERTERS for header names)
  # or passed in directly as custom callables (Proc / lambda / Method).
  class Converter

    # A Regexp used to find and convert some common Date formats.
    DATE_MATCHER = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4}
                        |
                        \d{4}-\d{2}-\d{2} )\z
                   /x

    # A Regexp used to find and convert some common DateTime formats.
    DATE_TIME_MATCHER = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4}
                             |
                             \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}
                             |
                             # ISO-8601
                             \d{4}-\d{2}-\d{2}
                             (?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?(?:[+-]\d{2}(?::\d{2})|Z)?)?)?
                        )\z
                        /x


    # Registry of named value converters.
    #
    # An entry is either a callable taking the (String) value and returning
    # the converted value - or the unchanged value if it does not apply - or
    # an Array of other entry names (a "combo" that expands recursively).
    CONVERTERS = {
      ##
      ## todo/fix: use regex INTEGER_MATCH / FLOAT_MATCH
      ##    to avoid rescue (with exception and stacktrace) for every try!!!
      ## note: the broad rescue is kept on purpose - any failure to convert
      ##    (format or encoding) means "not a number", keep the value as-is
      integer:   ->(value) { Integer( value ) rescue value },
      float:     ->(value) { Float( value ) rescue value },
      numeric:   [:integer, :float],
      date:      ->(value) {
        begin
          value.match?( DATE_MATCHER ) ? Date.parse( value ) : value
        rescue  # date parse errors
          value
        end
      },
      date_time: ->(value) {
        begin
          value.match?( DATE_TIME_MATCHER ) ? DateTime.parse( value ) : value
        rescue  # encoding conversion or date parse errors
          value
        end
      },

      ## new - add null and boolean (any others): why? why not?
      null: ->(value) {
        ## turn empty strings (and some common "not available" markers) into nil
        ##   rename to blank_to_nil or empty_to_nil or add both?
        ##   todo: add NIL, nil too? or #NA, N/A etc. - why? why not?
        if value.empty? || ['NULL', 'null', 'N/A', 'n/a', '#NA', '#na' ].include?( value )
          nil
        else
          value
        end
      },
      boolean: ->(value) {
        ## check yaml for possible true/value values - any missing?
        ##  add more (or less) - why? why not?
        if ['TRUE', 'true', 't', 'ON', 'on', 'YES', 'yes'].include?( value )
          true
        elsif ['FALSE', 'false', 'f', 'OFF', 'off', 'NO', 'no'].include?( value )
          false
        else
          value
        end
      },
      bool: [:boolean],   ## bool convenience alias for boolean

      all:  [:null, :boolean, :date_time, :numeric],
    }


    # Registry of named header converters (same entry shapes as CONVERTERS).
    HEADER_CONVERTERS = {
      downcase: ->(value) { value.downcase },
      ## drop everything but word characters and whitespace,
      ##   then join the remaining words with underscores
      symbol:   ->(value) {
        value.downcase.gsub( /[^\s\w]+/, "" ).strip.gsub( /\s+/, "_" ).to_sym
      }
    }


    # Builds a pipeline whose names are looked up in HEADER_CONVERTERS.
    def self.create_header_converters( converters )
      new( converters, HEADER_CONVERTERS )
    end

    # Builds a pipeline whose names are looked up in CONVERTERS.
    def self.create_converters( converters )
      new( converters, CONVERTERS )
    end


    # converters - nil (empty pipeline), a single converter or an Array of
    #              converters; every converter is either a name found in
    #              registry (Symbol or String) or a custom callable
    #              (Proc / lambda / Method)
    # registry   - hash used to look up converters passed in by name
    #
    # Raises ArgumentError if a name is not found in registry
    # (or a nil entry is passed in inside an Array).
    def initialize( converters, registry=CONVERTERS )
      converters = case converters
                   when nil   then []
                   when Array then converters
                   else            [converters]
                   end

      @converters = []

      converters.each do |converter|
        case converter
        when Proc, Method   # custom code block
          add_converter( registry, &converter )
        else                # by name
          add_converter( converter, registry )
        end
      end
    end

    # Returns the (expanded) list of callables in pipeline order.
    def to_a()   @converters; end   ## todo: rename to/use converters attribute name - why? why not?

    # True if no converter is configured, that is, #convert is a no-op.
    def empty?() @converters.empty?; end

    # Runs value through the pipeline and returns the final value.
    #
    # value           - the raw value; nil is returned unchanged
    # index_or_header - passed along as second argument to converters that
    #                   do not take exactly one argument
    def convert( value, index_or_header=nil )
      return value if value.nil?

      @converters.each do |converter|
        value = if converter.arity == 1  # straight converter
                  converter.call( value )
                else
                  ## note: for CsvReader pass in the zero-based field/column index (integer)
                  ##       for CsvHashReader pass in the header/field/column name (string)
                  converter.call( value, index_or_header )
                end
        break unless value.is_a?( String )  # note: short-circuit pipeline for speed
      end
      value  # final state of value, converted or original
    end


    private

    # Appends one converter to the pipeline.
    #
    # Called either with ( registry, &converter ) for a custom converter
    # or with ( name, registry ) for a named one; named combos (Arrays)
    # get expanded recursively.
    def add_converter( name=nil, registry, &converter )
      if name.nil?       # custom converter
        ## note: fail early - a nil entry in the pipeline would only blow up
        ##   later in convert with a NoMethodError
        raise ArgumentError, "converter expected (name or code block) but got nil"  if converter.nil?
        @converters << converter
        return
      end

      # named converter - allow string names too (e.g. "integer" for :integer)
      combo = registry[name]
      combo = registry[name.to_sym]   if combo.nil? && name.is_a?( String )

      case combo
      when nil           # unknown name
        raise ArgumentError, "unknown converter #{name.inspect}; expected one of #{registry.keys.inspect}"
      when Array         # combo converter
        combo.each do |converter_name|
          add_converter( converter_name, registry )
        end
      else               # individual named converter
        @converters << combo
      end
    end  # method add_converter

  end # class Converter

end # class CsvReader
@@ -2,54 +2,6 @@
2
2
 
3
3
  class CsvReader
4
4
 
5
- class Parser
6
-
7
- ## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
8
- ## parser must support parse method (with and without block)
9
- ## e.g. records = parse( data )
10
- ## -or-
11
- ## parse( data ) do |record|
12
- ## end
13
-
14
-
15
- DEFAULT = ParserStd.new
16
-
17
- RFC4180 = ParserStrict.new
18
- STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
19
- EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
20
-
21
- MYSQL = ParserStrict.new( sep: "\t",
22
- quote: false,
23
- escape: true,
24
- null: "\\N" )
25
-
26
- POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
27
- escape: true,
28
- null: "" )
29
-
30
- POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
31
- quote: false,
32
- escape: true,
33
- null: "\\N" )
34
-
35
- TAB = ParserTab.new
36
-
37
-
38
- def self.default() DEFAULT; end ## alternative alias for DEFAULT
39
- def self.strict() STRICT; end ## alternative alias for STRICT
40
- def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
41
- def self.excel() EXCEL; end ## alternative alias for EXCEL
42
- def self.mysql() MYSQL; end
43
- def self.postgresql() POSTGRESQL; end
44
- def self.postgres() postgresql; end
45
- def self.postgresql_text() POSTGRESQL_TEXT; end
46
- def self.postgres_text() postgresql_text; end
47
- def self.tab() TAB; end
48
-
49
- end # class Parser
50
-
51
-
52
-
53
5
  ####################################
54
6
  # define errors / exceptions
55
7
  # for all parsers for (re)use
@@ -2,31 +2,19 @@
2
2
 
3
3
  class CsvReader
4
4
 
5
- DEFAULT = CsvBuilder.new( Parser::DEFAULT )
6
- STRICT = CsvBuilder.new( Parser::STRICT )
7
- RFC4180 = CsvBuilder.new( Parser::RFC4180 )
8
- EXCEL = CsvBuilder.new( Parser::EXCEL )
9
- TAB = CsvBuilder.new( Parser::TAB )
10
-
11
- def self.default() DEFAULT; end ## alternative alias for DEFAULT
12
- def self.strict() STRICT; end ## alternative alias for RFC4180
13
- def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
14
- def self.excel() EXCEL; end ## alternative alias for EXCEL
15
- def self.tab() TAB; end ## alternative alias for TAB
16
-
17
-
18
-
19
-
20
-
21
5
  #######
22
6
  ## csv reader
23
7
 
24
- def self.open( path, mode='r:bom|utf-8',
8
+ def self.open( path, mode=nil,
25
9
  sep: nil,
26
10
  converters: nil,
27
11
  parser: nil, &block ) ## rename path to filename or name - why? why not?
28
- f = File.open( path, mode )
29
- csv = new(f, sep: sep, converters: converters, parser: parser )
12
+
13
+ ## note: default mode (if nil/not passed in) to 'r:bom|utf-8'
14
+ f = File.open( path, mode ? mode : 'r:bom|utf-8' )
15
+ csv = new(f, sep: sep,
16
+ converters: converters,
17
+ parser: parser )
30
18
 
31
19
  # handle blocks like Ruby's open(), not like the (old old) CSV library
32
20
  if block_given?
@@ -127,13 +115,15 @@ class CsvReader
127
115
  # create the IO object we will read from
128
116
  @io = data.is_a?(String) ? StringIO.new(data) : data
129
117
 
130
- @sep = sep
131
- @converters = converters
118
+ @sep = sep
119
+
120
+ @converters = Converter.create_converters( converters )
132
121
 
133
122
  @parser = parser.nil? ? Parser::DEFAULT : parser
134
123
  end
135
124
 
136
125
 
126
+
137
127
  ### IO and StringIO Delegation ###
138
128
  extend Forwardable
139
129
  def_delegators :@io,
@@ -152,13 +142,24 @@ class CsvReader
152
142
 
153
143
  def each( &block )
154
144
  if block_given?
155
- kwargs = {
156
- ## converters: converters ## todo: add converters
157
- }
145
+ kwargs = {}
158
146
  ## note: only add separator if present/defined (not nil)
159
147
  kwargs[:sep] = @sep if @sep && @parser.respond_to?( :'sep=' )
160
148
 
161
- @parser.parse( @io, kwargs, &block )
149
+ ## check array / pipeline of converters is empty (size=0 e.g. is [])
150
+ if @converters.empty?
151
+ @parser.parse( @io, kwargs, &block )
152
+ else
153
+ ## add "post"-processing with converters pipeline
154
+ ## that is, convert all strings to integer, float, date, ... if wanted
155
+ @parser.parse( @io, kwargs ) do |raw_record|
156
+ record = []
157
+ raw_record.each_with_index do | value, i |
158
+ record << @converters.convert( value, i )
159
+ end
160
+ block.call( record )
161
+ end
162
+ end
162
163
  else
163
164
  to_enum
164
165
  end
@@ -2,87 +2,188 @@
2
2
 
3
3
  class CsvHashReader
4
4
 
5
- def initialize( parser )
6
- @parser = parser
7
- end
8
5
 
9
- DEFAULT = new( CsvReader::Parser::DEFAULT )
10
- STRICT = new( CsvReader::Parser::STRICT )
11
- RFC4180 = new( CsvReader::Parser::RFC4180 )
6
+ def self.open( path, mode=nil,
7
+ headers: nil,
8
+ sep: nil,
9
+ converters: nil,
10
+ header_converters: nil,
11
+ parser: nil, &block ) ## rename path to filename or name - why? why not?
12
+
13
+ ## note: default mode (if nil/not passed in) to 'r:bom|utf-8'
14
+ f = File.open( path, mode ? mode : 'r:bom|utf-8' )
15
+ csv = new(f, headers: headers,
16
+ sep: sep,
17
+ converters: converters,
18
+ header_converters: header_converters,
19
+ parser: parser )
20
+
21
+ # handle blocks like Ruby's open(), not like the (old old) CSV library
22
+ if block_given?
23
+ begin
24
+ block.call( csv )
25
+ ensure
26
+ csv.close
27
+ end
28
+ else
29
+ csv
30
+ end
31
+ end # method self.open
32
+
33
+
34
+ def self.read( path, headers: nil,
35
+ sep: nil,
36
+ converters: nil,
37
+ header_converters: nil,
38
+ parser: nil )
39
+ open( path,
40
+ headers: headers,
41
+ sep: sep,
42
+ converters: converters,
43
+ header_converters: header_converters,
44
+ parser: parser ) { |csv| csv.read }
45
+ end
12
46
 
13
- def self.default() DEFAULT; end ## alternative alias for DEFAULT
14
- def self.strict() STRICT; end ## alternative alias for STRICT
15
- def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
16
47
 
17
48
 
49
+ def self.foreach( path, headers: nil,
50
+ sep: nil,
51
+ converters: nil,
52
+ header_converters: nil,
53
+ parser: nil, &block )
54
+ csv = open( path,
55
+ headers: headers,
56
+ sep: sep,
57
+ converters: converters,
58
+ header_converters: header_converters,
59
+ parser: parser )
18
60
 
19
- def self.parse( data, sep: nil, headers: nil, &block )
20
- DEFAULT.parse( data, sep: sep, headers: headers, &block )
61
+ if block_given?
62
+ begin
63
+ csv.each( &block )
64
+ ensure
65
+ csv.close
66
+ end
67
+ else
68
+ csv.to_enum ## note: caller (responsible) must close file!!!
69
+ ## remove version without block given - why? why not?
70
+ ## use Csv.open().to_enum or Csv.open().each
71
+ ## or Csv.new( File.new() ).to_enum or Csv.new( File.new() ).each ???
21
72
  end
73
+ end # method self.foreach
22
74
 
23
- def self.read( path, sep: nil, headers: nil )
24
- DEFAULT.read( path, sep: sep, headers: headers )
25
- end
26
75
 
27
- def self.foreach( path, sep: nil, headers: nil, &block )
28
- DEFAULT.foreach( path,sep: sep, headers: headers, &block )
76
+ def self.parse( data, headers: nil,
77
+ sep: nil,
78
+ converters: nil,
79
+ header_converters: nil,
80
+ parser: nil, &block )
81
+ csv = new( data,
82
+ headers: headers,
83
+ sep: sep,
84
+ converters: converters,
85
+ header_converters: header_converters,
86
+ parser: parser )
87
+
88
+ if block_given?
89
+ csv.each( &block ) ## note: caller (responsible) must close file!!! - add autoclose - why? why not?
90
+ else # slurp contents, if no block is given
91
+ csv.read ## note: caller (responsible) must close file!!! - add autoclose - why? why not?
29
92
  end
93
+ end # method self.parse
30
94
 
31
95
 
32
96
 
33
- #############################
34
- ## all "high-level" reader methods
35
- ##
36
97
 
37
- def parse( data, sep: nil, headers: nil, &block )
38
- if block_given?
39
- parse_lines( data, sep: sep, headers: headers, &block )
40
- else
41
- records = []
42
- parse_lines( data, sep: sep, headers: headers ) do |record|
43
- records << record
44
- end
45
- records
46
- end
47
- end
48
98
 
99
+ def initialize( data, headers: nil, sep: nil,
100
+ converters: nil,
101
+ header_converters: nil,
102
+ parser: nil )
103
+ raise ArgumentError.new( "Cannot parse nil as CSV" ) if data.nil?
104
+ ## todo: use (why? why not) - raise ArgumentError, "Cannot parse nil as CSV" if data.nil?
49
105
 
50
- def read( path, sep: nil, headers: nil )
51
- txt = File.open( path, 'r:bom|utf-8' ).read
52
- parse( txt, sep: sep, headers: headers )
53
- end
106
+ # create the IO object we will read from
107
+ @io = data.is_a?(String) ? StringIO.new(data) : data
54
108
 
109
+ ## pass in headers as array e.g. ['A', 'B', 'C']
110
+ ## double check: run header_converters on passed in headers?
111
+ ## for now - do NOT auto-convert passed in headers - keep them as-is (1:1)
112
+ @names = headers ? headers : nil
55
113
 
56
- def foreach( path, sep: nil, headers: nil, &block )
57
- File.open( path, 'r:bom|utf-8' ) do |file|
58
- parse_lines( file, sep: sep, headers: headers, &block )
59
- end
60
- end
114
+ @sep = sep
61
115
 
116
+ @converters = CsvReader::Converter.create_converters( converters )
117
+ @header_converters = CsvReader::Converter.create_header_converters( header_converters )
62
118
 
63
- private
119
+ @parser = parser.nil? ? CsvReader::Parser::DEFAULT : parser
120
+ end
64
121
 
65
- ####################
66
- ## parse_lines helper method to keep in one (central) place only (for easy editing/changing)
67
- ## - builds key/value pairs
68
122
 
69
- def parse_lines( data, sep: nil, headers: nil, &block)
70
- ## pass in headers as array e.g. ['A', 'B', 'C']
71
- names = headers ? headers : nil
72
123
 
73
- kwargs = {
74
- ## converters: converters ## todo: add converters
75
- }
76
- kwargs[:sep] = sep if sep && @parser.respond_to?( :'sep=' ) ## note: only add separator if present/defined (not nil)
124
+ ### IO and StringIO Delegation ###
125
+ extend Forwardable
126
+ def_delegators :@io,
127
+ :close, :closed?, :eof, :eof?
128
+
129
+ ## add more - why? why not?
130
+ ## def_delegators :@io, :binmode, :binmode?, :close, :close_read, :close_write,
131
+ ## :closed?, :eof, :eof?, :external_encoding, :fcntl,
132
+ ## :fileno, :flock, :flush, :fsync, :internal_encoding,
133
+ ## :ioctl, :isatty, :path, :pid, :pos, :pos=, :reopen,
134
+ ## :seek, :stat, :string, :sync, :sync=, :tell, :to_i,
135
+ ## :to_io, :truncate, :tty?
136
+
137
+
138
+ include Enumerable
139
+
140
+
141
+ def each( &block )
142
+
143
+ ## todo/fix:
144
+ ## add case for headers/names.size != values.size
145
+ ## - add rest option? for if less headers than values (see python csv.DictReader - why? why not?)
146
+ ##
147
+ ## handle case with duplicate and empty header names etc.
148
+
149
+
150
+ if block_given?
151
+ kwargs = {}
152
+ ## note: only add separator if present/defined (not nil)
153
+ kwargs[:sep] = @sep if @sep && @parser.respond_to?( :'sep=' )
154
+
155
+ @parser.parse( @io, kwargs ) do |raw_values| # sep: sep
156
+ if @names.nil? ## check for (first) headers row
157
+ if @header_converters.empty?
158
+ @names = raw_values ## store header row / a.k.a. field/column names
159
+ else
160
+ values = []
161
+ raw_values.each_with_index do |value,i|
162
+ values << @header_converters.convert( value, i )
163
+ end
164
+ @names = values
165
+ end
166
+ else ## "regular" record
167
+ raw_record = @names.zip( raw_values ).to_h ## todo/fix: check for more values than names/headers!!!
168
+ if @converters.empty?
169
+ block.call( raw_record )
170
+ else
171
+ ## add "post"-processing with converters pipeline
172
+ ## that is, convert all strings to integer, float, date, ... if wanted
173
+ record = {}
174
+ raw_record.each do | key, value |
175
+ record[ key ] = @converters.convert( value, key )
176
+ end
177
+ block.call( record )
178
+ end
179
+ end
180
+ end
181
+ else
182
+ to_enum
183
+ end
184
+ end # method each
185
+
186
+ def read() to_a; end # method read
77
187
 
78
- @parser.parse( data, kwargs ) do |values| # sep: sep
79
- if names.nil?
80
- names = values ## store header row / a.k.a. field/column names
81
- else
82
- record = names.zip( values ).to_h ## todo/fix: check for more values than names/headers!!!
83
- block.call( record )
84
- end
85
- end
86
- end
87
188
 
88
189
  end # class CsvHashReader