uncsv 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Configuration options for parsing CSVs. It is a struct-like object with
  # attribute accessors.
  class Config
    # Options that directly map to Std-lib `CSV` options
    CSV_OPTS = %i[
      col_sep row_sep quote_char field_size_limit
    ].freeze

    # The default values applied if an attribute's value is not specified when
    # constructing a new `Config` object.
    DEFAULTS = {
      col_sep: ',',
      expand_headers: false,
      field_size_limit: nil,
      header_rows: [],
      header_separator: '.',
      nil_empty: true,
      normalize_headers: false,
      quote_char: '"',
      row_sep: :auto,
      skip_rows: [],
      skip_blanks: false,
      unique_headers: false
    }.freeze

    # The string that separates each field
    #
    # Default: `","`.
    #
    # @return [String] The column separator string
    # @see (see #initialize)
    attr_accessor :col_sep

    # The maximum size CSV will read ahead looking for a closing quote.
    #
    # Default: `nil`.
    #
    # @return [nil, Integer] The maximum field size
    # @see (see #initialize)
    attr_accessor :field_size_limit

    # Indexes of the rows to use as headers
    #
    # Default: `[]`. Accepts an array of zero-based indexes or a single index.
    # For example, it could be set to `0` to indicate a header in the first row.
    # If set to an array of indexes (`[1,2]`), the header row text will be
    # joined by the `:header_separator`. For example, if the cell (0,0) had
    # the value `"Personal"` and cell (1,0) had the value "Name", the header
    # would become `"Personal.Name"`. Any data above the last header row will be
    # ignored.
    #
    # @return [Array] The header row indexes, sorted ascending
    attr_reader :header_rows

    # The separator between multiple header fields
    #
    # Default: `"."`. When using multiple header rows, this is a string used
    # to separate the individual header fields.
    #
    # @return [String] The separator string
    attr_accessor :header_separator

    # Whether to represent empty cells as `nil`.
    #
    # Default `true`. If `true`, empty cells will be set to `nil`, otherwise,
    # they are set to an empty string.
    #
    # @return [Boolean] Whether empty cells will be `nil`ed
    attr_accessor :nil_empty

    # Whether to rewrite headers to a standard format
    #
    # Default `false`. If set to `true`, header field text will be normalized.
    # The text will be lowercased, and non-alphanumeric characters will be
    # replaced with underscores (`_`).
    #
    # If set to a string, those characters will
    # be replaced with the string instead.
    #
    # If set to a hash, the hash will be treated as options to KeyNormalizer,
    # accepting the `:separator`, and `:downcase` options.
    #
    # If set to another object, it is expected to respond to the
    # `normalize(key)` method by returning a normalized string.
    #
    # @see KeyNormalizer
    # @return [KeyNormalizer, Object] The KeyNormalizer object or equivalent
    attr_reader :normalize_headers

    # The character used to quote individual fields
    #
    # Default `'"'`. Passed through to the Std-lib `CSV` parser.
    #
    # @return [String] The quote character
    # @see (see #initialize)
    attr_accessor :quote_char

    # The string at the end of each row
    #
    # Default `:auto`.
    #
    # @return [:auto, String] The row separator
    # @see (see #initialize)
    attr_accessor :row_sep

    # Whether to skip blank rows
    #
    # Default `false`. If `true`, rows whose fields are all empty will be
    # skipped.
    #
    # @return [Boolean] Whether blank rows will be skipped
    attr_accessor :skip_blanks

    # The row indexes to skip
    #
    # Default `[]`. If set to an array of zero-based row indexes (or a single
    # index), those rows will be skipped. This option does not apply to header
    # rows. The indexes are stored as a lookup table so membership can be
    # checked with `skip_rows[index]`.
    #
    # @return [Hash] A hash mapping each skipped row index to `true`
    attr_reader :skip_rows

    # Whether to force headers to be unique
    #
    # Default `false`. If set to `true`, headers will be forced to be unique by
    # appending numbers to duplicates. For example, if two header cells have the
    # text `"Name"`, the headers will become `"Name.0"`, and `"Name.1"`. The
    # separator between the text and the number can be set using the
    # `:header_separator` option.
    #
    # @return [Boolean] Whether headers will be uniqued
    attr_accessor :unique_headers

    # Create a new `Config` object.
    #
    # Options will be set to the defaults unless overridden by the `opts`
    # parameter. Each option is assigned through its writer method, so a key
    # without a matching writer raises `NoMethodError`.
    #
    # @param opts [Hash] A hash of configuration options. See the individual
    #   attributes for detailed descriptions.
    #
    # @see http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
    #   CSV#new
    def initialize(opts = {})
      DEFAULTS.merge(opts).each { |k, v| public_send("#{k}=", v) }
    end

    # Set the row indexes to skip
    #
    # @param rows [Array, Integer, nil] One index or a list of indexes
    def skip_rows=(rows)
      @skip_rows = Array(rows).each_with_object({}) do |row, lookup|
        lookup[row] = true
      end
    end

    # Set the header row indexes
    #
    # @param rows [Array, Integer, nil] One index or a list of indexes. They
    #   are stored sorted; `nil` is treated as "no header rows".
    def header_rows=(rows)
      @header_rows = Array(rows).sort
    end

    # Set which header rows are expanded
    #
    # @param value [Boolean, Integer, Array] `true` for all header rows,
    #   `false` for none, or the index (or indexes) of the header rows to
    #   expand.
    def expand_headers=(value)
      value = [value] if value.is_a?(Integer)
      @expand_headers = value
    end

    # Set the header normalizer
    #
    # @param value [Boolean, String, Hash, Object] See {#normalize_headers}
    def normalize_headers=(value)
      @normalize_headers =
        case value
        when Hash then KeyNormalizer.new(value)
        when String then KeyNormalizer.new(separator: value)
        when true then KeyNormalizer.new
        else value
        end
    end

    # Which header rows have their blank cells filled from the left.
    #
    # Default `false`. If set to `true`, blank header row cells will assume
    # the header of the cell to their left. This is useful for hierarchical
    # headers where not all the header cells are filled in. If set to an
    # array of header indexes, only the specified headers will be expanded.
    #
    # The indexes are positions within the list of header rows (`0` is the
    # first header row), so `true` resolves to every position regardless of
    # where the header rows sit in the CSV.
    #
    # @return [Array] An array of expanded header indexes
    def expand_headers
      return [] unless @expand_headers
      return (0...header_rows.size).to_a if @expand_headers == true

      @expand_headers
    end

    # Get options passed through to `CSV#new`.
    #
    # @return [Hash] A hash of the CSV options
    # @see (see #initialize)
    def csv_opts
      CSV_OPTS.each_with_object({}) { |k, opts| opts[k] = public_send(k) }
    end
  end
end
@@ -0,0 +1,173 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # A parsed CSV header.
  class Header
    # Create a new `Header` object
    #
    # @param headers [Array<Array<String>>] An array of header row values
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(headers, config = nil)
      @headers = headers
      @config = config || Config.new
      @to_a = nil
    end

    # Iterate over each header field
    #
    # @yield A block to run for each header field
    # @yieldparam field [String, nil] A header field
    # @return [Enumerator] An enumerator over the header fields if no block
    #   is given
    def each(&block)
      to_a.each(&block)
    end

    # Get an array of parsed header fields
    #
    # The header fields are cached, so consecutive calls to this method return
    # the same array.
    #
    # @return [Array] The array of header fields
    def to_a
      @to_a ||= build
    end

    class << self
      # Parse headers from a CSV
      #
      # Reads rows from `csv` up to and including the last header row. A
      # header row index that lies beyond the end of the CSV is treated as an
      # empty header row.
      #
      # @param csv [CSV] A
      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
      # @param config [Config] Configuration options. Default options if `nil`.
      # @return [OpenStruct] An object with the methods `header`, `index`, and
      #   `rows`. `header` is the {Header} object. `index` is the next CSV row
      #   index. `rows` is an array of the skipped rows including the header
      #   rows.
      def parse!(csv, config = nil)
        config ||= Config.new
        header_rows = config.header_rows
        index = header_rows.empty? ? 0 : header_rows.max + 1
        rows = read_rows(csv, index)
        # `csv.shift` yields nil past the end of the data; fall back to an
        # empty row so building the header does not fail on nil.
        headers = header_rows.map { |i| rows[i] || [] }
        OpenStruct.new(header: new(headers, config), index: index, rows: rows)
      end

      private

      # Read a given number of rows from a CSV
      #
      # @param csv [CSV] A
      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object to
      #   read rows from.
      # @param count [Integer] The number of rows to read
      # @return [Array<Array<String>>] An array of the read rows
      def read_rows(csv, count)
        Array.new(count) { csv.shift }
      end
    end

    private

    # Run the full header pipeline: blank cells become `nil`, rows are padded
    # to equal length, optionally normalized and expanded, then combined into
    # one header and optionally made unique.
    #
    # @return [Array<String, nil>] The parsed header fields
    def build
      headers = square(nil_empty(@headers))
      headers = normalize(headers) if @config.normalize_headers
      combined = combine(expand(headers))
      @config.unique_headers ? unique(combined) : combined
    end

    # Combine multiple headers into a single header
    #
    # Joins individual headers with the `header_separator`.
    #
    # @param headers [Array<Array<String>>] The headers to combine
    # @return [Array<String>] The combined header
    def combine(headers)
      separator = @config.header_separator
      headers.each_with_object([]) do |header, combined|
        header.each_with_index do |key, index|
          parts = [combined[index], key].compact
          combined[index] = parts.empty? ? nil : parts.join(separator)
        end
      end
    end

    # Fills in `nil` headers from the left
    #
    # Only header rows whose position is listed in the `expand_headers`
    # option are filled.
    #
    # @param headers [Array<Array<String>>] The headers to expand
    # @return [Array<Array<String>>] The expanded headers
    def expand(headers)
      expandable = @config.expand_headers
      headers.each_with_index.map do |header, index|
        next header unless expandable.include?(index)

        last = nil
        header.map { |key| last = key || last }
      end
    end

    # Unique headers by adding numbers to the end
    #
    # @param combined [Array<String>] The combined headers to unique
    # @return [Array<String>] The uniqued headers
    def unique(combined)
      result = combined.dup
      collate(combined).each do |key, indexes|
        next if indexes.size == 1

        indexes.each_with_index do |index, count|
          result[index] = [key, count].compact.join(@config.header_separator)
        end
      end
      result
    end

    # Create a hash of headers to arrays of their indexes
    #
    # Used for checking for header uniqueness
    #
    # @param header [Array<String>] The combined header to collate
    # @return [Hash] The collated headers
    def collate(header)
      header.each_with_index.with_object({}) do |(key, index), collated|
        (collated[key] ||= []) << index
      end
    end

    # Normalize header values
    #
    # @param headers [Array<Array<String>>] The array of uncombined headers to
    #   normalize
    # @return [Array<Array<String>>] The normalized headers
    def normalize(headers)
      normalizer = @config.normalize_headers
      headers.map { |header| header.map { |key| normalizer.normalize(key) } }
    end

    # Make the headers all the same length
    #
    # Shorter rows are padded with `nil`. The input rows are not modified.
    #
    # @param headers [Array<Array<String>>] An array of headers to square
    # @return [Array<Array<String>>] The squared headers
    def square(headers)
      width = headers.map(&:size).max
      headers.map { |h| h + Array.new(width - h.size) }
    end

    # Convert header empty strings to nil
    #
    # @param headers [Array<Array<String>>] An array of headers to convert
    # @return [Array<Array<String>>] The converted headers
    def nil_empty(headers)
      headers.map { |h| h.map { |k| k == '' ? nil : k } }
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Normalizes strings into a consistent format
  class KeyNormalizer
    # The default values applied if an attribute's value is not specified when
    # constructing a new `KeyNormalizer` object.
    DEFAULTS = {
      downcase: true,
      separator: '_'
    }.freeze

    # A string to replace all non-alphanumeric characters in the key
    #
    # Default: '_'. Can be set to an empty string to remove non-alphanumeric
    # characters without replacing them.
    #
    # @return [String] The separator string
    attr_accessor :separator

    # Sets keys to all lower-case if set to `true`
    #
    # Default: true
    #
    # @return [Boolean] Whether the key will be lower-cased
    attr_accessor :downcase

    # Create a new `KeyNormalizer` object.
    #
    # Options will be set to the defaults unless overridden by the `opts`
    # parameter.
    #
    # @param opts [Hash] A hash of configuration options. See the individual
    #   attributes for detailed descriptions.
    def initialize(opts = {})
      DEFAULTS.merge(opts).each { |k, v| public_send("#{k}=", v) }
    end

    # Normalize a key
    #
    # Replaces each run of non-alphanumeric characters with `separator`, then
    # collapses repeated separators and trims them from the ends of the key.
    # Then the key is lower-cased if `downcase` is set. The input string is
    # not modified.
    #
    # @param key [String, nil] The key field to normalize
    # @return [String, nil] The normalized header field or `nil` if the input
    #   key is `nil`.
    def normalize(key)
      return nil if key.nil?

      normalized = key.gsub(/[^a-z0-9]+/i, separator)
      unless separator.empty?
        # Group the escaped separator so the quantifier applies to the whole
        # separator, not just its final character.
        unit = "(?:#{Regexp.escape(separator)})"
        normalized = normalized.gsub(/#{unit}{2,}/, separator)
        # \A and \z anchor to the whole string; ^ and $ would match per line.
        normalized = normalized.gsub(/\A#{unit}|#{unit}\z/, '')
      end
      downcase ? normalized.downcase : normalized
    end
  end
end
data/lib/uncsv/row.rb ADDED
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # A single data row from a CSV. Fields can be accessed by header or zero-based
  # index.
  class Row
    include Enumerable

    # The headers for each field
    #
    # If a header for a given field is not defined, it will be `nil`.
    #
    # @return [Array] An array of the field headers
    attr_reader :header

    # The fields ordered from left to right
    #
    # An array of zero-indexed field values. If a field is empty it will be
    # `nil`, or `''` if `nil_empty` is `false`.
    #
    # @return [Array] An array of the field values
    attr_reader :fields

    # Create a new `Row` object
    #
    # The `header` and `fields` arrays do not need to be the same length. If
    # they are not, the missing values will be filled with `nil`. Neither
    # argument is modified, so one header array can safely be shared between
    # many rows of differing widths.
    #
    # @param header [Array] The field headers
    # @param fields [Array] The field values
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(header, fields, config = nil)
      @config = config || Config.new
      width = [header.size, fields.size].max
      @header = square(header, width)
      @fields = square(fields, width).map { |f| process(f) }
      @map = Hash[@header.zip(@fields)]
    end

    # Get a field by index or header
    #
    # If `key` is an `Integer`, get a field by a zero-based index. If `key` is a
    # header, access a field by its header. If `key` is nil, returns `nil`. A
    # field that does not exist is treated like an empty field.
    #
    # @param key [Integer, String] The index or header
    # @return [String, nil] The field value if it exists
    def [](key)
      return if key.nil?

      process(key.is_a?(Integer) ? @fields[key] : @map[key])
    end

    # Gets a hash of headers to fields
    #
    # `nil` headers will not be included in the hash.
    #
    # @return [Hash] A hash of headers to fields
    def to_h
      @header.compact.each_with_object({}) { |h, hash| hash[h] = self[h] }
    end

    # Iterate over each pair of headers and fields
    #
    # @yield A block to run for each pair
    # @yieldparam header [String, nil] The field header
    # @yieldparam field [String, nil] The field value
    # @return [Enumerator] An enumerator over each pair if no block is given
    def each(&block)
      @map.each_pair(&block)
    end

    # Get a field by index or header and specify a default
    #
    # Tries to get the field specified by key (see {#[]}). If the field
    # is `nil`, returns the default. If a block is given, the default is the
    # block's return value, otherwise the default is the `default` argument.
    #
    # @param key [Integer, String] The index or header
    # @param default [Object] The value returned when the field is `nil` and
    #   no block is given
    # @yield A block to run if the field is `nil`
    # @yieldparam key [String] The `key` parameter
    # @return [String, Object] The field value or default
    def fetch(key, default = nil)
      value = self[key]
      return value unless value.nil?

      block_given? ? yield(key) : default
    end

    private

    # Pads an array with nil to extend it to the given size
    #
    # The array is returned unchanged when it is already long enough;
    # otherwise a new, padded array is returned. The input is never mutated.
    #
    # @param array [Array] The array to square
    # @param size [Integer] The target array size
    # @return [Array] The squared array
    def square(array, size)
      missing = size - array.size
      missing.positive? ? array + Array.new(missing) : array
    end

    # Transforms a field value according to the config options
    #
    # With `nil_empty` set, empty strings become `nil`; otherwise `nil`
    # becomes an empty string.
    #
    # @param field [String, nil] The field value to process
    # @return [String, nil] The processed field
    def process(field)
      if @config.nil_empty
        field == '' ? nil : field
      else
        field.nil? ? '' : field
      end
    end
  end
end
data/lib/uncsv/rows.rb ADDED
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
class Uncsv
  # A collection of parsed rows from a CSV
  class Rows
    # Create a new `Rows` object
    #
    # @param csv [CSV] A
    #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(csv, config = nil)
      @csv = csv
      @config = config || Config.new
      @started = false
      @parsed = nil
    end

    # Iterate over each row
    #
    # @yield A block to run for each row
    # @yieldparam row [Row] A row object
    # @return [Enumerator] An enumerator over each row
    def each(&block)
      rows = Enumerator.new do |yielder|
        start
        index = parsed.index
        index += 1 while yield_row(yielder, index)
      end
      rows.each(&block)
    end

    # Get the CSV header
    #
    # @return [Array] An array of the CSV header fields
    # @see Header#to_a
    def header
      parsed.header.to_a
    end

    private

    # Whether the given row should be skipped
    #
    # A row is skipped when its index is listed in `skip_rows`, or when
    # `skip_blanks` is set and every field is `nil` or an empty string (so a
    # row of quoted empty fields counts as blank too).
    #
    # @param fields [Array] An array of field values
    # @param index [Integer] The zero-based row index
    # @return [Boolean] Whether the row should be skipped
    def should_skip?(fields, index)
      return true if @config.skip_rows[index]
      return false unless @config.skip_blanks

      fields.all? { |field| field.nil? || field == '' }
    end

    # Yield a row from the CSV to the Enumerator yielder
    #
    # Reads a row from the CSV and yields a parsed row if necessary.
    #
    # @param yielder [Enumerator::Yielder] A yielder to yield the row to
    # @param index [Integer] The next row index
    # @return [Boolean] `false` if the CSV is ended
    def yield_row(yielder, index)
      fields = @csv.shift
      return false unless fields

      yielder << Row.new(header, fields, @config) unless should_skip?(fields, index)
      true
    end

    # Start reading the CSV
    #
    # If the CSV has already been read, it will be rewound and the header will
    # be reset.
    def start
      if @started
        @parsed = nil
        @csv.rewind
      else
        @started = true
      end
    end

    # Get the header parse object
    #
    # The parsed header is cached, so multiple calls will return the same
    # instance.
    #
    # @return [OpenStruct] The parsed header object
    def parsed
      @parsed ||= Header.parse!(@csv, @config)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Version string of the Uncsv library
  VERSION = '0.3.1'
end