uncsv 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Configuration options for parsing CSVs. It is a struct-like object with
  # attribute accessors.
  class Config
    # Options that directly map to Std-lib `CSV` options
    CSV_OPTS = %i[
      col_sep row_sep quote_char field_size_limit
    ].freeze

    # The default values applied if an attribute's value is not specified when
    # constructing a new `Config` object.
    DEFAULTS = {
      col_sep: ',',
      expand_headers: false,
      field_size_limit: nil,
      header_rows: [],
      header_separator: '.',
      nil_empty: true,
      normalize_headers: false,
      quote_char: '"',
      row_sep: :auto,
      skip_rows: [],
      skip_blanks: false,
      unique_headers: false
    }.freeze

    # The string that separates each field
    #
    # Default: `","`.
    #
    # @return [String] The column separator string
    # @see (see #initialize)
    attr_accessor :col_sep

    # @!attribute expand_headers
    #   Whether to fill empty headers with values from the left.
    #
    #   Default `false`. If set to `true`, blank header row cells will assume
    #   the header of the row to their left. This is useful for hierarchical
    #   headers where not all the header cells are filled in. If set to an
    #   array of header indexes, only the specified headers will be expanded.
    #
    #   @return [Array] An array of expanded header indexes

    # The maximum size CSV will read ahead looking for a closing quote.
    #
    # Default: `nil`.
    #
    # @return [nil, Integer] The maximum field size
    # @see (see #initialize)
    attr_accessor :field_size_limit

    # Indexes of the rows to use as headers
    #
    # Default: `[]`. Accepts an array of zero-based indexes or a single index.
    # For example, it could be set to `0` to indicate a header in the first row.
    # If set to an array of indexes (`[1,2]`), the header row text will be
    # joined by the `:header_separator`. For example, if the cell (0,0) had
    # the value `"Personal"` and cell (1,0) had the value "Name", the header
    # would become `"Personal.Name"`. Any data above the last header row will be
    # ignored.
    #
    # @return [Array] The header row indexes, sorted ascending
    attr_reader :header_rows

    # The separator between multiple header fields
    #
    # Default: `"."`. When using multiple header rows, this is a string used
    # to separate the individual header fields.
    #
    # @return [String] The separator string
    attr_accessor :header_separator

    # Whether to represent empty cells as `nil`.
    #
    # Default `true`. If `true`, empty cells will be set to `nil`, otherwise,
    # they are set to an empty string.
    #
    # @return [Boolean] Whether empty cells will be `nil`ed
    attr_accessor :nil_empty

    # Whether to rewrite headers to a standard format
    #
    # Default `false`. If set to `true`, header field text will be normalized.
    # The text will be lowercased, and non-alphanumeric characters will be
    # replaced with underscores (`_`).
    #
    # If set to a string, those characters will
    # be replaced with the string instead.
    #
    # If set to a hash, the hash will be treated as options to KeyNormalizer,
    # accepting the `:separator`, and `:downcase` options.
    #
    # If set to another object, it is expected to respond to the
    # `normalize(key)` method by returning a normalized string.
    #
    # @see KeyNormalizer
    # @return [KeyNormalizer, Object, false] The KeyNormalizer object or
    #   equivalent, or `false` when normalization is disabled
    attr_reader :normalize_headers

    # The character used to quote individual fields
    #
    # Default `'"'`. Passed through to the Std-lib `CSV` parser (see
    # {#csv_opts}).
    #
    # @return [String] The quote character
    # @see (see #initialize)
    attr_accessor :quote_char

    # The string at the end of each row
    #
    # Default `:auto`.
    #
    # @return [:auto, String] The row separator
    # @see (see #initialize)
    attr_accessor :row_sep

    # Whether to skip blank rows
    #
    # Default `false`. If `true`, rows whose fields are all empty will be
    # skipped.
    #
    # @return [Boolean] Whether blank rows will be skipped
    attr_accessor :skip_blanks

    # The row indexes to skip
    #
    # Default `[]`. If set to an array of zero-based row indexes, those rows
    # will be skipped. This option does not apply to header rows.
    #
    # The indexes are stored as the keys of a hash (each mapped to `true`) so
    # that a row index can be looked up directly with `skip_rows[index]`.
    #
    # @return [Hash] A hash whose keys are the row indexes to skip
    attr_reader :skip_rows

    # Whether to force headers to be unique
    #
    # Default `false`. If set to `true`, headers will be forced to be unique by
    # appending numbers to duplicates. For example, if two header cells have the
    # text `"Name"`, the headers will become `"Name.0"`, and `"Name.1"`. The
    # separator between the text and the number can be set using the
    # `:header_separator` option.
    #
    # @return [Boolean] Whether headers will be uniqued
    attr_accessor :unique_headers

    # Create a new `Config` object.
    #
    # Options will be set to the defaults unless overridden by the `opts`
    # parameter. Every option is applied through its attribute writer, so an
    # unknown option name raises `NoMethodError`.
    #
    # @param opts [Hash] A hash of configuration options. See the individual
    #   attributes for detailed descriptions.
    #
    # @see http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
    #   CSV#new
    def initialize(opts = {})
      DEFAULTS.merge(opts).each { |name, value| public_send("#{name}=", value) }
    end

    # Set the rows to skip.
    #
    # @param rows [Integer, Array<Integer>, Range, nil] A single zero-based
    #   row index or a collection of them. `nil` means "skip nothing".
    def skip_rows=(rows)
      # Kernel#Array turns nil into [], leaves arrays alone, expands ranges
      # and wraps a bare index.
      @skip_rows = Array(rows).each_with_object({}) do |row, lookup|
        lookup[row] = true
      end
    end

    # Set the header rows. The indexes are stored sorted ascending.
    #
    # @param rows [Integer, Array<Integer>, Range, nil] A single zero-based
    #   row index or a collection of them. `nil` means "no header rows".
    def header_rows=(rows)
      @header_rows = Array(rows).sort
    end

    # Set which header rows are expanded.
    #
    # @param value [Boolean, Integer, Array<Integer>, nil] `true` for every
    #   header row, `false`/`nil` for none, or the index(es) to expand. A
    #   single integer is wrapped in an array.
    def expand_headers=(value)
      value = [value] if value.is_a?(Integer)
      @expand_headers = value
    end

    # Set the header normalizer.
    #
    # A hash, string or `true` is converted to a `KeyNormalizer`; any other
    # value (including `false`) is stored unchanged.
    #
    # @param value [Boolean, String, Hash, Object] See {#normalize_headers}
    def normalize_headers=(value)
      @normalize_headers =
        case value
        when Hash then KeyNormalizer.new(value)
        when String then KeyNormalizer.new(separator: value)
        when true then KeyNormalizer.new
        else value
        end
    end

    # Get the header rows that will be expanded.
    #
    # @return [Array] All of {#header_rows} when set to `true`, an empty array
    #   when set to `false` or `nil`, otherwise the configured value
    def expand_headers
      case @expand_headers
      when true then header_rows
      when false, nil then []
      else @expand_headers
      end
    end

    # Get options passed through to `CSV#new`.
    #
    # @return [Hash] A hash of the CSV options
    # @see (see #initialize)
    def csv_opts
      CSV_OPTS.each_with_object({}) do |name, options|
        options[name] = public_send(name)
      end
    end
  end
end
@@ -0,0 +1,173 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # A parsed CSV header.
  class Header
    # Create a new `Header` object
    #
    # @param headers [Array<Array<String>>] An array of header row values
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(headers, config = nil)
      @headers = headers
      @config = config || Config.new
      @to_a = nil
    end

    # Iterate over each header field
    #
    # @yield A block to run for each header field
    # @yieldparam field [String, nil] A header field
    # @return [Array, Enumerator] The header fields when a block is given,
    #   otherwise an enumerator over them
    def each(&block)
      to_a.each(&block)
    end

    # Get an array of parsed header fields
    #
    # The header rows are cleaned up, optionally normalized and expanded,
    # combined into one row and optionally made unique. The result is cached,
    # so consecutive calls to this method return the same array.
    #
    # @return [Array] The array of header fields
    def to_a
      @to_a ||= begin
        rows = square(nil_empty(@headers))
        rows = normalize(rows) if @config.normalize_headers
        combined = combine(expand(rows))
        @config.unique_headers ? unique(combined) : combined
      end
    end

    class << self
      # Parse headers from a CSV
      #
      # Reads every row up to and including the last configured header row.
      # If the CSV ends early the missing rows are `nil` in `rows`; a missing
      # header row contributes no header text.
      #
      # NOTE(review): this method uses `OpenStruct` but the file shown does not
      # require 'ostruct' itself -- presumably it is loaded elsewhere; confirm.
      #
      # @param csv [CSV] A
      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
      # @param config [Config] Configuration options.
      # @return [OpenStruct] An object with the methods `header`, `index`, and
      #   `rows`. `header` is the {Header} object. `index` is the next CSV row
      #   index. `rows` is an array of the skipped rows including the header
      #   rows.
      def parse!(csv, config)
        header_rows = config.header_rows
        index = header_rows.empty? ? 0 : header_rows.max + 1
        rows = read_rows(csv, index)
        OpenStruct.new(
          header: new(header_rows.map { |i| rows[i] }, config),
          index: index,
          rows: rows
        )
      end

      private

      # Read a given number of rows from a CSV
      #
      # @param csv [CSV] A
      #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object to
      #   read rows from.
      # @param count [Integer] The number of rows to read
      # @return [Array<Array<String>>] An array of the read rows
      def read_rows(csv, count)
        Array.new(count) { csv.shift }
      end
    end

    private

    # Combine multiple headers into a single header
    #
    # Joins individual headers with the `header_separator`. `nil` parts are
    # left out; a column whose parts are all `nil` stays `nil`.
    #
    # @param headers [Array<Array<String>>] The headers to combine
    # @return [Array<String>] The combined header
    def combine(headers)
      headers.each_with_object([]) do |header, combined|
        header.each_with_index do |key, index|
          parts = [combined[index], key].compact
          combined[index] =
            parts.empty? ? nil : parts.join(@config.header_separator)
        end
      end
    end

    # Fills in `nil` headers from the left
    #
    # Only the header rows whose position is listed in the config's
    # `expand_headers` are changed.
    #
    # @param headers [Array<Array<String>>] The headers to expand
    # @return [Array<Array<String>>] The expanded headers
    def expand(headers)
      expandable = @config.expand_headers
      headers.each_with_index.map do |header, index|
        next header unless expandable.include?(index)

        last = nil
        # Remember the most recent non-nil cell and reuse it for blanks.
        header.map { |key| key ? (last = key) : last }
      end
    end

    # Unique headers by adding numbers to the end
    #
    # @param combined [Array<String>] The combined headers to unique
    # @return [Array<String>] The uniqued headers
    def unique(combined)
      result = combined.dup
      collate(combined).each do |key, indexes|
        next if indexes.size == 1

        indexes.each_with_index do |index, count|
          # compact drops a nil key, so duplicated blank headers become just
          # the counter.
          result[index] = [key, count].compact.join(@config.header_separator)
        end
      end
      result
    end

    # Create a hash of headers to arrays of their indexes
    #
    # Used for checking for header uniqueness
    #
    # @param header [Array<String>] The combined header to collate
    # @return [Hash] The collated headers
    def collate(header)
      header.each_with_index.each_with_object({}) do |(key, index), collated|
        (collated[key] ||= []) << index
      end
    end

    # Normalize header values
    #
    # @param headers [Array<Array<String>>] The array of uncombined headers to
    #   normalize
    # @return [Array<Array<String>>] The normalized headers
    def normalize(headers)
      normalizer = @config.normalize_headers
      headers.map { |header| header.map { |key| normalizer.normalize(key) } }
    end

    # Make the headers all the same length
    #
    # Shorter rows are padded on the right with `nil`. New arrays are
    # returned; the input rows are not modified.
    #
    # @param headers [Array<Array<String>>] An array of headers to square
    # @return [Array<Array<String>>] The squared headers
    def square(headers)
      width = headers.map(&:size).max
      headers.map { |row| row + Array.new(width - row.size) }
    end

    # Convert header empty strings to nil
    #
    # A header row that is itself `nil` (the CSV ended before that row) is
    # treated as an empty row.
    #
    # @param headers [Array<Array<String>, nil>] An array of headers to convert
    # @return [Array<Array<String>>] The converted headers
    def nil_empty(headers)
      headers.map do |row|
        (row || []).map { |cell| cell == '' ? nil : cell }
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Normalizes strings into a consistent format
  class KeyNormalizer
    # The default values applied if an attribute's value is not specified when
    # constructing a new `KeyNormalizer` object.
    DEFAULTS = {
      downcase: true,
      separator: '_'
    }.freeze

    # Matches each run of characters that are not ASCII letters or digits.
    NON_ALPHANUMERIC = /[^a-z0-9]+/i.freeze

    # A string to replace all non-alphanumeric characters in the key
    #
    # Default: '_'. Can be set to an empty string to remove non-alphanumeric
    # characters without replacing them.
    #
    # @return [String] The separator string
    attr_accessor :separator

    # Sets keys to all lower-case if set to `true`
    #
    # Default: true
    #
    # @return [Boolean] Whether the key will be lower-cased
    attr_accessor :downcase

    # Create a new `KeyNormalizer` object.
    #
    # Options will be set to the defaults unless overridden by the `opts`
    # parameter.
    #
    # @param opts [Hash] A hash of configuration options. See the individual
    #   attributes for detailed descriptions.
    def initialize(opts = {})
      DEFAULTS.merge(opts).each { |name, value| public_send("#{name}=", value) }
    end

    # Normalize a key
    #
    # Replaces each run of non-alphanumeric characters with `separator`, then
    # collapses repeated separators and trims a separator from each end of the
    # key. Then the key is lower-cased if `downcase` is set. The input string
    # is not modified.
    #
    # @param key [String, nil] The key field to normalize
    # @return [String, nil] The normalized header field or `nil` if the input
    #   key is `nil`.
    def normalize(key)
      return nil if key.nil?

      # Block replacements insert the separator literally; a replacement
      # *string* would have backslash sequences such as `\0` interpreted as
      # back-references.
      normalized = key.gsub(NON_ALPHANUMERIC) { separator }
      unless separator.empty?
        # Group the escaped separator so quantifiers and anchors apply to the
        # whole separator, not only to its last character.
        sep = "(?:#{Regexp.escape(separator)})"
        normalized.gsub!(/#{sep}{2,}/) { separator }
        normalized.gsub!(/\A#{sep}|#{sep}\z/, '')
      end
      normalized.downcase! if downcase
      normalized
    end
  end
end
data/lib/uncsv/row.rb ADDED
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # A single data row from a CSV. Fields can be accessed by header or zero-based
  # index.
  class Row
    include Enumerable

    # The headers for each field
    #
    # If a header for a given field is not defined, it will be `nil`.
    #
    # @return [Array] An array of the field headers
    attr_reader :header

    # The fields ordered from left to right
    #
    # An array of zero-indexed field values. If a field is empty it will be
    # `nil`, or `''` if `nil_empty` is `false`.
    #
    # @return [Array] An array of the field values
    attr_reader :fields

    # Create a new `Row` object
    #
    # The `header` and `fields` arrays do not need to be the same length. If
    # they are not, the missing values will be filled with `nil`. The arrays
    # passed in are never modified, so one header array can safely be shared
    # between many rows.
    #
    # @param header [Array] The field headers
    # @param fields [Array] The field values
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(header, fields, config = nil)
      @config = config || Config.new
      width = [header.size, fields.size].max
      @header = square(header, width)
      @fields = square(fields, width).map { |field| process(field) }
      # Later columns win when two columns share a header (including `nil`).
      @map = Hash[@header.zip(@fields)]
    end

    # Get a field by index or header
    #
    # If `key` is an `Integer`, get a field by a zero-based index. If `key` is a
    # header, access a field by its header. If `key` is nil, will return `nil`.
    # A field that does not exist is treated like an empty field: `nil`, or
    # `''` if `nil_empty` is `false`.
    #
    # @param key [Integer, String] The index or header
    # @return [String, nil] The field value if it exists
    def [](key)
      return if key.nil?

      process(key.is_a?(Integer) ? @fields[key] : @map[key])
    end

    # Gets a hash of headers to fields
    #
    # `nil` headers will not be included in the hash.
    #
    # @return [Hash] A hash of headers to fields
    def to_h
      @header.compact.each_with_object({}) do |name, hash|
        hash[name] = self[name]
      end
    end

    # Iterate over each pair of headers and fields
    #
    # @yield A block to run for each pair
    # @yieldparam header [String, nil] The field header
    # @yieldparam field [String, nil] The field value
    # @return [Hash, Enumerator] The header-to-field map when a block is given,
    #   otherwise an enumerator over each pair
    def each(&block)
      @map.each_pair(&block)
    end

    # Get a field by index or header and specify a default
    #
    # Tries to get the field specified by key (see {#[]}). If the field
    # is `nil`, returns the default. If a block is given, the default is the
    # block's return value, otherwise the default is the `default` argument.
    #
    # @param key [Integer, String] The index or header
    # @param default [Object] The value returned when the field is `nil` and
    #   no block is given
    # @yield A block to run if the field is `nil`
    # @yieldparam key [String] The `key` parameter
    # @return [String, Object] The field value or default
    def fetch(key, default = nil)
      value = self[key]
      return value unless value.nil?

      block_given? ? yield(key) : default
    end

    private

    # Returns a copy of an array extended with nil to the given size
    #
    # The input array is left untouched. If it is already at least `size`
    # long the copy has the same elements.
    #
    # @param array [Array] The array to square
    # @param size [Integer] The target array size
    # @return [Array] The squared array
    def square(array, size)
      missing = size - array.size
      missing.positive? ? array + Array.new(missing) : array.dup
    end

    # Transforms a field value according to the config options
    #
    # @param field [String, nil] The field value to process
    # @return [String, nil] `nil` for an empty field when `nil_empty` is set,
    #   `''` for an empty field otherwise, or the field unchanged
    def process(field)
      if @config.nil_empty
        field == '' ? nil : field
      else
        field.nil? ? '' : field
      end
    end
  end
end
data/lib/uncsv/rows.rb ADDED
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
class Uncsv
  # A collection of parsed rows from a CSV
  class Rows
    # Create a new `Rows` object
    #
    # @param csv [CSV] A
    #   {http://ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html CSV} object.
    # @param config [Config] Configuration options. Default options if `nil`.
    def initialize(csv, config = nil)
      @csv = csv
      @config = config || Config.new
      @started = false
      @parsed = nil
    end

    # Iterate over each row
    #
    # Nothing is read from the CSV until iteration actually begins. Each new
    # iteration after the first rewinds the CSV and re-reads the header.
    #
    # @yield A block to run for each row
    # @yieldparam row [Row] A row object
    # @return [Enumerator] An enumerator over each row when no block is given
    def each(&block)
      rows = Enumerator.new do |yielder|
        start
        index = parsed.index
        index += 1 while yield_row(yielder, index)
      end
      rows.each(&block)
    end

    # Get the CSV header
    #
    # @return [Array] An array of the CSV header fields
    # @see Header#to_a
    def header
      parsed.header.to_a
    end

    private

    # Whether the given row should be skipped
    #
    # A row is blank when every field is `nil` or an empty string, so a row
    # made only of quoted empty fields (`""`) counts as blank too.
    #
    # @param fields [Array] An array of field values
    # @param index [Integer] The zero-based row index
    # @return [Boolean] Whether the row should be skipped
    def should_skip?(fields, index)
      return true if @config.skip_rows[index]

      @config.skip_blanks ? fields.all? { |f| f.nil? || f == '' } : false
    end

    # Yield a row from the CSV to the Enumerator yielder
    #
    # Reads a row from the CSV and yields a parsed row if necessary.
    #
    # @param yielder [Enumerator::Yielder] A yielder to yield the row to
    # @param index [Integer] The next row index
    # @return [Boolean] `false` if the CSV is ended
    def yield_row(yielder, index)
      fields = @csv.shift
      return false unless fields

      yielder << Row.new(header, fields, @config) unless should_skip?(fields, index)
      true
    end

    # Start reading the CSV
    #
    # If the CSV has already been read, it will be rewound and the header will
    # be reset.
    def start
      if @started
        @parsed = nil
        @csv.rewind
      else
        @started = true
      end
    end

    # Get the header parse object
    #
    # The parsed header is cached, so multiple calls will return the same
    # instance.
    #
    # @return [OpenStruct] The parsed header object
    def parsed
      @parsed ||= Header.parse!(@csv, @config)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
class Uncsv
  # Version string of this release of the Uncsv library
  VERSION = '0.3.1'
end