csv2avro 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,434 @@
1
+ require 'spec_helper'
2
+
3
# End-to-end examples for CSV2Avro::Converter: delimited text goes in through
# `reader`, Avro records come out through `writer`, and rejected input is
# reported through `bad_rows_writer` / `error_writer`.
RSpec.describe CSV2Avro::Converter do
  describe '#read' do
    # Collaborators shared by every context below. A context only has to
    # supply `schema_io` (the Avro schema as JSON) and `reader` (the input),
    # and may override `converter_options`.
    let(:schema) { CSV2Avro::Schema.new(schema_io) }
    let(:writer) { CSV2Avro::AvroWriter.new(StringIO.new, schema) }
    let(:bad_rows_writer) { StringIO.new }
    let(:error_writer) { StringIO.new }
    let(:converter_options) { {} }

    # The conversion runs before each example, so the examples only inspect
    # the three output sinks.
    before do
      CSV2Avro::Converter.new(reader, writer, bad_rows_writer, error_writer, converter_options, schema: schema).convert
    end

    # NOTE(review): StringIO#read starts at the stream's current position. If
    # the converter never rewinds these sinks after writing, both examples
    # pass no matter what was written; `string` (as used in the
    # 'data with bad rows' context) may be what is intended. Confirm against
    # the converter before tightening.
    shared_examples 'a conversion without rejected rows' do
      it 'should not have any bad rows' do
        expect(bad_rows_writer.read).to eq("")
      end

      it 'should not have any errors' do
        expect(error_writer.read).to eq("")
      end
    end

    context 'schema with string and integer columns' do
      let(:schema_io) do
        StringIO.new(
          {
            name: 'categories',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'name', type: 'string' },
              { name: 'description', type: ['string', 'null'] }
            ]
          }.to_json
        )
      end

      context 'separated with commas (csv)' do
        let(:reader) do
          csv_text = CSV.generate do |csv|
            csv << %w[id name description]
            csv << %w[1 dresses Dresses]
            csv << %w[2 female-tops]
          end

          StringIO.new(csv_text)
        end

        include_examples 'a conversion without rejected rows'

        it 'should store the data with the given schema' do
          expect(AvroReader.new(writer).read).to eq(
            [
              { 'id' => 1, 'name' => 'dresses', 'description' => 'Dresses' },
              { 'id' => 2, 'name' => 'female-tops', 'description' => nil }
            ]
          )
        end
      end

      context 'separated with tabs (tsv)' do
        let(:converter_options) { { delimiter: "\t" } }

        let(:reader) do
          # Options are passed as keywords: a positional Hash is treated as
          # the output string argument on Ruby 3 and raises.
          tsv_text = CSV.generate(col_sep: "\t") do |csv|
            csv << %w[id name description]
            csv << %w[1 dresses Dresses]
            csv << %w[2 female-tops]
          end

          StringIO.new(tsv_text)
        end

        include_examples 'a conversion without rejected rows'

        it 'should store the data with the given schema' do
          expect(AvroReader.new(writer).read).to eq(
            [
              { 'id' => 1, 'name' => 'dresses', 'description' => 'Dresses' },
              { 'id' => 2, 'name' => 'female-tops', 'description' => nil }
            ]
          )
        end
      end
    end

    context 'schema with boolean and array columns' do
      let(:schema_io) do
        StringIO.new(
          {
            name: 'categories',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'enabled', type: ['boolean', 'null'] },
              { name: 'image_links', type: [{ type: 'array', items: 'string' }, 'null'] }
            ]
          }.to_json
        )
      end

      # In both contexts the columns are tab separated; the context names
      # refer to the separator used between items of the array column.
      context 'separated with commas (default)' do
        let(:converter_options) { { delimiter: "\t" } }

        let(:reader) do
          tsv_text = CSV.generate(col_sep: "\t") do |csv|
            csv << %w[id enabled image_links]
            csv << %w[1 true http://www.images.com/dresses.jpeg]
            csv << %w[2 false http://www.images.com/bras1.jpeg,http://www.images.com/bras2.jpeg]
          end

          StringIO.new(tsv_text)
        end

        include_examples 'a conversion without rejected rows'

        it 'should store the data with the given schema' do
          expect(AvroReader.new(writer).read).to eq(
            [
              { 'id' => 1, 'enabled' => true, 'image_links' => ['http://www.images.com/dresses.jpeg'] },
              { 'id' => 2, 'enabled' => false, 'image_links' => ['http://www.images.com/bras1.jpeg', 'http://www.images.com/bras2.jpeg'] }
            ]
          )
        end
      end

      context 'separated with semicolons' do
        let(:converter_options) { { delimiter: "\t", array_delimiter: ';' } }

        let(:reader) do
          tsv_text = CSV.generate(col_sep: "\t") do |csv|
            csv << %w[id enabled image_links]
            csv << %w[1 true http://www.images.com/dresses.jpeg]
            csv << %w[2 false http://www.images.com/bras1.jpeg;http://www.images.com/bras2.jpeg]
          end

          StringIO.new(tsv_text)
        end

        include_examples 'a conversion without rejected rows'

        it 'should store the data with the given schema' do
          expect(AvroReader.new(writer).read).to eq(
            [
              { 'id' => 1, 'enabled' => true, 'image_links' => ['http://www.images.com/dresses.jpeg'] },
              { 'id' => 2, 'enabled' => false, 'image_links' => ['http://www.images.com/bras1.jpeg', 'http://www.images.com/bras2.jpeg'] }
            ]
          )
        end
      end
    end

    context 'schema with default values' do
      let(:converter_options) { { write_defaults: true } }

      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'category', type: 'string', default: 'unknown' },
              { name: 'size_type', type: 'string', default: 'regular' },
              { name: 'enabled', type: ['boolean', 'null'], default: false }
            ]
          }.to_json
        )
      end

      # The input has no `size_type` column at all, and the second row only
      # carries an id.
      let(:reader) do
        csv_text = CSV.generate do |csv|
          csv << %w[id category enabled]
          csv << %w[1 dresses true]
          csv << %w[2]
        end

        StringIO.new(csv_text)
      end

      include_examples 'a conversion without rejected rows'

      it 'should store the defaults data' do
        expect(AvroReader.new(writer).read).to eq(
          [
            { 'id' => 1, 'category' => 'dresses', 'size_type' => 'regular', 'enabled' => true },
            { 'id' => 2, 'category' => 'unknown', 'size_type' => 'regular', 'enabled' => false }
          ]
        )
      end
    end

    context 'schema with aliased fields' do
      # The CSV header uses `color_id`, which the schema lists as an alias of
      # the `look_id` field.
      let(:reader) do
        csv_text = CSV.generate do |csv|
          csv << %w[id color_id]
          csv << %w[1 1_red]
          csv << %w[2 2_blue]
        end

        StringIO.new(csv_text)
      end

      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'look_id', type: 'string', aliases: ['color_id', 'photo_group_id'] }
            ]
          }.to_json
        )
      end

      include_examples 'a conversion without rejected rows'

      it 'should store the data with the given schema' do
        expect(AvroReader.new(writer).read).to eq(
          [
            { 'id' => 1, 'look_id' => '1_red' },
            { 'id' => 2, 'look_id' => '2_blue' }
          ]
        )
      end
    end

    context 'schema with enum column' do
      let(:converter_options) { { write_defaults: true } }

      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              {
                name: 'size_type',
                type: [
                  {
                    type: 'enum',
                    name: 'size_type_values',
                    symbols: ['regular', 'petite', 'plus', 'tall', 'big_and_tall', 'maternity']
                  },
                  'null'
                ],
                default: 'regular'
              }
            ]
          }.to_json
        )
      end

      let(:reader) do
        csv_text = CSV.generate do |csv|
          csv << %w[id size_type]
          csv << %w[1 regular]
          # The raw value contains spaces; the example below expects it to be
          # stored as the `big_and_tall` symbol.
          csv << ['2', 'big and tall']
          csv << %w[3]
        end

        StringIO.new(csv_text)
      end

      include_examples 'a conversion without rejected rows'

      it 'should store the data with the given schema' do
        expect(AvroReader.new(writer).read).to eq(
          [
            { 'id' => 1, 'size_type' => 'regular' },
            { 'id' => 2, 'size_type' => 'big_and_tall' },
            { 'id' => 3, 'size_type' => 'regular' }
          ]
        )
      end
    end

    context 'data with bad rows' do
      let(:converter_options) { { delimiter: "\t" } }

      let(:schema_io) do
        StringIO.new(
          {
            name: 'categories',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'name', type: 'string', aliases: ['title'] },
              { name: 'description', type: ['string', 'null'] }
            ]
          }.to_json
        )
      end

      # Rows 1 and 4 have no value for the required `name` field (supplied
      # here through its `title` alias).
      let(:reader) do
        tsv_text = CSV.generate(col_sep: "\t") do |csv|
          csv << %w[id title description]
          csv << ['1', nil, 'dresses']
          csv << %w[2 female-tops]
          csv << %w[3 female-bottoms]
          csv << ['4', nil, 'female-shoes']
        end

        StringIO.new(tsv_text)
      end

      it 'should have the bad data in the original form' do
        expect(bad_rows_writer.string).to eq(
          "id\ttitle\tdescription\n1\t\tdresses\n4\t\tfemale-shoes\n"
        )
      end

      it 'should have an error' do
        expect(error_writer.string).to eq(
          "line 2: Missing value at name\nline 5: Missing value at name\n"
        )
      end

      it 'should store the data with the given schema' do
        expect(AvroReader.new(writer).read).to eq(
          [
            { 'id' => 2, 'name' => 'female-tops', 'description' => nil },
            { 'id' => 3, 'name' => 'female-bottoms', 'description' => nil }
          ]
        )
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'spec_helper'
2
+
3
# Examples for the lookup tables CSV2Avro::Schema derives from an Avro schema
# supplied as JSON: per-field defaults, per-field types and alias mappings.
RSpec.describe CSV2Avro::Schema do
  # Every context builds the schema the same way and only varies `schema_io`.
  subject(:schema) { CSV2Avro::Schema.new(schema_io) }

  describe '#defaults' do
    context 'schema with default values' do
      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'category', type: 'string', default: 'unknown' },
              { name: 'enabled', type: ['boolean', 'null'], default: false }
            ]
          }.to_json
        )
      end

      # `id` declares no default and is expected to be absent from the result.
      it 'should return a hash with the field - default value pairs' do
        expect(schema.defaults).to eq({ 'category' => 'unknown', 'enabled' => false })
      end
    end
  end

  describe '#types' do
    context 'schema with different types' do
      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'category', type: 'string' },
              { name: 'reviews', type: { type: 'array', items: 'string' } },
              { name: 'enabled', type: ['boolean', 'null'] },
              {
                name: 'availability',
                type: {
                  type: 'enum',
                  name: 'availability_values',
                  symbols: ['in_stock', 'out_of_stock', 'preorder']
                },
                default: 'in_stock'
              }
            ]
          }.to_json
        )
      end

      # Complex and union types are expected to be reported by a single
      # symbol (:array, :boolean, :enum).
      it 'should return a hash with the field - type pairs' do
        expect(schema.types).to eq(
          {
            'id' => :int,
            'category' => :string,
            'reviews' => :array,
            'enabled' => :boolean,
            'availability' => :enum
          }
        )
      end
    end
  end

  describe '#aliases' do
    context 'schema with aliases' do
      let(:schema_io) do
        StringIO.new(
          {
            name: 'product',
            type: 'record',
            fields: [
              { name: 'id', type: 'int' },
              { name: 'look_id', type: 'string', aliases: ['color_id', 'photo_group_id'] }
            ]
          }.to_json
        )
      end

      # Each alias is expected as a key pointing at the field it stands for.
      it 'should return a hash with the alias - name mapping' do
        expect(schema.aliases).to eq({ 'color_id' => 'look_id', 'photo_group_id' => 'look_id' })
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# File-based examples for the top-level CSV2Avro entry point, driven by the
# fixtures in spec/support.
RSpec.describe CSV2Avro do
  describe '#convert' do
    # The conversion runs exactly once for the whole group and its stderr
    # output is captured. Previously only the first example triggered the
    # conversion, so the two file-reading examples failed when run alone or
    # in a different order.
    before(:all) do
      original_argv = ARGV.dup
      original_stderr = $stderr
      captured_stderr = StringIO.new

      begin
        # The input path is handed over through ARGV, as on the command line.
        ARGV.replace ['./spec/support/data.csv']
        $stderr = captured_stderr

        CSV2Avro.new(schema: './spec/support/schema.avsc').convert
      ensure
        # Both are process-wide globals; put them back for the other specs.
        $stderr = original_stderr
        ARGV.replace(original_argv)
      end

      @stderr_output = captured_stderr.string
    end

    it 'should write the problems to STDERR' do
      expect(@stderr_output).to eq("line 4: Missing value at name\n")
    end

    it 'should have a bad row' do
      File.open('./spec/support/data.bad.csv', 'r') do |file|
        expect(file.read).to eq("id,name,description\n3,,Bras\n")
      end
    end

    it 'should contain the avro data' do
      File.open('./spec/support/data.avro', 'r') do |file|
        expect(AvroReader.new(file).read).to eq(
          [
            { 'id' => 1, 'name' => 'dresses', 'description' => 'Dresses' },
            { 'id' => 2, 'name' => 'female-tops', 'description' => nil }
          ]
        )
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'csv2avro'
2
+ require 'csv2avro/converter'
3
+ require 'csv2avro/avro_writer'
4
+
5
+ require 'json'
6
+
7
# Require every Ruby file found under the `support` directory that sits next
# to this helper.
support_glob = File.join(File.dirname(__FILE__), 'support', '**', '*.rb')
Dir.glob(support_glob).each { |support_file| require support_file }

RSpec.configure do |config|
  # After each example group, delete whatever matches the two globs below
  # (Avro files and "bad" files in spec/support).
  config.after(:all) do
    leftovers = Dir.glob(['./spec/support/*.avro', './spec/support/*.bad*'])
    leftovers.each { |path| File.delete(path) }
  end
end
@@ -0,0 +1,22 @@
1
# Spec helper that reads all records back out of an Avro data file.
class AvroReader
  attr_reader :io

  # @param io the object handed to Avro::DataFile::Reader when reading
  # @param schema optional schema source; when given it is parsed with
  #   Avro::Schema.parse and passed to the datum reader as its second argument
  def initialize(io, schema = nil)
    @io = io
    @reader = build_datum_reader(schema)
  end

  # Iterates the data file once and collects what it yields.
  #
  # @return [Array] every record, in iteration order
  def read
    data_file = Avro::DataFile::Reader.new(io, @reader)

    [].tap do |records|
      data_file.each { |record| records << record }
    end
  end

  private

  # Builds the datum reader, with or without an explicit schema.
  def build_datum_reader(schema)
    return Avro::IO::DatumReader.new unless schema

    Avro::IO::DatumReader.new(nil, Avro::Schema.parse(schema))
  end
end
@@ -0,0 +1,4 @@
1
+ id,name,description
2
+ 1,dresses,Dresses
3
+ 2,female-tops,
4
+ 3,,Bras
@@ -0,0 +1,17 @@
1
+ {
2
+ "type": "record",
3
+ "name": "categories",
4
+ "fields": [{
5
+ "name": "id",
6
+ "type": "int"
7
+ },
8
+ {
9
+ "name": "name",
10
+ "type": "string"
11
+ },
12
+ {
13
+ "name": "description",
14
+ "type": ["string", "null"]
15
+ }
16
+ ]
17
+ }