csv2avro 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -5
- data/README.md +6 -6
- data/bin/csv2avro +9 -8
- data/lib/csv2avro/avro_writer.rb +2 -2
- data/lib/csv2avro/converter.rb +9 -14
- data/lib/csv2avro/datum_writer.rb +31 -0
- data/lib/{avro_schema.rb → csv2avro/schema_validator.rb} +19 -10
- data/lib/csv2avro/version.rb +1 -1
- data/lib/csv2avro.rb +1 -1
- data/spec/csv2avro/converter_spec.rb +3 -3
- data/spec/csv2avro_spec.rb +54 -20
- data/spec/support/data.csv +2 -0
- data/spec/support/data_quoted.csv +8 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e7e9a8d86d5cd8e85b957ffecb5e60ccfe9c8b5
|
4
|
+
data.tar.gz: 2b12a6828c601dfe19e6d93bc39b481ec1e15118
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1caef810f21aa9f9b8dd1562c253a967f5b1c94382296f677dd6703624ac51d3d13774457c76820bf52c4a88795829d28a1b46017384ab306abf9f62bb50a078
|
7
|
+
data.tar.gz: cf9f67c9316d2840f883a36082a30187ae8aebbe24db71a4100496ea833b20568b9ec16f4e92258b38705b7334f07b78b52766db78245cc81c2a32e3c6244d95
|
data/CHANGELOG.md
CHANGED
@@ -3,17 +3,27 @@
|
|
3
3
|
All notable changes to this project are documented in this file.
|
4
4
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
5
5
|
|
6
|
-
## 1.0.2 (2015-06-29)
|
6
|
+
## 1.1.0 (2015-09-16) [compare](https://github.com/sspinc/csv2avro/compare/1.0.2...1.1.0)
|
7
|
+
|
8
|
+
### Changed
|
9
|
+
* Write usage and error messages to stderr
|
10
|
+
* Exit code 1 for general errors, 2 for missing arguments
|
11
|
+
* Bad rows report with error causes instead of bad rows csv
|
12
|
+
|
13
|
+
### Fixed
|
14
|
+
* Handle quoted headers
|
15
|
+
|
16
|
+
## 1.0.2 (2015-06-29) [compare](https://github.com/sspinc/csv2avro/compare/1.0.1...1.0.2)
|
7
17
|
|
8
18
|
### Fixed
|
9
19
|
* Continue on parsing errors
|
10
20
|
|
11
|
-
## 1.0.1 (2015-06-12)
|
21
|
+
## 1.0.1 (2015-06-12) [compare](https://github.com/sspinc/csv2avro/compare/1.0.0...1.0.1)
|
12
22
|
|
13
23
|
### Fixed
|
14
24
|
* CSV parsing issues
|
15
25
|
|
16
|
-
## 1.0.0 (2015-06-05)
|
26
|
+
## 1.0.0 (2015-06-05) [compare](https://github.com/sspinc/csv2avro/compare/0.4.0...1.0.0)
|
17
27
|
|
18
28
|
### Added
|
19
29
|
* Usage description to readme
|
@@ -23,7 +33,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
23
33
|
### Fixed
|
24
34
|
* Docker image entrypoint
|
25
35
|
|
26
|
-
## 0.4.0 (2015-05-07)
|
36
|
+
## 0.4.0 (2015-05-07) [compare](https://github.com/sspinc/csv2avro/compare/0.3.0...0.4.0)
|
27
37
|
|
28
38
|
### Added
|
29
39
|
* Streaming support (#7)
|
@@ -38,7 +48,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
38
48
|
### Fixed
|
39
49
|
* Build project into Docker image (#9)
|
40
50
|
|
41
|
-
## 0.3.0 (2015-04-28)
|
51
|
+
## 0.3.0 (2015-04-28) [compare](https://github.com/sspinc/csv2avro/compare/0.1.0...0.3.0)
|
42
52
|
|
43
53
|
### Added
|
44
54
|
* Docker support (#6)
|
data/README.md
CHANGED
@@ -14,13 +14,13 @@ or if you prefer to live on the edge, just clone this repository and build it fr
|
|
14
14
|
```
|
15
15
|
$ csv2avro --schema ./spec/support/schema.avsc ./spec/support/data.csv
|
16
16
|
```
|
17
|
-
This will process the data.csv file and creates a *data.avro* file and a *data.bad
|
17
|
+
This will process the data.csv file and create a *data.avro* file and a *data.bad* file with a report of the bad rows.
|
18
18
|
|
19
|
-
You can override the bad
|
19
|
+
You can override the bad rows report file location with the `--bad-rows [BAD_ROWS]` option.
|
20
20
|
|
21
21
|
### Streaming
|
22
22
|
```
|
23
|
-
$ cat ./spec/support/data.csv | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad
|
23
|
+
$ cat ./spec/support/data.csv | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad > ./spec/support/data.avro
|
24
24
|
```
|
25
25
|
This will process the *input stream* and push the avro data to the *output stream*. If you're working with streams you will need to specify the `--bad-rows` location.
|
26
26
|
|
@@ -29,7 +29,7 @@ This will process the *input stream* and push the avro data to the *output strea
|
|
29
29
|
#### AWS S3 storage
|
30
30
|
|
31
31
|
```
|
32
|
-
aws s3 cp s3://csv-bucket/transactions.csv - | csv2avro --schema ./transactions.avsc --bad-rows ./transactions.bad
|
32
|
+
aws s3 cp s3://csv-bucket/transactions.csv - | csv2avro --schema ./transactions.avsc --bad-rows ./transactions.bad | aws s3 cp - s3://avro-bucket/transactions.avro
|
33
33
|
```
|
34
34
|
|
35
35
|
This will stream your file stored in AWS S3, convert the data and push it back to S3. For more information, please check the [AWS CLI documentation](http://docs.aws.amazon.com/cli/latest/reference/s3/index.html).
|
@@ -37,7 +37,7 @@ This will stream your file stored in AWS S3, converts the data and pushes it bac
|
|
37
37
|
#### Convert compressed files
|
38
38
|
|
39
39
|
```
|
40
|
-
gunzip -c ./spec/support/data.csv.gz | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad
|
40
|
+
gunzip -c ./spec/support/data.csv.gz | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad > ./spec/support/data.avro
|
41
41
|
```
|
42
42
|
|
43
43
|
This will uncompress the file and convert it to avro, leaving the original file intact.
|
@@ -50,7 +50,7 @@ $ csv2avro --help
|
|
50
50
|
Version 1.0.1 of CSV2Avro
|
51
51
|
Usage: csv2avro [options] [file]
|
52
52
|
-s, --schema SCHEMA A file containing the Avro schema. This value is required.
|
53
|
-
-b, --bad-rows [BAD_ROWS] The output location of the bad rows file.
|
53
|
+
-b, --bad-rows [BAD_ROWS] The output location of the bad rows report file.
|
54
54
|
-d, --delimiter [DELIMITER] Field delimiter. If none specified, then comma is used as the delimiter.
|
55
55
|
-a [ARRAY_DELIMITER], Array field delimiter. If none specified, then comma is used as the delimiter.
|
56
56
|
--array-delimiter
|
data/bin/csv2avro
CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
|
|
14
14
|
options[:schema] = path
|
15
15
|
end
|
16
16
|
|
17
|
-
opts.on('-b', '--bad-rows [BAD_ROWS]', 'The output location of the bad rows file.') do |path|
|
17
|
+
opts.on('-b', '--bad-rows [BAD_ROWS]', 'The output location of the bad rows report file.') do |path|
|
18
18
|
options[:bad_rows] = path
|
19
19
|
end
|
20
20
|
|
@@ -35,7 +35,7 @@ option_parser = OptionParser.new do |opts|
|
|
35
35
|
end
|
36
36
|
|
37
37
|
opts.on('-h', '--help', 'Prints help') do
|
38
|
-
puts opts
|
38
|
+
$stderr.puts opts
|
39
39
|
exit
|
40
40
|
end
|
41
41
|
end
|
@@ -48,11 +48,12 @@ begin
|
|
48
48
|
|
49
49
|
CSV2Avro.new(options).convert
|
50
50
|
rescue OptionParser::MissingArgument => ex
|
51
|
-
puts ex.message
|
52
|
-
|
53
|
-
|
51
|
+
$stderr.puts ex.message
|
52
|
+
$stderr.puts option_parser
|
53
|
+
exit 2
|
54
54
|
rescue Exception => e
|
55
|
-
puts 'Uh oh, something went wrong!'
|
56
|
-
puts e.message
|
57
|
-
puts e.backtrace.join("\n")
|
55
|
+
$stderr.puts 'Uh oh, something went wrong!'
|
56
|
+
$stderr.puts e.message
|
57
|
+
$stderr.puts e.backtrace.join("\n")
|
58
|
+
exit 1
|
58
59
|
end
|
data/lib/csv2avro/avro_writer.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'avro'
|
2
|
-
require 'avro_schema'
|
3
2
|
require 'forwardable'
|
3
|
+
require 'csv2avro/datum_writer'
|
4
4
|
|
5
5
|
class CSV2Avro
|
6
6
|
class AvroWriter
|
@@ -12,7 +12,7 @@ class CSV2Avro
|
|
12
12
|
def_delegators :avro_writer, :flush, :close
|
13
13
|
|
14
14
|
def initialize(writer, schema)
|
15
|
-
datum_writer =
|
15
|
+
datum_writer = CSV2Avro::DatumWriter.new(schema.avro_schema)
|
16
16
|
@avro_writer = Avro::DataFile::Writer.new(writer, datum_writer, schema.avro_schema)
|
17
17
|
end
|
18
18
|
|
data/lib/csv2avro/converter.rb
CHANGED
@@ -13,7 +13,7 @@ class CSV2Avro
|
|
13
13
|
@schema = schema
|
14
14
|
|
15
15
|
# read header row explicitly
|
16
|
-
@header = @reader.readline.strip.split(col_sep)
|
16
|
+
@header = @reader.readline.strip.split(col_sep).map{ |col| col.gsub('"','') }
|
17
17
|
end
|
18
18
|
|
19
19
|
def convert
|
@@ -21,7 +21,9 @@ class CSV2Avro
|
|
21
21
|
begin
|
22
22
|
row = csv.shift
|
23
23
|
rescue CSV::MalformedCSVError
|
24
|
-
|
24
|
+
error_msg = "L#{row_number}: Unable to parse"
|
25
|
+
@error_writer.puts(error_msg)
|
26
|
+
@bad_rows_writer.puts(error_msg)
|
25
27
|
next
|
26
28
|
end
|
27
29
|
hash = row.to_hash
|
@@ -31,12 +33,10 @@ class CSV2Avro
|
|
31
33
|
|
32
34
|
begin
|
33
35
|
@writer.write(hash)
|
34
|
-
rescue
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
@error_writer.puts("line #{line_number}: #{Avro::Schema.errors.shift}")
|
39
|
-
end
|
36
|
+
rescue CSV2Avro::SchemaValidationError => e
|
37
|
+
error_msg = "L#{row_number}: #{e.errors.join(', ')}"
|
38
|
+
@error_writer.puts(error_msg)
|
39
|
+
@bad_rows_writer.puts(error_msg)
|
40
40
|
end
|
41
41
|
end
|
42
42
|
@writer.flush
|
@@ -71,12 +71,7 @@ class CSV2Avro
|
|
71
71
|
@csv ||= CSV.new(@reader, csv_options)
|
72
72
|
end
|
73
73
|
|
74
|
-
def
|
75
|
-
options = csv_options.tap { |hash| hash.delete(:header_converters) }
|
76
|
-
@bad_rows_csv ||= CSV.new(@bad_rows_writer, options)
|
77
|
-
end
|
78
|
-
|
79
|
-
def line_number
|
74
|
+
def row_number
|
80
75
|
@reader.lineno + 1
|
81
76
|
end
|
82
77
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'avro'
|
2
|
+
require 'csv2avro/schema_validator'
|
3
|
+
|
4
|
+
class CSV2Avro
|
5
|
+
class DatumWriter < Avro::IO::DatumWriter
|
6
|
+
|
7
|
+
attr_reader :schema_validator
|
8
|
+
|
9
|
+
def initialize(*args)
|
10
|
+
super
|
11
|
+
@schema_validator = CSV2Avro::SchemaValidator.new
|
12
|
+
end
|
13
|
+
|
14
|
+
def write(datum, encoder)
|
15
|
+
schema_validator.clear
|
16
|
+
if !schema_validator.validate(writers_schema, datum)
|
17
|
+
raise SchemaValidationError.new(schema_validator.errors)
|
18
|
+
end
|
19
|
+
super
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
class SchemaValidationError < StandardError
|
24
|
+
|
25
|
+
attr_reader :errors
|
26
|
+
|
27
|
+
def initialize(schema_errors)
|
28
|
+
@errors = schema_errors
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -1,12 +1,19 @@
|
|
1
|
-
|
2
|
-
class Schema
|
3
|
-
@errors = []
|
1
|
+
require 'avro/schema'
|
4
2
|
|
5
|
-
|
6
|
-
|
3
|
+
class CSV2Avro
|
4
|
+
class SchemaValidator
|
5
|
+
|
6
|
+
attr_reader :errors
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@errors = []
|
10
|
+
end
|
11
|
+
|
12
|
+
def clear
|
13
|
+
@errors.clear
|
7
14
|
end
|
8
15
|
|
9
|
-
def
|
16
|
+
def validate(expected_schema, datum, name=nil, suppress_error=false)
|
10
17
|
expected_type = expected_schema.type_sym
|
11
18
|
|
12
19
|
valid = case expected_type
|
@@ -18,10 +25,10 @@ module Avro
|
|
18
25
|
datum.is_a? String
|
19
26
|
when :int
|
20
27
|
(datum.is_a?(Fixnum) || datum.is_a?(Bignum)) &&
|
21
|
-
(INT_MIN_VALUE <= datum) && (datum <= INT_MAX_VALUE)
|
28
|
+
(Avro::Schema::INT_MIN_VALUE <= datum) && (datum <= Avro::Schema::INT_MAX_VALUE)
|
22
29
|
when :long
|
23
30
|
(datum.is_a?(Fixnum) || datum.is_a?(Bignum)) &&
|
24
|
-
(LONG_MIN_VALUE <= datum) && (datum <= LONG_MAX_VALUE)
|
31
|
+
(Avro::Schema::LONG_MIN_VALUE <= datum) && (datum <= Avro::Schema::LONG_MAX_VALUE)
|
25
32
|
when :float, :double
|
26
33
|
datum.is_a?(Float) || datum.is_a?(Fixnum) || datum.is_a?(Bignum)
|
27
34
|
when :fixed
|
@@ -38,12 +45,14 @@ module Avro
|
|
38
45
|
expected_schema.schemas.any?{|s| validate(s, datum, nil, true) }
|
39
46
|
when :record, :error, :request
|
40
47
|
datum.is_a?(Hash) &&
|
41
|
-
expected_schema.fields.
|
48
|
+
expected_schema.fields.reduce(true){|result, f|
|
49
|
+
validate_result = validate(f.type, datum[f.name], f.name)
|
50
|
+
result && validate_result }
|
42
51
|
else
|
43
52
|
false
|
44
53
|
end
|
45
54
|
|
46
|
-
if !
|
55
|
+
if !valid && name
|
47
56
|
if datum.nil? && expected_type != :null
|
48
57
|
@errors << "Missing value at #{name}"
|
49
58
|
else
|
data/lib/csv2avro/version.rb
CHANGED
data/lib/csv2avro.rb
CHANGED
@@ -351,15 +351,15 @@ RSpec.describe CSV2Avro::Converter do
|
|
351
351
|
CSV2Avro::Converter.new(reader, avro_writer, bad_rows_writer, error_writer, { delimiter: "\t" }, schema: schema).convert
|
352
352
|
end
|
353
353
|
|
354
|
-
it 'should
|
354
|
+
it 'should report the bad rows correctly' do
|
355
355
|
expect(bad_rows_writer.string).to eq(
|
356
|
-
"
|
356
|
+
"L2: Missing value at name\nL5: Missing value at name\n"
|
357
357
|
)
|
358
358
|
end
|
359
359
|
|
360
360
|
it 'should have an error' do
|
361
361
|
expect(error_writer.string).to eq(
|
362
|
-
"
|
362
|
+
"L2: Missing value at name\nL5: Missing value at name\n"
|
363
363
|
)
|
364
364
|
end
|
365
365
|
|
data/spec/csv2avro_spec.rb
CHANGED
@@ -3,32 +3,66 @@ require 'spec_helper'
|
|
3
3
|
RSpec.describe CSV2Avro do
|
4
4
|
describe '#convert' do
|
5
5
|
let(:options) { { schema: './spec/support/schema.avsc' } }
|
6
|
-
|
7
|
-
|
8
|
-
ARGV.replace ['./spec/support/data.csv']
|
6
|
+
subject(:converter) do
|
7
|
+
CSV2Avro.new(options)
|
9
8
|
end
|
10
|
-
subject(:converter) { CSV2Avro.new(options) }
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
context "Unquoted header" do
|
11
|
+
before do
|
12
|
+
ARGV.replace ['./spec/support/data.csv']
|
13
|
+
end
|
14
|
+
|
15
|
+
bad_rows_output = "L4: Missing value at name\nL7: Unable to parse\nL9: Missing value at id, Missing value at name\nL10: 'male-shoes' at id doesn't match the type '\"int\"', Missing value at name\n"
|
16
|
+
it 'should write errors to STDERR' do
|
17
|
+
expect { converter.convert }.to output(bad_rows_output).to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should have bad rows' do
|
21
|
+
File.open('./spec/support/data.bad', 'r') do |file|
|
22
|
+
expect(file.read).to eq(bad_rows_output)
|
23
|
+
end
|
24
|
+
end
|
15
25
|
|
16
|
-
|
17
|
-
|
18
|
-
|
26
|
+
it 'should contain the avro data' do
|
27
|
+
File.open('./spec/support/data.avro', 'r') do |file|
|
28
|
+
expect(AvroReader.new(file).read).to eq(
|
29
|
+
[
|
30
|
+
{ 'id'=>1, 'name'=>'dresses', 'description'=>'Dresses' },
|
31
|
+
{ 'id'=>2, 'name'=>'female-tops', 'description'=>nil },
|
32
|
+
{ 'id'=>4, 'name'=>'male-tops', 'description'=>"Male Tops\nand Male Shirts"},
|
33
|
+
{ 'id'=>6, 'name'=>'male-shoes', 'description'=>'Male Shoes'}
|
34
|
+
]
|
35
|
+
)
|
36
|
+
end
|
19
37
|
end
|
20
38
|
end
|
21
39
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
40
|
+
context "Quoted header" do
|
41
|
+
before do
|
42
|
+
ARGV.replace ['./spec/support/data_quoted.csv']
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should write errors to STDERR' do
|
46
|
+
expect { converter.convert }.to output("L4: Missing value at name\nL7: Unable to parse\n").to_stderr
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'should have a bad row' do
|
50
|
+
File.open('./spec/support/data_quoted.bad', 'r') do |file|
|
51
|
+
expect(file.read).to eq("L4: Missing value at name\nL7: Unable to parse\n")
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should contain the avro data' do
|
56
|
+
File.open('./spec/support/data_quoted.avro', 'r') do |file|
|
57
|
+
expect(AvroReader.new(file).read).to eq(
|
58
|
+
[
|
59
|
+
{ 'id'=>1, 'name'=>'dresses', 'description'=>'Dresses' },
|
60
|
+
{ 'id'=>2, 'name'=>'female-tops', 'description'=>nil },
|
61
|
+
{ 'id'=>4, 'name'=>'male-tops', 'description'=>"Male Tops\nand Male Shirts"},
|
62
|
+
{ 'id'=>6, 'name'=>'male-shoes', 'description'=>'Male Shoes'}
|
63
|
+
]
|
64
|
+
)
|
65
|
+
end
|
32
66
|
end
|
33
67
|
end
|
34
68
|
end
|
data/spec/support/data.csv
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv2avro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Ableda
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-
|
12
|
+
date: 2015-09-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -113,11 +113,12 @@ files:
|
|
113
113
|
- Rakefile
|
114
114
|
- bin/csv2avro
|
115
115
|
- csv2avro.gemspec
|
116
|
-
- lib/avro_schema.rb
|
117
116
|
- lib/csv2avro.rb
|
118
117
|
- lib/csv2avro/avro_writer.rb
|
119
118
|
- lib/csv2avro/converter.rb
|
119
|
+
- lib/csv2avro/datum_writer.rb
|
120
120
|
- lib/csv2avro/schema.rb
|
121
|
+
- lib/csv2avro/schema_validator.rb
|
121
122
|
- lib/csv2avro/version.rb
|
122
123
|
- spec/csv2avro/converter_spec.rb
|
123
124
|
- spec/csv2avro/schema_spec.rb
|
@@ -125,6 +126,7 @@ files:
|
|
125
126
|
- spec/spec_helper.rb
|
126
127
|
- spec/support/avro_reader.rb
|
127
128
|
- spec/support/data.csv
|
129
|
+
- spec/support/data_quoted.csv
|
128
130
|
- spec/support/schema.avsc
|
129
131
|
homepage: ''
|
130
132
|
licenses:
|
@@ -157,5 +159,6 @@ test_files:
|
|
157
159
|
- spec/spec_helper.rb
|
158
160
|
- spec/support/avro_reader.rb
|
159
161
|
- spec/support/data.csv
|
162
|
+
- spec/support/data_quoted.csv
|
160
163
|
- spec/support/schema.avsc
|
161
164
|
has_rdoc:
|