csv2avro 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.dockerignore +1 -0
- data/.gitignore +15 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +48 -0
- data/Dockerfile +23 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +80 -0
- data/Rakefile +41 -0
- data/bin/csv2avro +58 -0
- data/csv2avro.gemspec +28 -0
- data/lib/avro_schema.rb +57 -0
- data/lib/csv2avro/avro_writer.rb +27 -0
- data/lib/csv2avro/converter.rb +125 -0
- data/lib/csv2avro/schema.rb +44 -0
- data/lib/csv2avro/version.rb +3 -0
- data/lib/csv2avro.rb +78 -0
- data/spec/csv2avro/converter_spec.rb +434 -0
- data/spec/csv2avro/schema_spec.rb +85 -0
- data/spec/csv2avro_spec.rb +38 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/support/avro_reader.rb +22 -0
- data/spec/support/data.csv +4 -0
- data/spec/support/schema.avsc +17 -0
- metadata +161 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6eae97d5b2bf7476331128770ffee4d3b6d69d7a
|
4
|
+
data.tar.gz: 554e64338b5950de37ccaad44927176f6922f94c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3a70f269a7337d6dad0bd24528e9092b897217254610a8257db0fecc72d55dada505c68040b3a1309b3e8266473257f118a88231a54b5627c74ffb63c998d49c
|
7
|
+
data.tar.gz: 01cc32197d34410522aed53d4682aa9c91b20a63ad9e09a80831cc7e6af6d5bfd1a972488bb7b52f263451222a0363f10e5ab8a07e09ab38a5154b46496ff93e
|
data/.dockerignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
.git
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project are documented in this file.
|
4
|
+
This project adheres to [Semantic Versioning](http://semver.org/).
|
5
|
+
|
6
|
+
## 1.0.0 (2015-06-05; [compare](https://github.com/sspinc/csv2avro/compare/0.4.0...1.0.0))
|
7
|
+
|
8
|
+
### Added
|
9
|
+
* Usage description to readme
|
10
|
+
* Detailed exception reporting
|
11
|
+
* `aws-cli` to Docker image
|
12
|
+
|
13
|
+
### Fixed
|
14
|
+
* Docker image entrypoint
|
15
|
+
|
16
|
+
## 0.4.0 (2015-05-07; [compare](https://github.com/sspinc/csv2avro/compare/0.3.0...0.4.0))
|
17
|
+
|
18
|
+
### Added
|
19
|
+
* Streaming support (#7)
|
20
|
+
* `rake docker:spec` task
|
21
|
+
|
22
|
+
### Removed
|
23
|
+
* S3 support (#7)
|
24
|
+
|
25
|
+
### Changed
|
26
|
+
* Do not include .git in Docker build context
|
27
|
+
|
28
|
+
### Fixed
|
29
|
+
* Build project into Docker image (#9)
|
30
|
+
|
31
|
+
## 0.3.0 (2015-04-28; [compare](https://github.com/sspinc/csv2avro/compare/0.1.0...0.3.0))
|
32
|
+
|
33
|
+
### Added
|
34
|
+
* Docker support (#6)
|
35
|
+
* `rake docker:build` task
|
36
|
+
* `rake docker:push` task to push to Docker Hub
|
37
|
+
* Semantic Docker tags
|
38
|
+
* CHANGELOG.md
|
39
|
+
|
40
|
+
## 0.1.0 (2015-04-07)
|
41
|
+
Initial release
|
42
|
+
|
43
|
+
### Added
|
44
|
+
* CLI (`csv2avro convert`) to convert CSV files to Avro (#1)
|
45
|
+
* Travis CI (#2)
|
46
|
+
* Bad rows (#4)
|
47
|
+
* Versioning (#5)
|
48
|
+
* Gem packaging
|
data/Dockerfile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
FROM ruby:2.1
|
2
|
+
MAINTAINER Secret Sauce Partners, Inc. <dev@sspinc.io>
|
3
|
+
|
4
|
+
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
5
|
+
python2.7 get-pip.py && \
|
6
|
+
pip install awscli
|
7
|
+
|
8
|
+
# throw errors if Gemfile has been modified since Gemfile.lock
|
9
|
+
RUN bundle config --global frozen 1
|
10
|
+
|
11
|
+
RUN mkdir -p /srv/csv2avro
|
12
|
+
WORKDIR /srv/csv2avro
|
13
|
+
|
14
|
+
RUN mkdir -p /srv/csv2avro/lib/csv2avro
|
15
|
+
|
16
|
+
COPY lib/csv2avro/version.rb /srv/csv2avro/lib/csv2avro/version.rb
|
17
|
+
COPY csv2avro.gemspec Gemfile Gemfile.lock /srv/csv2avro/
|
18
|
+
|
19
|
+
RUN bundle install
|
20
|
+
|
21
|
+
COPY . /srv/csv2avro
|
22
|
+
|
23
|
+
ENTRYPOINT ["./bin/csv2avro"]
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Secret Sauce Partners, Inc.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# CSV2Avro
|
2
|
+
|
3
|
+
Convert CSV files to Avro like a boss.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
$ gem install csv2avro
|
8
|
+
|
9
|
+
or if you prefer to live on the edge, just clone this repository and build it from scratch.
|
10
|
+
|
11
|
+
You can run the converter within a **Docker** container, you just need to pull the `sspinc/csv2avro` image.
|
12
|
+
|
13
|
+
```
|
14
|
+
$ docker pull sspinc/csv2avro
|
15
|
+
```
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
### Basic
|
20
|
+
```
|
21
|
+
$ csv2avro --schema ./spec/support/schema.avsc ./spec/support/data.csv
|
22
|
+
```
|
23
|
+
This will process the data.csv file and create a *data.avro* file and a *data.bad.csv* file containing the bad rows.
|
24
|
+
|
25
|
+
You can override the bad-rows file location with the `--bad-rows [BAD_ROWS]` option.
|
26
|
+
|
27
|
+
### CSV2Avro in Docker
|
28
|
+
|
29
|
+
```
|
30
|
+
$ docker run sspinc/csv2avro --help
|
31
|
+
```
|
32
|
+
|
33
|
+
### Streaming
|
34
|
+
```
|
35
|
+
$ cat ./spec/support/data.csv | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad.csv > ./spec/support/data.avro
|
36
|
+
```
|
37
|
+
This will process the *input stream* and push the avro data to the *output stream*. If you're working with streams you will need to specify the `--bad-rows` location.
|
38
|
+
|
39
|
+
### Advanced features
|
40
|
+
|
41
|
+
#### AWS S3 storage
|
42
|
+
|
43
|
+
```
|
44
|
+
aws s3 cp s3://csv-bucket/transactions.csv - | csv2avro --schema ./transactions.avsc --bad-rows ./transactions.bad.csv | aws s3 cp - s3://avro-bucket/transactions.avro
|
45
|
+
```
|
46
|
+
|
47
|
+
This will stream your file stored in AWS S3, convert the data and push it back to S3. For more information, please check the [AWS CLI documentation](http://docs.aws.amazon.com/cli/latest/reference/s3/index.html).
|
48
|
+
|
49
|
+
#### Convert compressed files
|
50
|
+
|
51
|
+
```
|
52
|
+
gunzip -c ./spec/support/data.csv.gz | csv2avro --schema ./spec/support/schema.avsc --bad-rows ./spec/support/data.bad.csv > ./spec/support/data.avro
|
53
|
+
```
|
54
|
+
|
55
|
+
This will uncompress the file and convert it to Avro, leaving the original file intact.
|
56
|
+
|
57
|
+
### More
|
58
|
+
|
59
|
+
For a full list of available options, run `csv2avro --help`
|
60
|
+
```
|
61
|
+
$ csv2avro --help
|
62
|
+
Version 1.0.0 of CSV2Avro
|
63
|
+
Usage: csv2avro [options] [file]
|
64
|
+
-s, --schema SCHEMA A file containing the Avro schema. This value is required.
|
65
|
+
-b, --bad-rows [BAD_ROWS] The output location of the bad rows file.
|
66
|
+
-d, --delimiter [DELIMITER] Field delimiter. If none specified, then comma is used as the delimiter.
|
67
|
+
-a [ARRAY_DELIMITER], Array field delimiter. If none specified, then comma is used as the delimiter.
|
68
|
+
--array-delimiter
|
69
|
+
-D, --write-defaults Write default values.
|
70
|
+
-c, --stdout Output will go to the standard output stream, leaving files intact.
|
71
|
+
-h, --help Prints help
|
72
|
+
```
|
73
|
+
|
74
|
+
## Contributing
|
75
|
+
|
76
|
+
1. Fork it ( https://github.com/sspinc/csv2avro/fork )
|
77
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
78
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
79
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
80
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'rspec/core/rake_task'
require 'bundler/gem_tasks'
require 'bump/tasks'

# The gem uses plain semantic version bumps only, so drop the
# pre-release and arbitrary-set tasks that bump/tasks registers.
%w[bump:pre bump:set].each do |unwanted|
  Rake.application.instance_variable_get(:@tasks).delete(unwanted)
end

# `rake spec` runs the suite under ./spec with colored doc output.
RSpec::Core::RakeTask.new(:spec) do |rspec_task|
  rspec_task.rspec_opts = ['--color', '--format', 'documentation']
end

task default: :spec

namespace :docker do
  desc "Build docker image"
  task :build do
    full_version = CSV2Avro::VERSION
    sh "docker build -t sspinc/csv2avro:#{full_version} ."

    # Re-tag the freshly built image with the minor (x.y), major (x)
    # and latest tags so all of them point at the same image.
    minor_version = full_version.sub(/\.[0-9]+$/, '')
    major_version = minor_version.sub(/\.[0-9]+$/, '')

    [minor_version, major_version, 'latest'].each do |tag|
      sh "docker tag -f sspinc/csv2avro:#{full_version} sspinc/csv2avro:#{tag}"
    end
  end

  desc "Run specs inside docker image"
  task :spec => :build do
    sh "docker run -t --entrypoint=rake sspinc/csv2avro:#{CSV2Avro::VERSION} spec"
  end

  desc "Push docker image"
  task :push => :spec do
    sh "docker push sspinc/csv2avro"
  end
end
|
data/bin/csv2avro
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby

# CLI entry point for CSV2Avro: converts a CSV file (or STDIN) to Avro
# using a user supplied Avro schema. See README.md for usage examples.

$LOAD_PATH << File.dirname(__FILE__) + '/../lib' if $0 == __FILE__
require 'optparse'
require 'csv2avro'

options = {}

option_parser = OptionParser.new do |opts|
  opts.banner = "Version #{CSV2Avro::VERSION} of CSV2Avro\n" \
                "Usage: #{File.basename(__FILE__)} [options] [file]"

  opts.on('-s', '--schema SCHEMA', 'A file containing the Avro schema. This value is required.') do |path|
    options[:schema] = path
  end

  opts.on('-b', '--bad-rows [BAD_ROWS]', 'The output location of the bad rows file.') do |path|
    options[:bad_rows] = path
  end

  opts.on('-d', '--delimiter [DELIMITER]', 'Field delimiter. If none specified, then comma is used as the delimiter.') do |char|
    # Allow a literal "\t" typed on the command line to mean a tab character.
    options[:delimiter] = char.gsub("\\t", "\t")
  end

  opts.on('-a', '--array-delimiter [ARRAY_DELIMITER]', 'Array field delimiter. If none specified, then comma is used as the delimiter.') do |char|
    options[:array_delimiter] = char
  end

  opts.on('-D', '--write-defaults', 'Write default values.') do
    options[:write_defaults] = true
  end

  opts.on('-c', '--stdout', 'Output will go to the standard output stream, leaving files intact.') do
    options[:stdout] = true
  end

  opts.on('-h', '--help', 'Prints help') do
    puts opts
    exit
  end
end

option_parser.parse!

begin
  # --schema is always required; --bad-rows becomes required when reading
  # from STDIN, since there is no input path to derive a default from.
  raise OptionParser::MissingArgument.new('--schema') if options[:schema].nil?
  raise OptionParser::MissingArgument.new('--bad-rows') if options[:bad_rows].nil? && ARGV.empty?

  CSV2Avro.new(options).convert
rescue OptionParser::MissingArgument => ex
  puts ex.message

  puts option_parser
rescue StandardError => e
  # Rescue StandardError instead of Exception so that signals (Ctrl-C),
  # SystemExit and out-of-memory errors still terminate the process.
  puts 'Uh oh, something went wrong!'
  puts e.message
  puts e.backtrace.join("\n")
end
|
data/csv2avro.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'csv2avro/version'

Gem::Specification.new do |spec|
  spec.name          = "csv2avro"
  spec.version       = CSV2Avro::VERSION
  spec.authors       = ["Peter Ableda"]
  spec.email         = ["scotty@secretsaucepartners.com"]
  spec.summary       = %q{Convert CSV files to Avro}
  spec.description   = %q{Convert CSV files to Avro like a boss.}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Everything tracked by git ships in the gem; bin/* become executables
  # and test/spec/features files are marked as test files.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  {
    "bundler" => "~> 1.6",
    "rake"    => "~> 10.0",
    "rspec"   => "~> 3.2",
    "pry"     => "~> 0.10",
    "bump"    => "~> 0.5"
  }.each do |dependency, version|
    spec.add_development_dependency dependency, version
  end

  spec.add_dependency "avro", "~> 1.7"
end
|
data/lib/avro_schema.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Monkey-patch of the avro gem's Avro::Schema.validate that, in addition
# to returning true/false, records a human readable reason for every
# failed field in Avro::Schema.errors so the converter can report them
# per input line.
module Avro
  class Schema
    # Collected validation error messages; drained by the converter
    # (see CSV2Avro::Converter#convert).
    @errors = []

    class << self
      attr_accessor :errors
    end

    # Returns true when +datum+ conforms to +expected_schema+.
    # +name+ is the field name used in error messages (nil suppresses
    # recording); +suppress_error+ is set when probing union members,
    # where individual mismatches are expected and must not be recorded.
    def self.validate(expected_schema, datum, name=nil, suppress_error=false)
      expected_type = expected_schema.type_sym

      valid = case expected_type
      when :null
        datum.nil?
      when :boolean
        datum == true || datum == false
      when :string, :bytes
        datum.is_a? String
      when :int
        # Integer covers both Fixnum and Bignum (which were removed in
        # Ruby 2.4), so this works on old and new Rubies alike.
        datum.is_a?(Integer) &&
            (INT_MIN_VALUE <= datum) && (datum <= INT_MAX_VALUE)
      when :long
        datum.is_a?(Integer) &&
            (LONG_MIN_VALUE <= datum) && (datum <= LONG_MAX_VALUE)
      when :float, :double
        datum.is_a?(Float) || datum.is_a?(Integer)
      when :fixed
        datum.is_a?(String) && datum.size == expected_schema.size
      when :enum
        expected_schema.symbols.include? datum
      when :array
        datum.is_a?(Array) &&
            datum.all?{|d| validate(expected_schema.items, d) }
      when :map
        datum.keys.all?{|k| k.is_a? String } &&
            datum.values.all?{|v| validate(expected_schema.values, v) }
      when :union
        expected_schema.schemas.any?{|s| validate(s, datum, nil, true) }
      when :record, :error, :request
        datum.is_a?(Hash) &&
            expected_schema.fields.all?{|f| validate(f.type, datum[f.name], f.name) }
      else
        false
      end

      if !suppress_error && !valid && name
        if datum.nil? && expected_type != :null
          @errors << "Missing value at #{name}"
        else
          # Fixed message typo: was "does'n match".
          @errors << "'#{datum}' at #{name} doesn't match the type '#{expected_schema.to_s}'"
        end
      end

      valid
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'avro'
require 'avro_schema'
require 'forwardable'

class CSV2Avro
  # Thin wrapper around Avro::DataFile::Writer exposing the small
  # IO-ish surface (seek/read/eof?/flush/close) the converter relies on.
  class AvroWriter
    extend Forwardable

    attr_reader :avro_writer

    # Reading back goes straight to the underlying IO object...
    def_delegators :'avro_writer.writer', :seek, :read, :eof?
    # ...while flushing and closing go through the Avro writer itself.
    def_delegators :avro_writer, :flush, :close

    # writer - the destination IO; schema - a CSV2Avro::Schema instance.
    def initialize(writer, schema)
      avro = schema.avro_schema
      @avro_writer = Avro::DataFile::Writer.new(writer, Avro::IO::DatumWriter.new(avro), avro)
    end

    # The Avro schema actually used when writing records.
    def writer_schema
      avro_writer.datum_writer.writers_schema
    end

    # Appends one record (a Hash keyed by field name) to the container file.
    def write(hash)
      avro_writer << hash
    end
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require 'csv2avro/schema'
require 'csv2avro/avro_writer'
require 'csv'

class CSV2Avro
  # Streams CSV lines from +reader+, coerces every row according to the
  # Avro schema and appends it through +writer+. Rows rejected by the
  # Avro writer are copied verbatim to +bad_rows_writer+, and the
  # validation errors collected by the Avro::Schema patch are reported
  # on +error_writer+ with the offending line number.
  class Converter
    attr_reader :writer, :bad_rows_writer, :error_writer, :schema, :reader, :csv_options, :converter_options, :header_row, :column_separator

    # options - :delimiter, :array_delimiter, :write_defaults (see CLI).
    # schema: - a CSV2Avro::Schema; now a *required* keyword argument.
    #   (It was declared `schema: schema`, a circular argument reference
    #   that silently defaulted to nil and is rejected as a syntax error
    #   by modern Rubies. All callers already pass it explicitly.)
    def initialize(reader, writer, bad_rows_writer, error_writer, options, schema:)
      @writer = writer
      @bad_rows_writer = bad_rows_writer
      @error_writer = error_writer
      @schema = schema

      @column_separator = options[:delimiter] || ','

      @reader = reader
      # Consume the header up front so each data line can be parsed on
      # its own (streaming) against the same headers.
      @header_row = reader.readline.strip
      header = header_row.split(column_separator)

      init_header_converter
      @csv_options = {
        headers: header,
        skip_blanks: true,
        col_sep: column_separator,
        header_converters: :aliases
      }

      @converter_options = options
    end

    def convert
      defaults = schema.defaults if converter_options[:write_defaults]

      # String fields need no coercion; everything else is handled below.
      fields_to_convert = schema.types.reject{ |key, value| value == :string }

      reader.each do |line|
        CSV.parse(line, csv_options) do |row|
          row = row.to_hash

          if converter_options[:write_defaults]
            add_defaults_to_row!(row, defaults)
          end

          convert_fields!(row, fields_to_convert)

          begin
            writer.write(row)
            writer.flush
          rescue
            # First bad row: start the bad-rows file with the header line.
            if bad_rows_writer.size == 0
              bad_rows_writer << header_row + "\n"
            end

            bad_rows_writer << line
            bad_rows_writer.flush

            # Drain the errors recorded by the Avro::Schema monkey-patch.
            until Avro::Schema.errors.empty? do
              error_writer << "line #{reader.lineno}: #{Avro::Schema.errors.shift}\n"
            end
          end
        end
      end
    end

    private

    # Coerces raw string cells to the schema's types; when a coercion
    # fails the original value is kept so Avro validation reports it.
    def convert_fields!(row, fields_to_convert)
      fields_to_convert.each do |key, value|
        row[key] = begin
          case value
          when :int
            Integer(row[key])
          when :float, :double
            Float(row[key])
          when :boolean
            parse_boolean(row[key])
          when :array
            parse_array(row[key])
          when :enum
            # Enum symbols are normalized: lowercase, spaces to underscores.
            row[key].downcase.tr(" ", "_")
          end
        rescue
          row[key]
        end
      end

      row
    end

    # Accepts common truthy/falsy spellings; nil for anything else.
    def parse_boolean(value)
      return true if value == true || value =~ (/^(true|t|yes|y|1)$/i)
      return false if value == false || value =~ (/^(false|f|no|n|0)$/i)
      nil
    end

    def parse_array(value)
      delimiter = converter_options[:array_delimiter] || ','

      value.split(delimiter) if value
    end

    def add_defaults_to_row!(row, defaults)
      # Add default values to nil cells
      row.each do |key, value|
        row[key] = defaults[key] if value.nil?
      end

      # Add default values to missing columns
      defaults.each do |key, value|
        row[key] = defaults[key] unless row.has_key?(key)
      end

      row
    end

    # Registers a CSV header converter mapping schema aliases to their
    # canonical field names.
    def init_header_converter
      aliases = schema.aliases

      CSV::HeaderConverters[:aliases] = lambda do |header|
        aliases[header] || header
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'json'

class CSV2Avro
  # Wraps a parsed Avro schema and exposes the per-field metadata the
  # converter needs: default values, primary types and header aliases.
  class Schema
    attr_reader :avro_schema, :schema_string

    # schema - an IO yielding the Avro schema JSON.
    def initialize(schema)
      @schema_string = schema.read
      @avro_schema = Avro::Schema.parse(schema_string)
    end

    # Field name => default value, for fields that declare a default.
    def defaults
      avro_schema.fields.each_with_object({}) do |field, acc|
        acc[field.name] = field.default unless field.default.nil?
      end
    end

    # Field name => primary type symbol. For union types the first
    # member is taken as the primary type.
    def types
      avro_schema.fields.each_with_object({}) do |field, acc|
        field_type = field.type
        acc[field.name] =
          if field_type.type_sym == :union
            field_type.schemas.first.type_sym
          else
            field_type.type_sym
          end
      end
    end

    # Alias => canonical field name, read straight from the schema JSON.
    # TODO: Change this when the avro gem starts to support aliases
    def aliases
      fields = JSON.parse(schema_string)['fields']

      fields.each_with_object({}) do |field, acc|
        next unless field['aliases']

        field['aliases'].each { |one_alias| acc[one_alias] = field['name'] }
      end
    end
  end
end
|
data/lib/csv2avro.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'csv2avro/converter'
|
2
|
+
require 'csv2avro/version'
|
3
|
+
|
4
|
+
class CSV2Avro
|
5
|
+
attr_reader :input_path, :schema_path, :bad_rows_path, :stdout_option, :options
|
6
|
+
|
7
|
+
def initialize(options)
|
8
|
+
@input_path = ARGV.first
|
9
|
+
@schema_path = options.delete(:schema)
|
10
|
+
@bad_rows_path = options.delete(:bad_rows)
|
11
|
+
@stdout_option = !input_path || options.delete(:stdout)
|
12
|
+
|
13
|
+
@options = options
|
14
|
+
end
|
15
|
+
|
16
|
+
def convert
|
17
|
+
Converter.new(reader, writer, bad_rows_writer, error_writer, options, schema: schema).convert
|
18
|
+
ensure
|
19
|
+
writer.close if writer
|
20
|
+
|
21
|
+
if bad_rows_writer.size == 0
|
22
|
+
File.delete(bad_rows_uri)
|
23
|
+
elsif bad_rows_writer
|
24
|
+
bad_rows_writer.close
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def schema
|
31
|
+
@schema ||= File.open(schema_path, 'r') do |schema|
|
32
|
+
CSV2Avro::Schema.new(schema)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def reader
|
37
|
+
ARGF.lineno = 0
|
38
|
+
ARGF
|
39
|
+
end
|
40
|
+
|
41
|
+
def writer
|
42
|
+
@__writer ||= begin
|
43
|
+
writer = if stdout_option
|
44
|
+
IO.new(STDOUT.fileno)
|
45
|
+
else
|
46
|
+
File.open(avro_uri, 'w')
|
47
|
+
end
|
48
|
+
|
49
|
+
CSV2Avro::AvroWriter.new(writer, schema)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def avro_uri
|
54
|
+
dir = File.dirname(input_path)
|
55
|
+
ext = File.extname(input_path)
|
56
|
+
name = File.basename(input_path, ext)
|
57
|
+
|
58
|
+
"#{dir}/#{name}.avro"
|
59
|
+
end
|
60
|
+
|
61
|
+
def error_writer
|
62
|
+
$stderr
|
63
|
+
end
|
64
|
+
|
65
|
+
def bad_rows_writer
|
66
|
+
@__bad_rows_writer ||= File.open(bad_rows_uri, 'w')
|
67
|
+
end
|
68
|
+
|
69
|
+
def bad_rows_uri
|
70
|
+
return bad_rows_path if bad_rows_path
|
71
|
+
|
72
|
+
dir = File.dirname(input_path)
|
73
|
+
ext = File.extname(input_path)
|
74
|
+
name = File.basename(input_path, ext)
|
75
|
+
|
76
|
+
"#{dir}/#{name}.bad#{ext}"
|
77
|
+
end
|
78
|
+
end
|