typed_data 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 483f78cdcfb329a0d14f4173fa586f64eae12c8f61dbd78492f1cd85cceadbea
4
- data.tar.gz: 45173313046d5a6dc54d920ed5bee399e64fd8af2a2e710e2b5051b922fcc440
3
+ metadata.gz: 72c692bdd403124256b1670dbaf3fde4a626272779ec7727e07c52e6ca6f4afd
4
+ data.tar.gz: 9fb2db3eeed81eb8141a01ca7f3b9aa3fc791b5d3790c6903fe63886567f9dc9
5
5
  SHA512:
6
- metadata.gz: 4041f615dedb5782ce40d15ae80eca87f3e168b988c2d2de4d9d07d0ba05b00bc2a6e8b14b981d65575ab9f2e38b0db50653286c3093ffaa8e7a586eb7a5e5d9
7
- data.tar.gz: 04f2f8e8a86d63e8f5e06066b96e983332107f116694e059e0b06cd14ebb66827cd8bddc2e85c631298518d8387d28452edd856161b412c105f9a9d05d4151fa
6
+ metadata.gz: e881d11c4543afa34e40d7ccdb40c8f302ac2bb5bf55a29e367e3960cfc885412089dc6b6c40d4167621727141d2f3bd123d5a82d3e6f4ff8b915a7fae65b6b7
7
+ data.tar.gz: 82b34a7ee6180ea0f12c2a065c5b971d55b495cf15d89b865d92bf0b344b5e6c4065a5993d87a35ed92f95810f27194f6fc6be460b1bb312efb7720d0a2b66a3
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # TypedData
2
2
 
3
- ![](https://github.com/abicky/ecsmec/workflows/test/badge.svg?branch=master)
3
+ ![](https://github.com/abicky/typed_data/workflows/CI/badge.svg?branch=master)
4
4
 
5
5
  TypedData is a library that converts hash objects managed by an Avro schema so that the objects can be loaded into BigQuery.
6
6
 
@@ -23,6 +23,8 @@ Or install it yourself as:
23
23
 
24
24
  ## Usage
25
25
 
26
+ ### Use as Ruby library
27
+
26
28
  ```ruby
27
29
  require "typed_data"
28
30
 
@@ -81,19 +83,20 @@ converter.convert({
81
83
  #=> {"int_field"=>1,
82
84
  # "int_or_string_field"=>{"string_value"=>"string"},
83
85
  # "array_field"=>[1, 2],
84
- # "union_type_array_field"=>[{"int_value"=>"1"}, {"string_value"=>"2"}],
86
+ # "union_type_array_field"=>[{"int_value"=>1}, {"string_value"=>"2"}],
85
87
  # "nested_map_field"=>
86
88
  # [{"key"=>"nested_map",
87
89
  # "value"=>
88
- # [{"key"=>"key1", "value"=>{"int_value"=>"1"}},
90
+ # [{"key"=>"key1", "value"=>{"int_value"=>1}},
89
91
  # {"key"=>"key2", "value"=>{"string_value"=>"2"}}]}]}
90
92
  ```
91
93
 
92
- You can specify a formatter for the union type keys. For example, the formatter for tables managed by [Google BigQuery Sink Connector](https://docs.confluent.io/current/connect/kafka-connect-bigquery/index.html) is like below:
94
+ You can specify the formatter for union type keys. The default formatter is `:bigquery`, which is used for BigQuery tables created by loading Avro data for the first time.
95
+ The other formatter is `:avro`, the formatter for the Avro JSON encoding, which is used in tables managed by [Google BigQuery Sink Connector](https://docs.confluent.io/current/connect/kafka-connect-bigquery/index.html):
96
+
93
97
 
94
98
  ```ruby
95
- converter = TypedData::Converter.new(schema)
96
- converter.union_type_key_formatter = ->(type) { type.split("_").first }
99
+ converter = TypedData::Converter.new(schema, key_formatter: :avro)
97
100
  converter.convert({
98
101
  "int_field" => 1,
99
102
  "int_or_string_field" => "string",
@@ -109,14 +112,92 @@ converter.convert({
109
112
  #=> {"int_field"=>1,
110
113
  # "int_or_string_field"=>{"string"=>"string"},
111
114
  # "array_field"=>[1, 2],
112
- # "union_type_array_field"=>[{"int"=>"1"}, {"string"=>"2"}],
115
+ # "union_type_array_field"=>[{"int"=>1}, {"string"=>"2"}],
113
116
  # "nested_map_field"=>
114
117
  # [{"key"=>"nested_map",
115
118
  # "value"=>
116
- # [{"key"=>"key1", "value"=>{"int"=>"1"}},
119
+ # [{"key"=>"key1", "value"=>{"int"=>1}},
117
120
  # {"key"=>"key2", "value"=>{"string"=>"2"}}]}]}
118
121
  ```
119
122
 
123
+ `TypedData::Restorer` enables you to restore the converted data:
124
+
125
+ ```ruby
126
+ restorer = TypedData::Restorer.new(schema)
127
+ restorer.restore({
128
+ "int_field" => 1,
129
+ "int_or_string_field" => { "string_value" => "string" },
130
+ "array_field" => [1, 2],
131
+ "union_type_array_field" => [
132
+ { "int_value" => 1 },
133
+ { "string_value" => "2" },
134
+ ],
135
+ "nested_map_field" => [
136
+ {
137
+ "key" => "nested_map",
138
+ "value" =>[
139
+ {
140
+ "key" => "key1",
141
+ "value" => { "int_value" => 1 }
142
+ },
143
+ {
144
+ "key" => "key2",
145
+ "value" => { "string_value" => "2"}
146
+ },
147
+ ],
148
+ },
149
+ ],
150
+ })
151
+ #=> {"int_field"=>1,
152
+ # "int_or_string_field"=>"string",
153
+ # "array_field"=>[1, 2],
154
+ # "union_type_array_field"=>[1, "2"],
155
+ # "nested_map_field"=>{"nested_map"=>{"key1"=>1, "key2"=>"2"}}}
156
+ ```
157
+
158
+ ### Use as CLI
159
+
160
+ ```
161
+ $ typed-data help
162
+ Commands:
163
+ typed-data convert [file] --schema=SCHEMA # Convert data in an encoding similar to Avro JSON encoding
164
+ typed-data help [COMMAND] # Describe available commands or one specific command
165
+ typed-data restore [file] --schema=SCHEMA # Restore converted data
166
+
167
+ $ typed-data help convert
168
+ Usage:
169
+ typed-data convert [file] --schema=SCHEMA
170
+
171
+ Options:
172
+ --schema=SCHEMA # Path to Avro schema file
173
+ [--key-format=FORMAT] # Format for union type key
174
+ # Default: bigquery
175
+ # Possible values: bigquery, avro
176
+
177
+ Description:
178
+ This command converts data in an encoding similar to Avro JSON encoding. You can specify the file in
179
+ JSON format or JSON Lines format. If the file option is omitted, the command reads data from stdin.
180
+ $ typed-data help restore
181
+ Usage:
182
+ typed-data restore [file] --schema=SCHEMA
183
+
184
+ Options:
185
+ --schema=SCHEMA # Path to Avro schema file
186
+ [--key-format=FORMAT] # Format for union type key
187
+ # Default: bigquery
188
+ # Possible values: bigquery, avro
189
+
190
+ Description:
191
+ This command restores converted data. You can specify the file in JSON format or JSON Lines format. If
192
+ the file option is omitted, the command reads data from stdin.
193
+ ```
194
+
195
+ For example, you can restore the data loaded into a BigQuery table like below:
196
+
197
+ ```
198
+ $ bq query --format json 'SELECT * FROM <table>' | typed-data restore --schema /path/to/avsc
199
+ ```
200
+
120
201
 
121
202
  ## Development
122
203
 
@@ -126,7 +207,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
126
207
 
127
208
  ## Contributing
128
209
 
129
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/typed_data.
210
+ Bug reports and pull requests are welcome on GitHub at https://github.com/abicky/typed_data.
130
211
 
131
212
 
132
213
  ## License
@@ -0,0 +1,22 @@
1
+ require "json"
2
+ require "typed_data"
3
+
4
+ schema = JSON.parse(File.read(File.join(__dir__, "schema.avsc")))
5
+ data = JSON.parse(File.read(File.join(__dir__, "data.jsonl")))
6
+
7
+ puts "Schema:"
8
+ pp schema
9
+ puts
10
+
11
+ puts "Input data:"
12
+ pp data
13
+ puts
14
+
15
+ converter = TypedData::Converter.new(schema)
16
+ puts "Converted data with the default key formatter:"
17
+ pp converter.convert(data)
18
+ puts
19
+
20
+ converter = TypedData::Converter.new(schema, key_formatter: :avro)
21
+ puts "Converted data with the key formatter :avro:"
22
+ pp converter.convert(data)
@@ -0,0 +1,18 @@
1
+ #!/bin/bash
2
+
3
+ cd $(dirname $0)
4
+
5
+ data=data.jsonl
6
+
7
+ echo "Input data:"
8
+ cat $data
9
+ echo
10
+
11
+ cmd="typed-data convert --schema schema.avsc $data"
12
+ echo "Execute: $cmd"
13
+ eval $cmd
14
+ echo
15
+
16
+ cmd="cat $data | typed-data convert --schema schema.avsc"
17
+ echo "Execute: $cmd"
18
+ eval $cmd
@@ -0,0 +1 @@
1
+ {"int_field":1,"int_or_string_field":{"string_value":"string"},"array_field":[1,2],"union_type_array_field":[{"int_value":1},{"string_value":"2"}],"nested_map_field":[{"key":"nested_map","value":[{"key":"key1","value":{"int_value":1}},{"key":"key2","value":{"string_value":"2"}}]}]}
@@ -0,0 +1 @@
1
+ {"int_field":1,"int_or_string_field":"string","array_field":[1,2],"union_type_array_field":[1,"2"],"nested_map_field":{"nested_map":{"key1":1,"key2":"2"}}}
@@ -0,0 +1,17 @@
1
+ require "json"
2
+ require "typed_data"
3
+
4
+ schema = JSON.parse(File.read(File.join(__dir__, "schema.avsc")))
5
+ data = JSON.parse(File.read(File.join(__dir__, "converted_data.jsonl")))
6
+
7
+ puts "Schema:"
8
+ pp schema
9
+ puts
10
+
11
+ puts "Input data:"
12
+ pp data
13
+ puts
14
+
15
+ restorer = TypedData::Restorer.new(schema)
16
+ puts "Restored data:"
17
+ pp restorer.restore(data)
@@ -0,0 +1,18 @@
1
+ #!/bin/bash
2
+
3
+ cd $(dirname $0)
4
+
5
+ data=converted_data.jsonl
6
+
7
+ echo "Input data:"
8
+ cat $data
9
+ echo
10
+
11
+ cmd="typed-data restore --schema schema.avsc $data"
12
+ echo "Execute: $cmd"
13
+ eval $cmd
14
+ echo
15
+
16
+ cmd="cat $data | typed-data restore --schema schema.avsc"
17
+ echo "Execute: $cmd"
18
+ eval $cmd
@@ -0,0 +1,47 @@
1
+ {
2
+ "name": "Record",
3
+ "type": "record",
4
+ "fields": [
5
+ {
6
+ "name": "int_field",
7
+ "type": "int"
8
+ },
9
+ {
10
+ "name": "int_or_string_field",
11
+ "type": [
12
+ "int",
13
+ "string"
14
+ ]
15
+ },
16
+ {
17
+ "name": "array_field",
18
+ "type": {
19
+ "type": "array",
20
+ "items": "int"
21
+ }
22
+ },
23
+ {
24
+ "name": "union_type_array_field",
25
+ "type": {
26
+ "type": "array",
27
+ "items": [
28
+ "int",
29
+ "string"
30
+ ]
31
+ }
32
+ },
33
+ {
34
+ "name": "nested_map_field",
35
+ "type": {
36
+ "type": "map",
37
+ "values": {
38
+ "type": "map",
39
+ "values": [
40
+ "int",
41
+ "string"
42
+ ]
43
+ }
44
+ }
45
+ }
46
+ ]
47
+ }
data/exe/typed-data ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require "typed_data/cli"
3
+
4
+ TypedData::CLI.start(ARGV)
data/lib/typed_data.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require "typed_data/converter"
2
+ require "typed_data/restorer"
2
3
  require "typed_data/version"
3
4
 
4
5
  module TypedData
5
- class Error < StandardError; end
6
- # Your code goes here...
7
6
  end
@@ -0,0 +1,68 @@
1
+ require "json"
2
+ require "thor"
3
+ require "typed_data/converter"
4
+ require "typed_data/restorer"
5
+
6
+ module TypedData
7
+ class CLI < Thor
8
+ def self.exit_on_failure?
9
+ true
10
+ end
11
+
12
+ desc "convert [file]", "Convert data in an encoding similar to Avro JSON encoding"
13
+ long_desc <<~DESC
14
+ This command converts data in an encoding similar to Avro JSON encoding.
15
+ You can specify the file in JSON format or JSON Lines format.
16
+ If the file option is ommited, the command read data from stdin.
17
+ DESC
18
+ option :schema, desc: "Path to Avro schema file", required: true
19
+ option :"key-format", desc: "Format for union type key", enum: %w[bigquery avro], default: "bigquery", banner: "FORMAT"
20
+ def convert(file = nil)
21
+ process(TypedData::Converter, :convert, file)
22
+ end
23
+
24
+ desc "restore [file]", "Restore converted data"
25
+ long_desc <<~DESC
26
+ This command restores converted data.
27
+ You can specify the file in JSON format or JSON Lines format.
28
+ If the file option is ommited, the command read data from stdin.
29
+ DESC
30
+ option :schema, desc: "Path to Avro schema file", required: true
31
+ option :"key-format", desc: "Format for union type key", enum: %w[bigquery avro], default: "bigquery", banner: "FORMAT"
32
+ def restore(file = nil)
33
+ process(TypedData::Restorer, :restore, file)
34
+ end
35
+
36
+ private
37
+
38
+ def process(processor_class, method_name, file)
39
+ abort_if_not_exist(options[:schema])
40
+ abort_if_not_exist(file) if file
41
+
42
+ schema = JSON.parse(File.read(options[:schema]))
43
+ processor = processor_class.new(schema, key_formatter: options[:"key-format"].to_sym)
44
+
45
+ input = file ? File.open(file) : $stdin
46
+ first_line = input.readline.lstrip
47
+ if first_line.start_with?("[")
48
+ first_line << input.read
49
+ JSON.parse(first_line).each do |record|
50
+ puts processor.public_send(method_name, record).to_json
51
+ end
52
+ else
53
+ records = input
54
+ puts processor.public_send(method_name, JSON.parse(first_line)).to_json
55
+ input.each do |line|
56
+ puts processor.public_send(method_name, JSON.parse(line)).to_json
57
+ end
58
+ end
59
+ end
60
+
61
+ def abort_if_not_exist(file)
62
+ unless File.exist?(file)
63
+ $stderr.puts("#{file} doesn't exit")
64
+ exit(1)
65
+ end
66
+ end
67
+ end
68
+ end
@@ -1,111 +1,116 @@
1
1
  # frozen_string_literal: true
2
+ require "typed_data/key_formatter"
2
3
  require "typed_data/schema"
3
4
 
4
5
  module TypedData
5
6
  class Converter
6
- attr_accessor :union_type_key_formatter
7
-
8
7
  # @param schema [Hash] an Avro schema
9
- def initialize(schema)
8
+ # @param key_formatter [Symbol]
9
+ def initialize(schema, key_formatter: :bigquery)
10
10
  @schema = Schema.new(schema)
11
- @union_type_key_formatter = ->(type) { "#{type}_value" }
11
+ @union_type_key_formatter = KeyFormatter.find(key_formatter)
12
+ end
13
+
14
+ def union_type_key_formatter=(formatter)
15
+ warn "DEPRECATION WARNING: #{__method__} is deprecated. Specify the key_formatter :avsc to TypedData::Converter.new instead."
16
+ @union_type_key_formatter = formatter
12
17
  end
13
18
 
14
19
  # @param data [Hash]
15
20
  def convert(data)
16
- convert_record(@schema.root_type, data)
21
+ @schema.root_type.accept(self, data)
22
+ end
23
+
24
+ # @param type [TypedData::Schema::Type]
25
+ # @param value [Object]
26
+ def visit(type, value)
27
+ value
17
28
  end
18
29
 
19
- private
30
+ # @param type [TypedData::Schema::BytesType]
31
+ # @param value [String]
32
+ def visit_bytes(type, value)
33
+ [value].pack("m0")
34
+ end
35
+
36
+ # @param type [TypedData::Schema::IntType]
37
+ # @param logical_type [String, nil] a logical type of the int type
38
+ # @param value [Integer]
39
+ def visit_int(type, logical_type, value)
40
+ case logical_type
41
+ when "date"
42
+ (Date.new(1970, 1, 1) + value).to_s
43
+ when "time-millis"
44
+ Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%T.%3N")
45
+ else
46
+ value
47
+ end
48
+ end
20
49
 
21
- # @param type [RecordType]
50
+ # @param type [TypedData::Schema::LongType]
51
+ # @param logical_type [String, nil] logical type of the long type
52
+ # @param value [Integer]
53
+ def visit_long(type, logical_type, value)
54
+ case logical_type
55
+ when "time-micros"
56
+ Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%T.%6N")
57
+ when "timestamp-millis"
58
+ Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%F %T.%3N")
59
+ when "timestamp-micros"
60
+ Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%F %T.%6N")
61
+ else
62
+ value
63
+ end
64
+ end
65
+
66
+ # @param type [TypedData::Schema::RecordType]
22
67
  # @param record [Hash{String => Object}]
23
- def convert_record(type, record)
68
+ def visit_record(type, record)
24
69
  record.each_with_object({}) do |(key, value), converted|
25
- subtype = type.find_type(key)
26
- case subtype
27
- when Schema::ArrayType
28
- converted[key] = convert_array(subtype, value)
29
- when Schema::MapType
30
- converted[key] = convert_map(subtype, value)
31
- when Schema::RecordType
32
- converted[key] = convert_record(subtype, value)
33
- when Schema::UnionType
34
- converted[key] = convert_union(subtype, value)
35
- else
36
- converted[key] = subtype.coerce(value)
37
- end
70
+ converted[key] = type.find_type(key).accept(self, value)
38
71
  end
39
72
  end
40
73
 
41
- # @param type [ArrayType]
74
+ # @param type [TypedData::Schema::ArrayType]
42
75
  # @param array [Array<Object>]
43
- def convert_array(type, array)
76
+ def visit_array(type, array)
44
77
  array.each_with_object([]) do |value, ret|
45
78
  next if value.nil?
46
79
 
47
- subtype = type.find_match(value)
48
- case subtype
49
- when Schema::ArrayType
50
- ret.concat(convert_array(subtype, value))
51
- when Schema::MapType
52
- ret << convert_map(subtype, value)
53
- when Schema::RecordType
54
- ret << convert_record(subtype, value)
55
- when Schema::UnionType
56
- ret << convert_union(subtype, value)
80
+ converted_value = type.element_type.accept(self, value)
81
+ if type.element_type.is_a?(Schema::ArrayType)
82
+ # BigQuery doesn't support nested arrays
83
+ ret.concat(converted_value)
57
84
  else
58
- ret << subtype.coerce(value)
85
+ ret << converted_value
59
86
  end
60
87
  end
61
88
  end
62
89
 
63
- # @param type [MapType]
90
+ # @param type [TypedData::Schema::MapType]
64
91
  # @param map [Hash{String => Object}]
65
- def convert_map(type, map)
92
+ def visit_map(type, map)
66
93
  map.each_with_object([]) do |(key, value), ret|
67
- subtype = type.find_match(value)
68
- case subtype
69
- when Schema::ArrayType
70
- value = convert_array(subtype, value)
71
- when Schema::MapType
72
- value = convert_map(subtype, value)
73
- when Schema::RecordType
74
- value = convert_record(subtype, value)
75
- when Schema::UnionType
76
- value = convert_union(subtype, value)
77
- else
78
- value = subtype.coerce(value)
79
- end
80
- ret << { "key" => key, "value" => value }
94
+ ret << { "key" => key, "value" => type.element_type.accept(self, value) }
81
95
  end
82
96
  end
83
97
 
84
- # @param type [UnionType]
98
+ # @param type [TypedData::Schema::UnionType]
99
+ # @param types [Array<TypedData::Schema::Type>] types the union type includes
85
100
  # @param value [Object]
86
- def convert_union(type, value)
87
- subtype = type.find_match(value)
88
- case subtype
89
- when Schema::ArrayType
90
- converted_value = convert_array(subtype, value)
91
- when Schema::MapType
92
- converted_value = convert_map(subtype, value)
93
- when Schema::RecordType
94
- converted_value = convert_record(subtype, value)
95
- when Schema::UnionType
96
- converted_value = convert_union(subtype, value)
97
- when Schema::NullType
98
- converted_value = nil
99
- else
100
- converted_value = subtype.coerce(value).to_s
101
+ def visit_union(type, types, value)
102
+ element_type = types.find { |t| t.match?(value) }
103
+ if element_type.nil?
104
+ raise Schema::InvalidValue, %Q{the value #{value.inspect} doesn't match the type #{types.map(&:to_s)}}
101
105
  end
106
+ converted_value = element_type.accept(self, value)
102
107
 
103
108
  if type.nullable_single?
104
109
  converted_value
105
- elsif subtype.is_a?(Schema::NullType)
110
+ elsif element_type.is_a?(Schema::NullType)
106
111
  {}
107
112
  else
108
- { union_type_key_formatter.call(subtype.to_s) => converted_value }
113
+ { @union_type_key_formatter.call(element_type.to_s) => converted_value }
109
114
  end
110
115
  end
111
116
  end
@@ -0,0 +1,17 @@
1
+ module TypedData
2
+ class KeyFormatter
3
+ class UnknownFormatter < StandardError; end
4
+
5
+ UNION_TYPE_KEY_FORMATTERS = {
6
+ bigquery: ->(type) { "#{type}_value" },
7
+ avro: ->(type) { type.split("_").first },
8
+ }
9
+
10
+ # @param formatter [Symbol]
11
+ def self.find(formatter)
12
+ UNION_TYPE_KEY_FORMATTERS.fetch(formatter) do
13
+ raise UnknownFormatter, "Unknown formatter: #{formatter}"
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+ require "time"
3
+ require "typed_data/key_formatter"
4
+ require "typed_data/schema"
5
+
6
+ module TypedData
7
+ class Restorer
8
+ # @param schema [Hash] an Avro schema
9
+ # @param key_formatter [Symbol]
10
+ def initialize(schema, key_formatter: :bigquery)
11
+ @schema = Schema.new(schema)
12
+ @union_type_key_formatter = KeyFormatter.find(key_formatter)
13
+ end
14
+
15
+ # @param data [Hash]
16
+ def restore(data)
17
+ @schema.root_type.accept(self, data)
18
+ end
19
+
20
+ # @param type [TypedData::Schema::Type]
21
+ # @param value [Object]
22
+ def visit(type, value)
23
+ value
24
+ end
25
+
26
+ # @param type [TypedData::Schema::BytesType]
27
+ # @param value [String]
28
+ def visit_bytes(type, value)
29
+ value.unpack("m0").first
30
+ end
31
+
32
+ # @param type [TypedData::Schema::IntType]
33
+ # @param logical_type [String, nil] a logical type of the int type
34
+ # @param value [Integer, String] an Integer, or a date/time String when the type has a logical type
35
+ def visit_int(type, logical_type, value)
36
+ case logical_type
37
+ when "date"
38
+ (Date.parse(value) - Date.new(1970, 1, 1)).to_i
39
+ when "time-millis"
40
+ t = Time.parse(value)
41
+ (t.sec + t.min * 60 + t.hour * 60**2) * 10**3 + t.nsec / 10**6
42
+ else
43
+ value
44
+ end
45
+ end
46
+
47
+ # @param type [TypedData::Schema::LongType]
48
+ # @param logical_type [String, nil] logical type of the long type
49
+ # @param value [Integer, String] an Integer, or a time/timestamp String when the type has a logical type
50
+ def visit_long(type, logical_type, value)
51
+ case logical_type
52
+ when "time-micros"
53
+ t = Time.parse(value)
54
+ (t.sec + t.min * 60 + t.hour * 60**2) * 10**6 + t.nsec / 10**3
55
+ when "timestamp-millis"
56
+ t = parse_as_utc(value)
57
+ t.to_i * 10**3 + t.nsec / 10**6
58
+ when "timestamp-micros"
59
+ t = parse_as_utc(value)
60
+ t.to_i * 10**6 + t.nsec / 10**3
61
+ else
62
+ value
63
+ end
64
+ end
65
+
66
+ # @param type [TypedData::Schema::RecordType]
67
+ # @param record [Hash{String => Object}]
68
+ def visit_record(type, record)
69
+ record.each_with_object({}) do |(key, value), restored|
70
+ restored[key] = type.find_type(key).accept(self, value)
71
+ end
72
+ end
73
+
74
+ # @param type [TypedData::Schema::ArrayType]
75
+ # @param array [Array<Object>]
76
+ def visit_array(type, array)
77
+ array.each_with_object([]) do |value, ret|
78
+ next if value.nil?
79
+
80
+ if type.element_type.is_a?(Schema::ArrayType)
81
+ # BigQuery doesn't support nested arrays
82
+ ret << type.element_type.element_type.accept(self, value)
83
+ else
84
+ ret << type.element_type.accept(self, value)
85
+ end
86
+ end
87
+ end
88
+
89
+ # @param type [TypedData::Schema::MapType]
90
+ # @param array [Array<Hash{String => Object}>] hashes with "key" and "value" entries
91
+ def visit_map(type, array)
92
+ array.each_with_object({}) do |hash, ret|
93
+ ret[hash["key"]] = type.element_type.accept(self, hash["value"])
94
+ end
95
+ end
96
+
97
+ # @param type [TypedData::Schema::UnionType]
98
+ # @param types [Array<TypedData::Schema::Type>] types the union type includes
99
+ # @param value [Object]
100
+ def visit_union(type, types, value)
101
+ if type.nullable_single?
102
+ return if value.nil?
103
+
104
+ element_type = types.find { |t| !t.is_a?(Schema::NullType) }
105
+ element_type.accept(self, value)
106
+ else
107
+ value_without_nil = value.compact
108
+ return if value_without_nil.empty?
109
+
110
+ k = value_without_nil.keys.first
111
+ v = value_without_nil.values.first
112
+ element_type = types.find { |t| k == @union_type_key_formatter.call(t.to_s) }
113
+ element_type.accept(self, v)
114
+ end
115
+ end
116
+
117
+ private
118
+
119
+ # @param time [String]
120
+ def parse_as_utc(time)
121
+ d = Date._parse(time)
122
+ Time.utc(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d.fetch(:sec_fraction, 0) * 1000000)
123
+ end
124
+ end
125
+ end
@@ -11,13 +11,13 @@ require "typed_data/schema/null_type"
11
11
  require "typed_data/schema/record_type"
12
12
  require "typed_data/schema/string_type"
13
13
  require "typed_data/schema/union_type"
14
+ require "typed_data/schema/errors"
14
15
 
15
16
  module TypedData
16
17
  class Schema
17
- class UnknownField < StandardError; end
18
- class UnsupportedType < StandardError; end
19
-
20
18
  class << self
19
+ # @param type [String, Hash{Symbol => Object}, Array<Hash{Symbol => Object}>]
20
+ # @param logical_type [String, nil]
21
21
  def build_type(type, logical_type = nil)
22
22
  type = type.first if type.is_a?(Array) && type.size == 1
23
23
 
@@ -25,27 +25,26 @@ module TypedData
25
25
  when Array
26
26
  UnionType.new(type)
27
27
  when Hash
28
- subtype = type["type"] || type[:type]
29
- logical_type = type["logicalType"] || type[:logicalType]
30
- if logical_type
31
- return build_type(subtype, logical_type)
28
+ actual_type = type[:type]
29
+ if type[:logicalType]
30
+ return build_type(actual_type, type[:logicalType])
32
31
  end
33
32
 
34
- case subtype
33
+ case actual_type
35
34
  when "enum"
36
- EnumType.new(type["name"] || type[:name], type["symbols"] || type[:symbols])
35
+ EnumType.new(type[:name], type[:symbols])
37
36
  when "fixed"
38
- BytesType.new(type["name"] || type[:name] || "bytes")
37
+ BytesType.new(type[:name] || "bytes")
39
38
  when "array"
40
- items = type["items"] || type[:items]
39
+ items = type[:items]
41
40
  ArrayType.new(items.is_a?(Array) ? items : [items])
42
41
  when "map"
43
- values = type["values"] || type[:values]
42
+ values = type[:values]
44
43
  MapType.new(values.is_a?(Array) ? values : [values])
45
44
  when "record"
46
- RecordType.new(type["name"] || type[:name], type["fields"] || type[:fields])
45
+ RecordType.new(type[:name], type[:fields])
47
46
  else
48
- raise UnsupportedType, "Unknown type: #{subtype}"
47
+ raise UnsupportedType, "Unknown type: #{actual_type}"
49
48
  end
50
49
  when "boolean"
51
50
  BooleanType.new(type, logical_type)
@@ -71,11 +70,25 @@ module TypedData
71
70
 
72
71
  # @param schema [Hash] an Avro schema
73
72
  def initialize(schema)
74
- @schema = schema
75
- if (schema["type"] || schema[:type]) != "record"
76
- raise UnsupportedType, 'The root type must be "record"'
73
+ @schema = deep_symbolize_keys(schema)
74
+ @root_type = Schema.build_type(@schema)
75
+ end
76
+
77
+ private
78
+
79
+ # @param o [Object]
80
+ # @return [Object] an object with symbolized keys
81
+ def deep_symbolize_keys(o)
82
+ case o
83
+ when Array
84
+ o.map(&method(:deep_symbolize_keys))
85
+ when Hash
86
+ o.each_with_object({}) do |(k, v), h|
87
+ h[k.to_sym] = deep_symbolize_keys(v)
88
+ end
89
+ else
90
+ o
77
91
  end
78
- @root_type = RecordType.new(schema["name"] || schema[:name], schema["fields"] || schema[:fields])
79
92
  end
80
93
  end
81
94
  end
@@ -4,27 +4,27 @@ require "typed_data/schema/type"
4
4
  module TypedData
5
5
  class Schema
6
6
  class ArrayType < Type
7
- attr_reader :fields
7
+ attr_reader :element_type
8
8
 
9
9
  # @param types [Array<String>]
10
10
  def initialize(types)
11
- @type = Schema.build_type(types.select { |t| t != "null" })
11
+ @element_type = Schema.build_type(types)
12
+ end
13
+
14
+ def accept(visitor, value)
15
+ visitor.visit_array(self, value)
12
16
  end
13
17
 
14
18
  def to_s
15
- "array_#{@type}"
19
+ "array_#{@element_type}"
16
20
  end
17
21
 
18
22
  def primitive?
19
23
  false
20
24
  end
21
25
 
22
- def find_match(value)
23
- @type.match?(value) ? @type : @type.find_match(value)
24
- end
25
-
26
26
  def match?(value)
27
- value.is_a?(Array) && value.all? { |v| @type.match?(v) }
27
+ value.is_a?(Array) && value.all? { |v| @element_type.match?(v) }
28
28
  end
29
29
  end
30
30
  end
@@ -3,8 +3,8 @@
3
3
  module TypedData
4
4
  class Schema
5
5
  class BytesType < Type
6
- def coerce(value)
7
- [value].pack("m0")
6
+ def accept(visitor, value)
7
+ visitor.visit_bytes(self, value)
8
8
  end
9
9
 
10
10
  def primitive?
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module TypedData
4
+ class Schema
5
+ class UnknownField < StandardError; end
6
+ class UnsupportedType < StandardError; end
7
+ class InvalidValue < StandardError; end
8
+ end
9
+ end
@@ -4,6 +4,11 @@ module TypedData
4
4
  class Schema
5
5
  class IntType < Type
6
6
  VALUE_RANGE = -2**31 .. 2**31 - 1
7
+ SUPPORTED_LOGICAL_TYPES = %w[date time-millis]
8
+
9
+ def accept(visitor, value)
10
+ visitor.visit_int(self, @logical_type, value)
11
+ end
7
12
 
8
13
  def to_s
9
14
  if @logical_type
@@ -13,17 +18,6 @@ module TypedData
13
18
  end
14
19
  end
15
20
 
16
- def coerce(value)
17
- case @logical_type
18
- when "date"
19
- (Date.new(1970, 1, 1) + value).to_s
20
- when "time-millis"
21
- Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%T.%3N")
22
- else
23
- value
24
- end
25
- end
26
-
27
21
  def primitive?
28
22
  true
29
23
  end
@@ -3,6 +3,12 @@
3
3
  module TypedData
4
4
  class Schema
5
5
  class LongType < Type
6
+ SUPPORTED_LOGICAL_TYPES = %w[time-micros timestamp-millis timestamp-micros]
7
+
8
+ def accept(visitor, value)
9
+ visitor.visit_long(self, @logical_type, value)
10
+ end
11
+
6
12
  def to_s
7
13
  if @logical_type
8
14
  "#{@name}_#{@logical_type.gsub("-", "_")}"
@@ -11,19 +17,6 @@ module TypedData
11
17
  end
12
18
  end
13
19
 
14
- def coerce(value)
15
- case @logical_type
16
- when "time-micros"
17
- Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%T.%6N")
18
- when "timestamp-millis"
19
- Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%F %T.%3N")
20
- when "timestamp-micros"
21
- Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%F %T.%6N")
22
- else
23
- value
24
- end
25
- end
26
-
27
20
  def primitive?
28
21
  true
29
22
  end
@@ -3,25 +3,27 @@
3
3
  module TypedData
4
4
  class Schema
5
5
  class MapType < Type
6
+ attr_reader :element_type
7
+
6
8
  # @param types [Array<String>]
7
9
  def initialize(types)
8
- @type = Schema.build_type(types)
10
+ @element_type = Schema.build_type(types)
11
+ end
12
+
13
+ def accept(visitor, value)
14
+ visitor.visit_map(self, value)
9
15
  end
10
16
 
11
17
  def to_s
12
- "map_#{@type}"
18
+ "map_#{@element_type}"
13
19
  end
14
20
 
15
21
  def primitive?
16
22
  false
17
23
  end
18
24
 
19
- def find_match(value)
20
- @type.match?(value) ? @type : @type.find_match(value)
21
- end
22
-
23
25
  def match?(value)
24
- value.is_a?(Hash) && value.all? { |_, v| @type.match?(v) }
26
+ value.is_a?(Hash) && value.all? { |_, v| @element_type.match?(v) }
25
27
  end
26
28
  end
27
29
  end
@@ -3,14 +3,19 @@
3
3
  module TypedData
4
4
  class Schema
5
5
  class RecordType < Type
6
+ # @param name [String]
6
7
  # @param fields [Array] an array of "fields" in an Avro schema
7
8
  def initialize(name, fields)
8
9
  @name = name
9
10
  @field_to_type = fields.each_with_object({}) do |field, h|
10
- h[field["name"] || field[:name]] = Schema.build_type(field["type"] || field[:type])
11
+ h[field[:name]] = Schema.build_type(field[:type])
11
12
  end
12
13
  end
13
14
 
15
+ def accept(visitor, value)
16
+ visitor.visit_record(self, value)
17
+ end
18
+
14
19
  def primitive?
15
20
  false
16
21
  end
@@ -3,6 +3,8 @@
3
3
  module TypedData
4
4
  class Schema
5
5
  class StringType < Type
6
+ SUPPORTED_LOGICAL_TYPES = %w[uuid]
7
+
6
8
  def primitive?
7
9
  true
8
10
  end
@@ -1,19 +1,25 @@
1
1
  # frozen_string_literal: true
2
+ require "typed_data/schema/errors"
2
3
 
3
4
  module TypedData
4
5
  class Schema
5
6
  class Type
7
+ SUPPORTED_LOGICAL_TYPES = []
8
+
6
9
  def initialize(name, logical_type = nil)
7
10
  @name = name
11
+ if logical_type && !self.class::SUPPORTED_LOGICAL_TYPES.include?(logical_type)
12
+ raise UnsupportedType, %Q{#{name} doesn't support the logical type "#{logical_type}"}
13
+ end
8
14
  @logical_type = logical_type
9
15
  end
10
16
 
11
- def to_s
12
- @name
17
+ def accept(visitor, value)
18
+ visitor.visit(self, value)
13
19
  end
14
20
 
15
- def coerce(value)
16
- value
21
+ def to_s
22
+ @name
17
23
  end
18
24
 
19
25
  def primitive?
@@ -1,4 +1,5 @@
1
1
  # frozen_string_literal: true
2
+ require "typed_data/schema/errors"
2
3
 
3
4
  module TypedData
4
5
  class Schema
@@ -7,21 +8,21 @@ module TypedData
7
8
  def initialize(types)
8
9
  @types = types.map(&Schema.method(:build_type))
9
10
  @nullable_single = @types.size == 2 && @types.any? { |t| t.is_a?(NullType) }
10
- @nullable_primitive = @nullable_single && @types.any?(&:primitive?)
11
+ @nullable_primitive_type = @types.find(&:primitive?) if @nullable_single
12
+ end
13
+
14
+ def accept(visitor, value)
15
+ visitor.visit_union(self, @types, value)
11
16
  end
12
17
 
13
18
  def to_s
14
- @nullable_primitive ? @types.first.to_s : "union_#{@types.map(&:to_s).join("_")}"
19
+ @nullable_primitive_type&.to_s || "union_#{@types.map(&:to_s).join("_")}"
15
20
  end
16
21
 
17
22
  def primitive?
18
23
  false
19
24
  end
20
25
 
21
- def find_match(value)
22
- @types.find { |t| t.match?(value) }
23
- end
24
-
25
26
  def match?(value)
26
27
  @types.any? { |t| t.match?(value) }
27
28
  end
@@ -1,3 +1,3 @@
1
1
  module TypedData
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
data/typed_data.gemspec CHANGED
@@ -22,6 +22,8 @@ Gem::Specification.new do |spec|
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
24
 
25
+ spec.add_runtime_dependency "thor"
26
+
25
27
  spec.add_development_dependency "avro"
26
28
  spec.add_development_dependency "google-cloud-bigquery"
27
29
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: typed_data
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - abicky
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-04-25 00:00:00.000000000 Z
11
+ date: 2021-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: avro
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -42,7 +56,8 @@ description: TypedData is a library that converts hash objects managed by an Avr
42
56
  schema so that the objects can be loaded into BigQuery.
43
57
  email:
44
58
  - takeshi.arabiki@gmail.com
45
- executables: []
59
+ executables:
60
+ - typed-data
46
61
  extensions: []
47
62
  extra_rdoc_files: []
48
63
  files:
@@ -56,13 +71,25 @@ files:
56
71
  - Rakefile
57
72
  - bin/console
58
73
  - bin/setup
74
+ - example/convert.rb
75
+ - example/convert.sh
76
+ - example/converted_data.jsonl
77
+ - example/data.jsonl
78
+ - example/restore.rb
79
+ - example/restore.sh
80
+ - example/schema.avsc
81
+ - exe/typed-data
59
82
  - lib/typed_data.rb
83
+ - lib/typed_data/cli.rb
60
84
  - lib/typed_data/converter.rb
85
+ - lib/typed_data/key_formatter.rb
86
+ - lib/typed_data/restorer.rb
61
87
  - lib/typed_data/schema.rb
62
88
  - lib/typed_data/schema/array_type.rb
63
89
  - lib/typed_data/schema/boolean_type.rb
64
90
  - lib/typed_data/schema/bytes_type.rb
65
91
  - lib/typed_data/schema/enum_type.rb
92
+ - lib/typed_data/schema/errors.rb
66
93
  - lib/typed_data/schema/float_type.rb
67
94
  - lib/typed_data/schema/int_type.rb
68
95
  - lib/typed_data/schema/long_type.rb
@@ -95,7 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
95
122
  - !ruby/object:Gem::Version
96
123
  version: '0'
97
124
  requirements: []
98
- rubygems_version: 3.1.2
125
+ rubygems_version: 3.1.4
99
126
  signing_key:
100
127
  specification_version: 4
101
128
  summary: A library that converts hash objects managed by an Avro schema