typed_data 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +88 -7
- data/example/convert.rb +22 -0
- data/example/convert.sh +18 -0
- data/example/converted_data.jsonl +1 -0
- data/example/data.jsonl +1 -0
- data/example/restore.rb +17 -0
- data/example/restore.sh +18 -0
- data/example/schema.avsc +47 -0
- data/exe/typed-data +4 -0
- data/lib/typed_data.rb +1 -2
- data/lib/typed_data/cli.rb +68 -0
- data/lib/typed_data/converter.rb +73 -68
- data/lib/typed_data/key_formatter.rb +17 -0
- data/lib/typed_data/restorer.rb +125 -0
- data/lib/typed_data/schema.rb +30 -15
- data/lib/typed_data/schema/array_type.rb +8 -8
- data/lib/typed_data/schema/bytes_type.rb +2 -2
- data/lib/typed_data/schema/int_type.rb +4 -11
- data/lib/typed_data/schema/long_type.rb +4 -13
- data/lib/typed_data/schema/map_type.rb +9 -7
- data/lib/typed_data/schema/record_type.rb +6 -5
- data/lib/typed_data/schema/type.rb +4 -4
- data/lib/typed_data/schema/union_type.rb +6 -7
- data/lib/typed_data/version.rb +1 -1
- data/typed_data.gemspec +2 -0
- metadata +29 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72c692bdd403124256b1670dbaf3fde4a626272779ec7727e07c52e6ca6f4afd
|
4
|
+
data.tar.gz: 9fb2db3eeed81eb8141a01ca7f3b9aa3fc791b5d3790c6903fe63886567f9dc9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e881d11c4543afa34e40d7ccdb40c8f302ac2bb5bf55a29e367e3960cfc885412089dc6b6c40d4167621727141d2f3bd123d5a82d3e6f4ff8b915a7fae65b6b7
|
7
|
+
data.tar.gz: 82b34a7ee6180ea0f12c2a065c5b971d55b495cf15d89b865d92bf0b344b5e6c4065a5993d87a35ed92f95810f27194f6fc6be460b1bb312efb7720d0a2b66a3
|
data/README.md
CHANGED
@@ -23,6 +23,8 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
## Usage
|
25
25
|
|
26
|
+
### Use as Ruby library
|
27
|
+
|
26
28
|
```ruby
|
27
29
|
require "typed_data"
|
28
30
|
|
@@ -81,19 +83,20 @@ converter.convert({
|
|
81
83
|
#=> {"int_field"=>1,
|
82
84
|
# "int_or_string_field"=>{"string_value"=>"string"},
|
83
85
|
# "array_field"=>[1, 2],
|
84
|
-
# "union_type_array_field"=>[{"int_value"=>
|
86
|
+
# "union_type_array_field"=>[{"int_value"=>1}, {"string_value"=>"2"}],
|
85
87
|
# "nested_map_field"=>
|
86
88
|
# [{"key"=>"nested_map",
|
87
89
|
# "value"=>
|
88
|
-
# [{"key"=>"key1", "value"=>{"int_value"=>
|
90
|
+
# [{"key"=>"key1", "value"=>{"int_value"=>1}},
|
89
91
|
# {"key"=>"key2", "value"=>{"string_value"=>"2"}}]}]}
|
90
92
|
```
|
91
93
|
|
92
|
-
You can specify
|
94
|
+
You can specify the formatter for union type keys. The default formatter is `:bigquery`, which is used for BigQuery tables created by loading Avro data for the first time.
|
95
|
+
The other formatter is `:avro`, the formatter for the Avro JSON encoding, which is used in tables managed by [Google BigQuery Sink Connector](https://docs.confluent.io/current/connect/kafka-connect-bigquery/index.html):
|
96
|
+
|
93
97
|
|
94
98
|
```ruby
|
95
|
-
converter = TypedData::Converter.new(schema)
|
96
|
-
converter.union_type_key_formatter = ->(type) { type.split("_").first }
|
99
|
+
converter = TypedData::Converter.new(schema, key_formatter: :avro)
|
97
100
|
converter.convert({
|
98
101
|
"int_field" => 1,
|
99
102
|
"int_or_string_field" => "string",
|
@@ -109,14 +112,92 @@ converter.convert({
|
|
109
112
|
#=> {"int_field"=>1,
|
110
113
|
# "int_or_string_field"=>{"string"=>"string"},
|
111
114
|
# "array_field"=>[1, 2],
|
112
|
-
# "union_type_array_field"=>[{"int"=>
|
115
|
+
# "union_type_array_field"=>[{"int"=>1}, {"string"=>"2"}],
|
113
116
|
# "nested_map_field"=>
|
114
117
|
# [{"key"=>"nested_map",
|
115
118
|
# "value"=>
|
116
|
-
# [{"key"=>"key1", "value"=>{"int"=>
|
119
|
+
# [{"key"=>"key1", "value"=>{"int"=>1}},
|
117
120
|
# {"key"=>"key2", "value"=>{"string"=>"2"}}]}]}
|
118
121
|
```
|
119
122
|
|
123
|
+
`TypedData::Restorer` enables you to restore the converted data:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
restorer = TypedData::Restorer.new(schema)
|
127
|
+
restorer.restore({
|
128
|
+
"int_field" => 1,
|
129
|
+
"int_or_string_field" => { "string_value" => "string" },
|
130
|
+
"array_field" => [1, 2],
|
131
|
+
"union_type_array_field" => [
|
132
|
+
{ "int_value" => 1 },
|
133
|
+
{ "string_value" => "2" },
|
134
|
+
],
|
135
|
+
"nested_map_field" => [
|
136
|
+
{
|
137
|
+
"key" => "nested_map",
|
138
|
+
"value" =>[
|
139
|
+
{
|
140
|
+
"key" => "key1",
|
141
|
+
"value" => { "int_value" => 1 }
|
142
|
+
},
|
143
|
+
{
|
144
|
+
"key" => "key2",
|
145
|
+
"value" => { "string_value" => "2"}
|
146
|
+
},
|
147
|
+
],
|
148
|
+
},
|
149
|
+
],
|
150
|
+
})
|
151
|
+
#=> {"int_field"=>1,
|
152
|
+
# "int_or_string_field"=>"string",
|
153
|
+
# "array_field"=>[1, 2],
|
154
|
+
# "union_type_array_field"=>[1, "2"],
|
155
|
+
# "nested_map_field"=>{"nested_map"=>{"key1"=>1, "key2"=>"2"}}}
|
156
|
+
```
|
157
|
+
|
158
|
+
### Use as CLI
|
159
|
+
|
160
|
+
```
|
161
|
+
$ typed-data help
|
162
|
+
Commands:
|
163
|
+
typed-data convert [file] --schema=SCHEMA # Convert data in an encoding similar to Avro JSON encoding
|
164
|
+
typed-data help [COMMAND] # Describe available commands or one specific command
|
165
|
+
typed-data restore [file] --schema=SCHEMA # Restore converted data
|
166
|
+
|
167
|
+
$ typed-data help convert
|
168
|
+
Usage:
|
169
|
+
typed-data convert [file] --schema=SCHEMA
|
170
|
+
|
171
|
+
Options:
|
172
|
+
--schema=SCHEMA # Path to Avro schema file
|
173
|
+
[--key-format=FORMAT] # Format for union type key
|
174
|
+
# Default: bigquery
|
175
|
+
# Possible values: bigquery, avro
|
176
|
+
|
177
|
+
Description:
|
178
|
+
This command converts data in an encoding similar to Avro JSON encoding. You can specify the file in
|
179
|
+
JSON format or JSON Lines format. If the file option is omitted, the command reads data from stdin.
|
180
|
+
$ typed-data help restore
|
181
|
+
Usage:
|
182
|
+
typed-data restore [file] --schema=SCHEMA
|
183
|
+
|
184
|
+
Options:
|
185
|
+
--schema=SCHEMA # Path to Avro schema file
|
186
|
+
[--key-format=FORMAT] # Format for union type key
|
187
|
+
# Default: bigquery
|
188
|
+
# Possible values: bigquery, avro
|
189
|
+
|
190
|
+
Description:
|
191
|
+
This command restores converted data. You can specify the file in JSON format or JSON Lines format. If
|
192
|
+
the file option is omitted, the command reads data from stdin.
|
193
|
+
```
|
194
|
+
|
195
|
+
For example, you can restore the data loaded into a BigQuery table like below:
|
196
|
+
|
197
|
+
```
|
198
|
+
$ bq query --format json 'SELECT * FROM <table>' | typed-data restore --schema /path/to/avsc
|
199
|
+
```
|
200
|
+
|
120
201
|
|
121
202
|
## Development
|
122
203
|
|
data/example/convert.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require "json"
|
2
|
+
require "typed_data"
|
3
|
+
|
4
|
+
schema = JSON.parse(File.read(File.join(__dir__, "schema.avsc")))
|
5
|
+
data = JSON.parse(File.read(File.join(__dir__, "data.jsonl")))
|
6
|
+
|
7
|
+
puts "Schema:"
|
8
|
+
pp schema
|
9
|
+
puts
|
10
|
+
|
11
|
+
puts "Input data:"
|
12
|
+
pp data
|
13
|
+
puts
|
14
|
+
|
15
|
+
converter = TypedData::Converter.new(schema)
|
16
|
+
puts "Converted data with the default key formatter:"
|
17
|
+
pp converter.convert(data)
|
18
|
+
puts
|
19
|
+
|
20
|
+
converter = TypedData::Converter.new(schema, key_formatter: :avro)
|
21
|
+
puts "Converted data with the key formatter :avro:"
|
22
|
+
pp converter.convert(data)
|
data/example/convert.sh
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
cd $(dirname $0)
|
4
|
+
|
5
|
+
data=data.jsonl
|
6
|
+
|
7
|
+
echo "Input data:"
|
8
|
+
cat $data
|
9
|
+
echo
|
10
|
+
|
11
|
+
cmd="typed-data convert --schema schema.avsc $data"
|
12
|
+
echo "Execute: $cmd"
|
13
|
+
eval $cmd
|
14
|
+
echo
|
15
|
+
|
16
|
+
cmd="cat $data | typed-data convert --schema schema.avsc"
|
17
|
+
echo "Execute: $cmd"
|
18
|
+
eval $cmd
|
@@ -0,0 +1 @@
|
|
1
|
+
{"int_field":1,"int_or_string_field":{"string_value":"string"},"array_field":[1,2],"union_type_array_field":[{"int_value":1},{"string_value":"2"}],"nested_map_field":[{"key":"nested_map","value":[{"key":"key1","value":{"int_value":1}},{"key":"key2","value":{"string_value":"2"}}]}]}
|
data/example/data.jsonl
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{"int_field":1,"int_or_string_field":"string","array_field":[1,2],"union_type_array_field":[1,"2"],"nested_map_field":{"nested_map":{"key1":1,"key2":"2"}}}
|
data/example/restore.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "json"
|
2
|
+
require "typed_data"
|
3
|
+
|
4
|
+
schema = JSON.parse(File.read(File.join(__dir__, "schema.avsc")))
|
5
|
+
data = JSON.parse(File.read(File.join(__dir__, "converted_data.jsonl")))
|
6
|
+
|
7
|
+
puts "Schema:"
|
8
|
+
pp schema
|
9
|
+
puts
|
10
|
+
|
11
|
+
puts "Input data:"
|
12
|
+
pp data
|
13
|
+
puts
|
14
|
+
|
15
|
+
restorer = TypedData::Restorer.new(schema)
|
16
|
+
puts "Restored data:"
|
17
|
+
pp restorer.restore(data)
|
data/example/restore.sh
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
cd $(dirname $0)
|
4
|
+
|
5
|
+
data=converted_data.jsonl
|
6
|
+
|
7
|
+
echo "Input data:"
|
8
|
+
cat $data
|
9
|
+
echo
|
10
|
+
|
11
|
+
cmd="typed-data restore --schema schema.avsc $data"
|
12
|
+
echo "Execute: $cmd"
|
13
|
+
eval $cmd
|
14
|
+
echo
|
15
|
+
|
16
|
+
cmd="cat $data | typed-data restore --schema schema.avsc"
|
17
|
+
echo "Execute: $cmd"
|
18
|
+
eval $cmd
|
data/example/schema.avsc
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
{
|
2
|
+
"name": "Record",
|
3
|
+
"type": "record",
|
4
|
+
"fields": [
|
5
|
+
{
|
6
|
+
"name": "int_field",
|
7
|
+
"type": "int"
|
8
|
+
},
|
9
|
+
{
|
10
|
+
"name": "int_or_string_field",
|
11
|
+
"type": [
|
12
|
+
"int",
|
13
|
+
"string"
|
14
|
+
]
|
15
|
+
},
|
16
|
+
{
|
17
|
+
"name": "array_field",
|
18
|
+
"type": {
|
19
|
+
"type": "array",
|
20
|
+
"items": "int"
|
21
|
+
}
|
22
|
+
},
|
23
|
+
{
|
24
|
+
"name": "union_type_array_field",
|
25
|
+
"type": {
|
26
|
+
"type": "array",
|
27
|
+
"items": [
|
28
|
+
"int",
|
29
|
+
"string"
|
30
|
+
]
|
31
|
+
}
|
32
|
+
},
|
33
|
+
{
|
34
|
+
"name": "nested_map_field",
|
35
|
+
"type": {
|
36
|
+
"type": "map",
|
37
|
+
"values": {
|
38
|
+
"type": "map",
|
39
|
+
"values": [
|
40
|
+
"int",
|
41
|
+
"string"
|
42
|
+
]
|
43
|
+
}
|
44
|
+
}
|
45
|
+
}
|
46
|
+
]
|
47
|
+
}
|
data/exe/typed-data
ADDED
data/lib/typed_data.rb
CHANGED
@@ -0,0 +1,68 @@
|
|
1
|
+
require "json"
|
2
|
+
require "thor"
|
3
|
+
require "typed_data/converter"
|
4
|
+
require "typed_data/restorer"
|
5
|
+
|
6
|
+
module TypedData
|
7
|
+
class CLI < Thor
|
8
|
+
def self.exit_on_failure?
|
9
|
+
true
|
10
|
+
end
|
11
|
+
|
12
|
+
desc "convert [file]", "Convert data in an encoding similar to Avro JSON encoding"
|
13
|
+
long_desc <<~DESC
|
14
|
+
This command converts data in an encoding similar to Avro JSON encoding.
|
15
|
+
You can specify the file in JSON format or JSON Lines format.
|
16
|
+
If the file option is omitted, the command reads data from stdin.
|
17
|
+
DESC
|
18
|
+
option :schema, desc: "Path to Avro schema file", required: true
|
19
|
+
option :"key-format", desc: "Format for union type key", enum: %w[bigquery avro], default: "bigquery", banner: "FORMAT"
|
20
|
+
def convert(file = nil)
|
21
|
+
process(TypedData::Converter, :convert, file)
|
22
|
+
end
|
23
|
+
|
24
|
+
desc "restore [file]", "Restore converted data"
|
25
|
+
long_desc <<~DESC
|
26
|
+
This command restores converted data.
|
27
|
+
You can specify the file in JSON format or JSON Lines format.
|
28
|
+
If the file option is omitted, the command reads data from stdin.
|
29
|
+
DESC
|
30
|
+
option :schema, desc: "Path to Avro schema file", required: true
|
31
|
+
option :"key-format", desc: "Format for union type key", enum: %w[bigquery avro], default: "bigquery", banner: "FORMAT"
|
32
|
+
def restore(file = nil)
|
33
|
+
process(TypedData::Restorer, :restore, file)
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def process(processor_class, method_name, file)
|
39
|
+
abort_if_not_exist(options[:schema])
|
40
|
+
abort_if_not_exist(file) if file
|
41
|
+
|
42
|
+
schema = JSON.parse(File.read(options[:schema]))
|
43
|
+
processor = processor_class.new(schema, key_formatter: options[:"key-format"].to_sym)
|
44
|
+
|
45
|
+
input = file ? File.open(file) : $stdin
|
46
|
+
first_line = input.readline.lstrip
|
47
|
+
if first_line.start_with?("[")
|
48
|
+
first_line << input.read
|
49
|
+
JSON.parse(first_line).each do |record|
|
50
|
+
puts processor.public_send(method_name, record).to_json
|
51
|
+
end
|
52
|
+
else
|
53
|
+
records = input
|
54
|
+
puts processor.public_send(method_name, JSON.parse(first_line)).to_json
|
55
|
+
input.each do |line|
|
56
|
+
puts processor.public_send(method_name, JSON.parse(line)).to_json
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def abort_if_not_exist(file)
|
62
|
+
unless File.exist?(file)
|
63
|
+
$stderr.puts("#{file} doesn't exist")
|
64
|
+
exit(1)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
data/lib/typed_data/converter.rb
CHANGED
@@ -1,111 +1,116 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
require "typed_data/key_formatter"
|
2
3
|
require "typed_data/schema"
|
3
4
|
|
4
5
|
module TypedData
|
5
6
|
class Converter
|
6
|
-
attr_accessor :union_type_key_formatter
|
7
|
-
|
8
7
|
# @param schema [Hash] an Avro schema
|
9
|
-
|
8
|
+
# @param key_formatter [Symbol]
|
9
|
+
def initialize(schema, key_formatter: :bigquery)
|
10
10
|
@schema = Schema.new(schema)
|
11
|
-
@union_type_key_formatter =
|
11
|
+
@union_type_key_formatter = KeyFormatter.find(key_formatter)
|
12
|
+
end
|
13
|
+
|
14
|
+
def union_type_key_formatter=(formatter)
|
15
|
+
warn "DEPRECATION WARNING: #{__method__} is deprecated. Specify the key_formatter :avro to TypedData::Converter.new instead."
|
16
|
+
@union_type_key_formatter = formatter
|
12
17
|
end
|
13
18
|
|
14
19
|
# @param data [Hash]
|
15
20
|
def convert(data)
|
16
|
-
|
21
|
+
@schema.root_type.accept(self, data)
|
22
|
+
end
|
23
|
+
|
24
|
+
# @param type [TypedData::Schema::Type]
|
25
|
+
# @param value [Object]
|
26
|
+
def visit(type, value)
|
27
|
+
value
|
17
28
|
end
|
18
29
|
|
19
|
-
|
30
|
+
# @param type [TypedData::Schema::BytesType]
|
31
|
+
# @param value [String]
|
32
|
+
def visit_bytes(type, value)
|
33
|
+
[value].pack("m0")
|
34
|
+
end
|
35
|
+
|
36
|
+
# @param type [TypedData::Schema::IntType]
|
37
|
+
# @param logical_type [String, nil] a logical type of the int type
|
38
|
+
# @param value [Integer]
|
39
|
+
def visit_int(type, logical_type, value)
|
40
|
+
case logical_type
|
41
|
+
when "date"
|
42
|
+
(Date.new(1970, 1, 1) + value).to_s
|
43
|
+
when "time-millis"
|
44
|
+
Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%T.%3N")
|
45
|
+
else
|
46
|
+
value
|
47
|
+
end
|
48
|
+
end
|
20
49
|
|
21
|
-
# @param type [
|
50
|
+
# @param type [TypedData::Schema::LongType]
|
51
|
+
# @param logical_type [String, nil] logical type of the long type
|
52
|
+
# @param value [Integer]
|
53
|
+
def visit_long(type, logical_type, value)
|
54
|
+
case logical_type
|
55
|
+
when "time-micros"
|
56
|
+
Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%T.%6N")
|
57
|
+
when "timestamp-millis"
|
58
|
+
Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%F %T.%3N")
|
59
|
+
when "timestamp-micros"
|
60
|
+
Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%F %T.%6N")
|
61
|
+
else
|
62
|
+
value
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# @param type [TypedData::Schema::RecordType]
|
22
67
|
# @param record [Hash{String => Object}]
|
23
|
-
def
|
68
|
+
def visit_record(type, record)
|
24
69
|
record.each_with_object({}) do |(key, value), converted|
|
25
|
-
|
26
|
-
case subtype
|
27
|
-
when Schema::ArrayType
|
28
|
-
converted[key] = convert_array(subtype, value)
|
29
|
-
when Schema::MapType
|
30
|
-
converted[key] = convert_map(subtype, value)
|
31
|
-
when Schema::RecordType
|
32
|
-
converted[key] = convert_record(subtype, value)
|
33
|
-
when Schema::UnionType
|
34
|
-
converted[key] = convert_union(subtype, value)
|
35
|
-
else
|
36
|
-
converted[key] = subtype.coerce(value)
|
37
|
-
end
|
70
|
+
converted[key] = type.find_type(key).accept(self, value)
|
38
71
|
end
|
39
72
|
end
|
40
73
|
|
41
|
-
# @param type [ArrayType]
|
74
|
+
# @param type [TypedData::Schema::ArrayType]
|
42
75
|
# @param array [Array<Object>]
|
43
|
-
def
|
76
|
+
def visit_array(type, array)
|
44
77
|
array.each_with_object([]) do |value, ret|
|
45
78
|
next if value.nil?
|
46
79
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
ret.concat(
|
51
|
-
when Schema::MapType
|
52
|
-
ret << convert_map(subtype, value)
|
53
|
-
when Schema::RecordType
|
54
|
-
ret << convert_record(subtype, value)
|
55
|
-
when Schema::UnionType
|
56
|
-
ret << convert_union(subtype, value)
|
80
|
+
converted_value = type.element_type.accept(self, value)
|
81
|
+
if type.element_type.is_a?(Schema::ArrayType)
|
82
|
+
# BigQuery doesn't support nested arrays
|
83
|
+
ret.concat(converted_value)
|
57
84
|
else
|
58
|
-
ret <<
|
85
|
+
ret << converted_value
|
59
86
|
end
|
60
87
|
end
|
61
88
|
end
|
62
89
|
|
63
|
-
# @param type [MapType]
|
90
|
+
# @param type [TypedData::Schema::MapType]
|
64
91
|
# @param map [Hash{String => Object}]
|
65
|
-
def
|
92
|
+
def visit_map(type, map)
|
66
93
|
map.each_with_object([]) do |(key, value), ret|
|
67
|
-
|
68
|
-
case subtype
|
69
|
-
when Schema::ArrayType
|
70
|
-
value = convert_array(subtype, value)
|
71
|
-
when Schema::MapType
|
72
|
-
value = convert_map(subtype, value)
|
73
|
-
when Schema::RecordType
|
74
|
-
value = convert_record(subtype, value)
|
75
|
-
when Schema::UnionType
|
76
|
-
value = convert_union(subtype, value)
|
77
|
-
else
|
78
|
-
value = subtype.coerce(value)
|
79
|
-
end
|
80
|
-
ret << { "key" => key, "value" => value }
|
94
|
+
ret << { "key" => key, "value" => type.element_type.accept(self, value) }
|
81
95
|
end
|
82
96
|
end
|
83
97
|
|
84
|
-
# @param type [UnionType]
|
98
|
+
# @param type [TypedData::Schema::UnionType]
|
99
|
+
# @param types [Array<TypedData::Schema::Type>] types the union type includes
|
85
100
|
# @param value [Object]
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
converted_value = convert_array(subtype, value)
|
91
|
-
when Schema::MapType
|
92
|
-
converted_value = convert_map(subtype, value)
|
93
|
-
when Schema::RecordType
|
94
|
-
converted_value = convert_record(subtype, value)
|
95
|
-
when Schema::UnionType
|
96
|
-
converted_value = convert_union(subtype, value)
|
97
|
-
when Schema::NullType
|
98
|
-
converted_value = nil
|
99
|
-
else
|
100
|
-
converted_value = subtype.coerce(value)
|
101
|
+
def visit_union(type, types, value)
|
102
|
+
element_type = types.find { |t| t.match?(value) }
|
103
|
+
if element_type.nil?
|
104
|
+
raise Schema::InvalidValue, %Q{the value #{value.inspect} doesn't match the type #{types.map(&:to_s)}}
|
101
105
|
end
|
106
|
+
converted_value = element_type.accept(self, value)
|
102
107
|
|
103
108
|
if type.nullable_single?
|
104
109
|
converted_value
|
105
|
-
elsif
|
110
|
+
elsif element_type.is_a?(Schema::NullType)
|
106
111
|
{}
|
107
112
|
else
|
108
|
-
{ union_type_key_formatter.call(
|
113
|
+
{ @union_type_key_formatter.call(element_type.to_s) => converted_value }
|
109
114
|
end
|
110
115
|
end
|
111
116
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module TypedData
|
2
|
+
class KeyFormatter
|
3
|
+
class UnknownFormatter < StandardError; end
|
4
|
+
|
5
|
+
UNION_TYPE_KEY_FORMATTERS = {
|
6
|
+
bigquery: ->(type) { "#{type}_value" },
|
7
|
+
avro: ->(type) { type.split("_").first },
|
8
|
+
}
|
9
|
+
|
10
|
+
# @param formatter [Symbol]
|
11
|
+
def self.find(formatter)
|
12
|
+
UNION_TYPE_KEY_FORMATTERS.fetch(formatter) do
|
13
|
+
raise UnknownFormatter, "Unknown formatter: #{formatter}"
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "time"
|
3
|
+
require "typed_data/key_formatter"
|
4
|
+
require "typed_data/schema"
|
5
|
+
|
6
|
+
module TypedData
|
7
|
+
class Restorer
|
8
|
+
# @param schema [Hash] an Avro schema
|
9
|
+
# @param key_formatter [Symbol]
|
10
|
+
def initialize(schema, key_formatter: :bigquery)
|
11
|
+
@schema = Schema.new(schema)
|
12
|
+
@union_type_key_formatter = KeyFormatter.find(key_formatter)
|
13
|
+
end
|
14
|
+
|
15
|
+
# @param data [Hash]
|
16
|
+
def restore(data)
|
17
|
+
@schema.root_type.accept(self, data)
|
18
|
+
end
|
19
|
+
|
20
|
+
# @param type [TypedData::Schema::Type]
|
21
|
+
# @param value [Object]
|
22
|
+
def visit(type, value)
|
23
|
+
value
|
24
|
+
end
|
25
|
+
|
26
|
+
# @param type [TypedData::Schema::BytesType]
|
27
|
+
# @param value [String]
|
28
|
+
def visit_bytes(type, value)
|
29
|
+
value.unpack("m0").first
|
30
|
+
end
|
31
|
+
|
32
|
+
# @param type [TypedData::Schema::IntType]
|
33
|
+
# @param logical_type [String, nil] a logical type of the int type
|
34
|
+
# @param value [Integer, String] an Integer, or a date/time String when the field has a logical type
|
35
|
+
def visit_int(type, logical_type, value)
|
36
|
+
case logical_type
|
37
|
+
when "date"
|
38
|
+
(Date.parse(value) - Date.new(1970, 1, 1)).to_i
|
39
|
+
when "time-millis"
|
40
|
+
t = Time.parse(value)
|
41
|
+
(t.sec + t.min * 60 + t.hour * 60**2) * 10**3 + t.nsec / 10**6
|
42
|
+
else
|
43
|
+
value
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# @param type [TypedData::Schema::LongType]
|
48
|
+
# @param logical_type [String, nil] logical type of the long type
|
49
|
+
# @param value [Integer, String] an Integer, or a time/timestamp String when the field has a logical type
|
50
|
+
def visit_long(type, logical_type, value)
|
51
|
+
case logical_type
|
52
|
+
when "time-micros"
|
53
|
+
t = Time.parse(value)
|
54
|
+
(t.sec + t.min * 60 + t.hour * 60**2) * 10**6 + t.nsec / 10**3
|
55
|
+
when "timestamp-millis"
|
56
|
+
t = parse_as_utc(value)
|
57
|
+
t.to_i * 10**3 + t.nsec / 10**6
|
58
|
+
when "timestamp-micros"
|
59
|
+
t = parse_as_utc(value)
|
60
|
+
t.to_i * 10**6 + t.nsec / 10**3
|
61
|
+
else
|
62
|
+
value
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# @param type [TypedData::Schema::RecordType]
|
67
|
+
# @param record [Hash{String => Object}]
|
68
|
+
def visit_record(type, record)
|
69
|
+
record.each_with_object({}) do |(key, value), restored|
|
70
|
+
restored[key] = type.find_type(key).accept(self, value)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
# @param type [TypedData::Schema::ArrayType]
|
75
|
+
# @param array [Array<Object>]
|
76
|
+
def visit_array(type, array)
|
77
|
+
array.each_with_object([]) do |value, ret|
|
78
|
+
next if value.nil?
|
79
|
+
|
80
|
+
if type.element_type.is_a?(Schema::ArrayType)
|
81
|
+
# BigQuery doesn't support nested arrays
|
82
|
+
ret << type.element_type.element_type.accept(self, value)
|
83
|
+
else
|
84
|
+
ret << type.element_type.accept(self, value)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# @param type [TypedData::Schema::MapType]
|
90
|
+
# @param array [Array<Hash{String => Object}>] converted map entries, each with "key" and "value"
|
91
|
+
def visit_map(type, array)
|
92
|
+
array.each_with_object({}) do |hash, ret|
|
93
|
+
ret[hash["key"]] = type.element_type.accept(self, hash["value"])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
# @param type [TypedData::Schema::UnionType]
|
98
|
+
# @param types [Array<TypedData::Schema::Type>] types the union type includes
|
99
|
+
# @param value [Object]
|
100
|
+
def visit_union(type, types, value)
|
101
|
+
if type.nullable_single?
|
102
|
+
return if value.nil?
|
103
|
+
|
104
|
+
element_type = types.find { |t| !t.is_a?(Schema::NullType) }
|
105
|
+
element_type.accept(self, value)
|
106
|
+
else
|
107
|
+
value_without_nil = value.compact
|
108
|
+
return if value_without_nil.empty?
|
109
|
+
|
110
|
+
k = value_without_nil.keys.first
|
111
|
+
v = value_without_nil.values.first
|
112
|
+
element_type = types.find { |t| k == @union_type_key_formatter.call(t.to_s) }
|
113
|
+
element_type.accept(self, v)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# @param time [String]
|
120
|
+
def parse_as_utc(time)
|
121
|
+
d = Date._parse(time)
|
122
|
+
Time.utc(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], d.fetch(:sec_fraction, 0) * 1000000)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
data/lib/typed_data/schema.rb
CHANGED
@@ -16,6 +16,8 @@ require "typed_data/schema/errors"
|
|
16
16
|
module TypedData
|
17
17
|
class Schema
|
18
18
|
class << self
|
19
|
+
# @param type [String, Hash{Symbol => Object}, Array<Hash{Symbol => Object}>]
|
20
|
+
# @param logical_type [String, nil]
|
19
21
|
def build_type(type, logical_type = nil)
|
20
22
|
type = type.first if type.is_a?(Array) && type.size == 1
|
21
23
|
|
@@ -23,27 +25,26 @@ module TypedData
|
|
23
25
|
when Array
|
24
26
|
UnionType.new(type)
|
25
27
|
when Hash
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
return build_type(subtype, logical_type)
|
28
|
+
actual_type = type[:type]
|
29
|
+
if type[:logicalType]
|
30
|
+
return build_type(actual_type, type[:logicalType])
|
30
31
|
end
|
31
32
|
|
32
|
-
case
|
33
|
+
case actual_type
|
33
34
|
when "enum"
|
34
|
-
EnumType.new(type[
|
35
|
+
EnumType.new(type[:name], type[:symbols])
|
35
36
|
when "fixed"
|
36
|
-
BytesType.new(type[
|
37
|
+
BytesType.new(type[:name] || "bytes")
|
37
38
|
when "array"
|
38
|
-
items = type[
|
39
|
+
items = type[:items]
|
39
40
|
ArrayType.new(items.is_a?(Array) ? items : [items])
|
40
41
|
when "map"
|
41
|
-
values = type[
|
42
|
+
values = type[:values]
|
42
43
|
MapType.new(values.is_a?(Array) ? values : [values])
|
43
44
|
when "record"
|
44
|
-
RecordType.new(type[
|
45
|
+
RecordType.new(type[:name], type[:fields])
|
45
46
|
else
|
46
|
-
raise UnsupportedType, "Unknown type: #{
|
47
|
+
raise UnsupportedType, "Unknown type: #{actual_type}"
|
47
48
|
end
|
48
49
|
when "boolean"
|
49
50
|
BooleanType.new(type, logical_type)
|
@@ -69,11 +70,25 @@ module TypedData
|
|
69
70
|
|
70
71
|
# @param schema [Hash] an Avro schema
|
71
72
|
def initialize(schema)
|
72
|
-
@schema = schema
|
73
|
-
|
74
|
-
|
73
|
+
@schema = deep_symbolize_keys(schema)
|
74
|
+
@root_type = Schema.build_type(@schema)
|
75
|
+
end
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
# @param o [Object]
|
80
|
+
# @return [Object] an object with symbolized keys
|
81
|
+
def deep_symbolize_keys(o)
|
82
|
+
case o
|
83
|
+
when Array
|
84
|
+
o.map(&method(:deep_symbolize_keys))
|
85
|
+
when Hash
|
86
|
+
o.each_with_object({}) do |(k, v), h|
|
87
|
+
h[k.to_sym] = deep_symbolize_keys(v)
|
88
|
+
end
|
89
|
+
else
|
90
|
+
o
|
75
91
|
end
|
76
|
-
@root_type = RecordType.new(schema["name"] || schema[:name], schema["fields"] || schema[:fields])
|
77
92
|
end
|
78
93
|
end
|
79
94
|
end
|
@@ -4,27 +4,27 @@ require "typed_data/schema/type"
|
|
4
4
|
module TypedData
|
5
5
|
class Schema
|
6
6
|
class ArrayType < Type
|
7
|
-
attr_reader :
|
7
|
+
attr_reader :element_type
|
8
8
|
|
9
9
|
# @param types [Array<String>]
|
10
10
|
def initialize(types)
|
11
|
-
@
|
11
|
+
@element_type = Schema.build_type(types)
|
12
|
+
end
|
13
|
+
|
14
|
+
def accept(visitor, value)
|
15
|
+
visitor.visit_array(self, value)
|
12
16
|
end
|
13
17
|
|
14
18
|
def to_s
|
15
|
-
"array_#{@
|
19
|
+
"array_#{@element_type}"
|
16
20
|
end
|
17
21
|
|
18
22
|
def primitive?
|
19
23
|
false
|
20
24
|
end
|
21
25
|
|
22
|
-
def find_match(value)
|
23
|
-
@type.match?(value) ? @type : @type.find_match(value)
|
24
|
-
end
|
25
|
-
|
26
26
|
def match?(value)
|
27
|
-
value.is_a?(Array) && value.all? { |v| @
|
27
|
+
value.is_a?(Array) && value.all? { |v| @element_type.match?(v) }
|
28
28
|
end
|
29
29
|
end
|
30
30
|
end
|
@@ -6,6 +6,10 @@ module TypedData
|
|
6
6
|
VALUE_RANGE = -2**31 .. 2**31 - 1
|
7
7
|
SUPPORTED_LOGICAL_TYPES = %w[date time-millis]
|
8
8
|
|
9
|
+
def accept(visitor, value)
|
10
|
+
visitor.visit_int(self, @logical_type, value)
|
11
|
+
end
|
12
|
+
|
9
13
|
def to_s
|
10
14
|
if @logical_type
|
11
15
|
"#{@name}_#{@logical_type.gsub("-", "_")}"
|
@@ -14,17 +18,6 @@ module TypedData
|
|
14
18
|
end
|
15
19
|
end
|
16
20
|
|
17
|
-
def coerce(value)
|
18
|
-
case @logical_type
|
19
|
-
when "date"
|
20
|
-
(Date.new(1970, 1, 1) + value).to_s
|
21
|
-
when "time-millis"
|
22
|
-
Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%T.%3N")
|
23
|
-
else
|
24
|
-
value
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
21
|
def primitive?
|
29
22
|
true
|
30
23
|
end
|
@@ -5,6 +5,10 @@ module TypedData
|
|
5
5
|
class LongType < Type
|
6
6
|
SUPPORTED_LOGICAL_TYPES = %w[time-micros timestamp-millis timestamp-micros]
|
7
7
|
|
8
|
+
def accept(visitor, value)
|
9
|
+
visitor.visit_long(self, @logical_type, value)
|
10
|
+
end
|
11
|
+
|
8
12
|
def to_s
|
9
13
|
if @logical_type
|
10
14
|
"#{@name}_#{@logical_type.gsub("-", "_")}"
|
@@ -13,19 +17,6 @@ module TypedData
|
|
13
17
|
end
|
14
18
|
end
|
15
19
|
|
16
|
-
def coerce(value)
|
17
|
-
case @logical_type
|
18
|
-
when "time-micros"
|
19
|
-
Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%T.%6N")
|
20
|
-
when "timestamp-millis"
|
21
|
-
Time.at(value / 1_000, value % 1_000 * 1_000).utc.strftime("%F %T.%3N")
|
22
|
-
when "timestamp-micros"
|
23
|
-
Time.at(value / 1_000_000, value % 1_000_000).utc.strftime("%F %T.%6N")
|
24
|
-
else
|
25
|
-
value
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
20
|
def primitive?
|
30
21
|
true
|
31
22
|
end
|
@@ -3,25 +3,27 @@
|
|
3
3
|
module TypedData
|
4
4
|
class Schema
|
5
5
|
class MapType < Type
|
6
|
+
attr_reader :element_type
|
7
|
+
|
6
8
|
# @param types [Array<String>]
|
7
9
|
def initialize(types)
|
8
|
-
@
|
10
|
+
@element_type = Schema.build_type(types)
|
11
|
+
end
|
12
|
+
|
13
|
+
def accept(visitor, value)
|
14
|
+
visitor.visit_map(self, value)
|
9
15
|
end
|
10
16
|
|
11
17
|
def to_s
|
12
|
-
"map_#{@
|
18
|
+
"map_#{@element_type}"
|
13
19
|
end
|
14
20
|
|
15
21
|
def primitive?
|
16
22
|
false
|
17
23
|
end
|
18
24
|
|
19
|
-
def find_match(value)
|
20
|
-
@type.match?(value) ? @type : @type.find_match(value)
|
21
|
-
end
|
22
|
-
|
23
25
|
def match?(value)
|
24
|
-
value.is_a?(Hash) && value.all? { |_, v| @
|
26
|
+
value.is_a?(Hash) && value.all? { |_, v| @element_type.match?(v) }
|
25
27
|
end
|
26
28
|
end
|
27
29
|
end
|
@@ -3,14 +3,19 @@
|
|
3
3
|
module TypedData
|
4
4
|
class Schema
|
5
5
|
class RecordType < Type
|
6
|
+
# @param name [String]
|
6
7
|
# @param fields [Array] an array of "fields" in an Avro schema
|
7
8
|
def initialize(name, fields)
|
8
9
|
@name = name
|
9
10
|
@field_to_type = fields.each_with_object({}) do |field, h|
|
10
|
-
h[field[
|
11
|
+
h[field[:name]] = Schema.build_type(field[:type])
|
11
12
|
end
|
12
13
|
end
|
13
14
|
|
15
|
+
def accept(visitor, value)
|
16
|
+
visitor.visit_record(self, value)
|
17
|
+
end
|
18
|
+
|
14
19
|
def primitive?
|
15
20
|
false
|
16
21
|
end
|
@@ -22,10 +27,6 @@ module TypedData
|
|
22
27
|
end
|
23
28
|
end
|
24
29
|
|
25
|
-
def find_match(value)
|
26
|
-
raise InvalidValue, %Q{the value #{value.inspect} doesn't match the type #{self}}
|
27
|
-
end
|
28
|
-
|
29
30
|
def match?(value)
|
30
31
|
value.is_a?(Hash) && value.all? { |k, v| @field_to_type[k]&.match?(v) }
|
31
32
|
end
|
@@ -8,22 +8,21 @@ module TypedData
|
|
8
8
|
def initialize(types)
|
9
9
|
@types = types.map(&Schema.method(:build_type))
|
10
10
|
@nullable_single = @types.size == 2 && @types.any? { |t| t.is_a?(NullType) }
|
11
|
-
@
|
11
|
+
@nullable_primitive_type = @types.find(&:primitive?) if @nullable_single
|
12
|
+
end
|
13
|
+
|
14
|
+
def accept(visitor, value)
|
15
|
+
visitor.visit_union(self, @types, value)
|
12
16
|
end
|
13
17
|
|
14
18
|
def to_s
|
15
|
-
@
|
19
|
+
@nullable_primitive_type&.to_s || "union_#{@types.map(&:to_s).join("_")}"
|
16
20
|
end
|
17
21
|
|
18
22
|
def primitive?
|
19
23
|
false
|
20
24
|
end
|
21
25
|
|
22
|
-
def find_match(value)
|
23
|
-
@types.find { |t| t.match?(value) } or
|
24
|
-
raise InvalidValue, %Q{the value #{value.inspect} doesn't match the type #{@types.map(&:to_s)}}
|
25
|
-
end
|
26
|
-
|
27
26
|
def match?(value)
|
28
27
|
@types.any? { |t| t.match?(value) }
|
29
28
|
end
|
data/lib/typed_data/version.rb
CHANGED
data/typed_data.gemspec
CHANGED
@@ -22,6 +22,8 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
23
|
spec.require_paths = ["lib"]
|
24
24
|
|
25
|
+
spec.add_runtime_dependency "thor"
|
26
|
+
|
25
27
|
spec.add_development_dependency "avro"
|
26
28
|
spec.add_development_dependency "google-cloud-bigquery"
|
27
29
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: typed_data
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- abicky
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-08-
|
11
|
+
date: 2021-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: avro
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -42,7 +56,8 @@ description: TypedData is a library that converts hash objects managed by an Avr
|
|
42
56
|
schema so that the objects can be loaded into BigQuery.
|
43
57
|
email:
|
44
58
|
- takeshi.arabiki@gmail.com
|
45
|
-
executables:
|
59
|
+
executables:
|
60
|
+
- typed-data
|
46
61
|
extensions: []
|
47
62
|
extra_rdoc_files: []
|
48
63
|
files:
|
@@ -56,8 +71,19 @@ files:
|
|
56
71
|
- Rakefile
|
57
72
|
- bin/console
|
58
73
|
- bin/setup
|
74
|
+
- example/convert.rb
|
75
|
+
- example/convert.sh
|
76
|
+
- example/converted_data.jsonl
|
77
|
+
- example/data.jsonl
|
78
|
+
- example/restore.rb
|
79
|
+
- example/restore.sh
|
80
|
+
- example/schema.avsc
|
81
|
+
- exe/typed-data
|
59
82
|
- lib/typed_data.rb
|
83
|
+
- lib/typed_data/cli.rb
|
60
84
|
- lib/typed_data/converter.rb
|
85
|
+
- lib/typed_data/key_formatter.rb
|
86
|
+
- lib/typed_data/restorer.rb
|
61
87
|
- lib/typed_data/schema.rb
|
62
88
|
- lib/typed_data/schema/array_type.rb
|
63
89
|
- lib/typed_data/schema/boolean_type.rb
|