structurize 2.16.6__py3-none-any.whl → 2.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +1 -0
- avrotize/_version.py +3 -3
- avrotize/avrotocsharp.py +74 -10
- avrotize/avrotojava.py +1130 -51
- avrotize/avrotopython.py +4 -2
- avrotize/commands.json +671 -53
- avrotize/common.py +6 -1
- avrotize/jsonstoavro.py +518 -49
- avrotize/structuretocpp.py +697 -0
- avrotize/structuretocsv.py +365 -0
- avrotize/structuretodatapackage.py +659 -0
- avrotize/structuretodb.py +1125 -0
- avrotize/structuretogo.py +720 -0
- avrotize/structuretographql.py +502 -0
- avrotize/structuretoiceberg.py +355 -0
- avrotize/structuretojava.py +853 -0
- avrotize/structuretokusto.py +639 -0
- avrotize/structuretomd.py +322 -0
- avrotize/structuretoproto.py +764 -0
- avrotize/structuretorust.py +714 -0
- avrotize/structuretoxsd.py +679 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/METADATA +1 -1
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/RECORD +27 -14
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/WHEEL +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/licenses/LICENSE +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/top_level.txt +0 -0
avrotize/jsonstoavro.py
CHANGED
|
@@ -87,6 +87,194 @@ class JsonToAvroConverter:
|
|
|
87
87
|
return True
|
|
88
88
|
return False
|
|
89
89
|
|
|
90
|
+
def detect_discriminated_union(self, json_type: dict):
    """
    Detect a discriminated union expressed as allOf with if/then conditionals.

    The pattern is recognized only when all of the following hold:
    - json_type is a dict that has an 'allOf' entry
    - its 'properties' declare a 'type' property carrying a non-empty 'enum'
    - at least one allOf item is a dict containing both 'if' and 'then'

    Parameters:
        json_type (dict): The JSON schema object to check

    Returns:
        list | None: The discriminator enum values if the pattern is detected, None otherwise
    """
    if not isinstance(json_type, dict) or 'allOf' not in json_type:
        return None

    # JSON Schema allows boolean sub-schemas, so neither 'properties' nor the
    # 'type' property definition can be assumed to be a dict.
    properties = json_type.get('properties', {})
    if not isinstance(properties, dict):
        return None
    discriminator_def = properties.get('type')
    if not isinstance(discriminator_def, dict) or 'enum' not in discriminator_def:
        return None

    discriminator_values = discriminator_def['enum']
    if not isinstance(discriminator_values, (list, tuple)) or not discriminator_values:
        return None

    allof_items = json_type['allOf']
    if not isinstance(allof_items, (list, tuple)):
        return None

    # The union is only "discriminated" if some allOf member is an if/then conditional
    has_if_then = any(
        isinstance(item, dict) and 'if' in item and 'then' in item
        for item in allof_items
    )
    return discriminator_values if has_if_then else None
|
|
124
|
+
|
|
125
|
+
def handle_inline_conditional_schema(self, json_type: dict) -> Tuple[bool, dict]:
    """
    Rewrite an inline if/then/else schema into a structure without conditionals.

    Two shapes of the 'if' clause are recognized, checked in this order:
    1. Type-based: if {properties: {type: {enum: [...]}}}
       - delegated to _convert_type_conditional_to_oneof
    2. Field presence: if {properties: {...}, required: [...]}
       - delegated to _merge_conditional_branches

    Parameters:
        json_type (dict): The JSON schema object to process

    Returns:
        Tuple[bool, dict]: (was_handled, modified_json_type); when the schema has
        no 'if' or the pattern is not recognized, (False, json_type) with the
        input returned unchanged
    """
    unhandled = (False, json_type)
    if not isinstance(json_type, dict) or 'if' not in json_type:
        return unhandled

    condition = json_type['if']
    # Both recognized patterns need a dict-shaped 'if' that constrains properties
    if not isinstance(condition, dict) or 'properties' not in condition:
        return unhandled

    on_match = json_type.get('then', {})
    on_miss = json_type.get('else')
    condition_props = condition['properties']

    # Pattern 1: the condition pins the 'type' property to an enum
    if 'type' in condition_props:
        discriminator = condition_props['type']
        if isinstance(discriminator, dict) and 'enum' in discriminator:
            return self._convert_type_conditional_to_oneof(json_type, condition, on_match, on_miss)

    # Pattern 2: the condition tests for required fields
    if 'required' in condition:
        return self._merge_conditional_branches(json_type, on_match, on_miss)

    return unhandled
|
|
169
|
+
|
|
170
|
+
def _convert_type_conditional_to_oneof(self, json_type: dict, if_clause: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
|
|
171
|
+
"""
|
|
172
|
+
Convert a type-based conditional schema to oneOf structure.
|
|
173
|
+
|
|
174
|
+
Example:
|
|
175
|
+
Input: {type: object, properties: {type: {enum: [image, host]}}, if: {...}, then: {...}, else: {...}}
|
|
176
|
+
Output: {oneOf: [then_merged_with_base, else_merged_with_base]}
|
|
177
|
+
"""
|
|
178
|
+
# Create a base type without the conditional parts
|
|
179
|
+
base_type = {}
|
|
180
|
+
for key, value in json_type.items():
|
|
181
|
+
if key not in ('if', 'then', 'else'):
|
|
182
|
+
base_type[key] = copy.deepcopy(value)
|
|
183
|
+
|
|
184
|
+
oneof_variants = []
|
|
185
|
+
|
|
186
|
+
# Process then clause
|
|
187
|
+
if then_clause:
|
|
188
|
+
then_variant = self._merge_conditional_branch(base_type, then_clause)
|
|
189
|
+
oneof_variants.append(then_variant)
|
|
190
|
+
|
|
191
|
+
# Process else clause (which may contain nested if/then/else)
|
|
192
|
+
if else_clause:
|
|
193
|
+
if 'if' in else_clause:
|
|
194
|
+
# Recursive handling of nested conditional
|
|
195
|
+
handled, processed_else = self.handle_inline_conditional_schema(else_clause)
|
|
196
|
+
if handled and 'oneOf' in processed_else:
|
|
197
|
+
# Flatten nested oneOf
|
|
198
|
+
for variant in processed_else['oneOf']:
|
|
199
|
+
merged = self._merge_conditional_branch(base_type, variant)
|
|
200
|
+
oneof_variants.append(merged)
|
|
201
|
+
else:
|
|
202
|
+
else_variant = self._merge_conditional_branch(base_type, else_clause)
|
|
203
|
+
oneof_variants.append(else_variant)
|
|
204
|
+
else:
|
|
205
|
+
else_variant = self._merge_conditional_branch(base_type, else_clause)
|
|
206
|
+
oneof_variants.append(else_variant)
|
|
207
|
+
|
|
208
|
+
if len(oneof_variants) > 0:
|
|
209
|
+
result = copy.deepcopy(base_type)
|
|
210
|
+
# Remove properties since they'll be in the variants
|
|
211
|
+
if 'properties' in result:
|
|
212
|
+
del result['properties']
|
|
213
|
+
if 'additionalProperties' in result:
|
|
214
|
+
del result['additionalProperties']
|
|
215
|
+
if 'required' in result:
|
|
216
|
+
del result['required']
|
|
217
|
+
result['oneOf'] = oneof_variants
|
|
218
|
+
return (True, result)
|
|
219
|
+
|
|
220
|
+
return (False, json_type)
|
|
221
|
+
|
|
222
|
+
def _merge_conditional_branches(self, json_type: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
|
|
223
|
+
"""
|
|
224
|
+
Merge conditional branches for field presence patterns.
|
|
225
|
+
Avro handles optional fields naturally, so we can merge all properties.
|
|
226
|
+
"""
|
|
227
|
+
result = {}
|
|
228
|
+
for key, value in json_type.items():
|
|
229
|
+
if key not in ('if', 'then', 'else'):
|
|
230
|
+
result[key] = copy.deepcopy(value)
|
|
231
|
+
|
|
232
|
+
# Merge properties from then clause
|
|
233
|
+
if then_clause and 'properties' in then_clause:
|
|
234
|
+
if 'properties' not in result:
|
|
235
|
+
result['properties'] = {}
|
|
236
|
+
for prop_name, prop_def in then_clause['properties'].items():
|
|
237
|
+
if prop_name not in result['properties']:
|
|
238
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
239
|
+
|
|
240
|
+
# Merge properties from else clause
|
|
241
|
+
if else_clause and 'properties' in else_clause:
|
|
242
|
+
if 'properties' not in result:
|
|
243
|
+
result['properties'] = {}
|
|
244
|
+
for prop_name, prop_def in else_clause['properties'].items():
|
|
245
|
+
if prop_name not in result['properties']:
|
|
246
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
247
|
+
|
|
248
|
+
return (True, result)
|
|
249
|
+
|
|
250
|
+
def _merge_conditional_branch(self, base: dict, branch: dict) -> dict:
|
|
251
|
+
"""Merge a conditional branch with the base type."""
|
|
252
|
+
result = copy.deepcopy(base)
|
|
253
|
+
|
|
254
|
+
if not branch:
|
|
255
|
+
return result
|
|
256
|
+
|
|
257
|
+
# Merge properties
|
|
258
|
+
if 'properties' in branch:
|
|
259
|
+
if 'properties' not in result:
|
|
260
|
+
result['properties'] = {}
|
|
261
|
+
for prop_name, prop_def in branch['properties'].items():
|
|
262
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
263
|
+
|
|
264
|
+
# Merge additionalProperties
|
|
265
|
+
if 'additionalProperties' in branch:
|
|
266
|
+
result['additionalProperties'] = branch['additionalProperties']
|
|
267
|
+
|
|
268
|
+
# Merge required (union of required fields)
|
|
269
|
+
if 'required' in branch:
|
|
270
|
+
if 'required' not in result:
|
|
271
|
+
result['required'] = []
|
|
272
|
+
for req in branch['required']:
|
|
273
|
+
if req not in result['required']:
|
|
274
|
+
result['required'].append(req)
|
|
275
|
+
|
|
276
|
+
return result
|
|
277
|
+
|
|
90
278
|
def flatten_union(self, type_list: list) -> list:
|
|
91
279
|
"""
|
|
92
280
|
Flatten the list of types in a union into a single list.
|
|
@@ -367,7 +555,13 @@ class JsonToAvroConverter:
|
|
|
367
555
|
"""
|
|
368
556
|
if isinstance(json_primitive, list):
|
|
369
557
|
if enum:
|
|
370
|
-
|
|
558
|
+
# Handle mixed-type enums properly using the dedicated helper
|
|
559
|
+
return self.create_enum_for_mixed_types(
|
|
560
|
+
field_name + '_1',
|
|
561
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
562
|
+
enum,
|
|
563
|
+
json_primitive
|
|
564
|
+
)
|
|
371
565
|
else:
|
|
372
566
|
union = []
|
|
373
567
|
for item in json_primitive:
|
|
@@ -558,12 +752,34 @@ class JsonToAvroConverter:
|
|
|
558
752
|
if isinstance(json_type, dict):
|
|
559
753
|
|
|
560
754
|
json_object_type = json_type.get('type')
|
|
755
|
+
# Check if the type is already an Avro schema (e.g., shared discriminator enum)
|
|
756
|
+
# This happens when a discriminated union property was pre-set with an Avro type
|
|
757
|
+
if isinstance(json_object_type, dict) and 'type' in json_object_type and json_object_type.get('type') in ['enum', 'record', 'fixed', 'array', 'map']:
|
|
758
|
+
return self.post_check_avro_type(dependencies, json_object_type)
|
|
561
759
|
if isinstance(json_object_type, list):
|
|
562
760
|
# if the 'type' is a list, we map it back to a string
|
|
563
761
|
# if the list has only one item or if the list has two items
|
|
564
762
|
# and one of them is 'null'
|
|
565
763
|
# otherwise, we will construct and inject a oneOf type
|
|
566
764
|
# and split the type
|
|
765
|
+
|
|
766
|
+
# Special case: if we have a mixed-type enum (e.g., type: ["string", "integer"] with enum),
|
|
767
|
+
# handle it directly here to avoid duplicate processing
|
|
768
|
+
if 'enum' in json_type and any(t in json_object_type for t in ['string', 'integer', 'int']):
|
|
769
|
+
has_null = 'null' in json_object_type
|
|
770
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
771
|
+
local_name + '_1',
|
|
772
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
773
|
+
json_type['enum'],
|
|
774
|
+
json_object_type
|
|
775
|
+
)
|
|
776
|
+
if 'description' in json_type and isinstance(avro_type, dict):
|
|
777
|
+
avro_type['doc'] = json_type['description']
|
|
778
|
+
elif 'description' in json_type and isinstance(avro_type, list):
|
|
779
|
+
# For unions, we can't set doc directly - it will be set on the field
|
|
780
|
+
pass
|
|
781
|
+
return self.post_check_avro_type(dependencies, avro_type)
|
|
782
|
+
|
|
567
783
|
if len(json_object_type) == 1:
|
|
568
784
|
json_object_type = json_object_type[0]
|
|
569
785
|
elif len(json_object_type) == 2 and 'null' in json_object_type:
|
|
@@ -583,18 +799,35 @@ class JsonToAvroConverter:
|
|
|
583
799
|
json_type['oneOf'] = oneof
|
|
584
800
|
|
|
585
801
|
if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
|
|
586
|
-
|
|
587
|
-
|
|
802
|
+
# Try to handle the conditional schema pattern
|
|
803
|
+
conditional_handled = False
|
|
588
804
|
if 'if' in json_type:
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
805
|
+
conditional_handled, json_type = self.handle_inline_conditional_schema(json_type)
|
|
806
|
+
|
|
807
|
+
if not conditional_handled:
|
|
808
|
+
# Only warn for patterns we can't handle
|
|
809
|
+
remaining_conditionals = []
|
|
810
|
+
if 'if' in json_type:
|
|
811
|
+
remaining_conditionals.append('if/then/else')
|
|
812
|
+
if 'dependentSchemas' in json_type:
|
|
813
|
+
remaining_conditionals.append('dependentSchemas')
|
|
814
|
+
if 'dependentRequired' in json_type:
|
|
815
|
+
remaining_conditionals.append('dependentRequired')
|
|
816
|
+
|
|
817
|
+
if remaining_conditionals:
|
|
818
|
+
print(
|
|
819
|
+
f'WARNING: Conditional schema pattern ({", ".join(remaining_conditionals)}) is not fully supported and will be simplified.')
|
|
820
|
+
|
|
821
|
+
if 'if' in json_type:
|
|
822
|
+
del json_type['if']
|
|
823
|
+
if 'then' in json_type:
|
|
824
|
+
del json_type['then']
|
|
825
|
+
if 'else' in json_type:
|
|
826
|
+
del json_type['else']
|
|
827
|
+
if 'dependentSchemas' in json_type:
|
|
828
|
+
del json_type['dependentSchemas']
|
|
829
|
+
if 'dependentRequired' in json_type:
|
|
830
|
+
del json_type['dependentRequired']
|
|
598
831
|
|
|
599
832
|
base_type = json_type.copy()
|
|
600
833
|
if 'oneOf' in base_type:
|
|
@@ -606,20 +839,101 @@ class JsonToAvroConverter:
|
|
|
606
839
|
json_types = []
|
|
607
840
|
|
|
608
841
|
if 'allOf' in json_type:
|
|
609
|
-
# if
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
842
|
+
# Check if this is a discriminated union pattern
|
|
843
|
+
discriminated_union_types = self.detect_discriminated_union(json_type)
|
|
844
|
+
|
|
845
|
+
if discriminated_union_types:
|
|
846
|
+
# Generate separate types for each discriminated variant
|
|
847
|
+
base_props = json_type.get('properties', {})
|
|
848
|
+
discriminator_field = 'type' # The discriminator field
|
|
849
|
+
discriminator_enum = base_props.get(discriminator_field, {}).get('enum', [])
|
|
850
|
+
|
|
851
|
+
# Create a shared enum type for the discriminator field that all variants will reference
|
|
852
|
+
shared_discriminator_enum = None
|
|
853
|
+
if discriminator_enum:
|
|
854
|
+
shared_discriminator_enum = self.create_enum_type(
|
|
855
|
+
discriminator_field,
|
|
856
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
857
|
+
discriminator_enum
|
|
858
|
+
)
|
|
859
|
+
|
|
860
|
+
for allof_item in json_type['allOf']:
|
|
861
|
+
if not (isinstance(allof_item, dict) and 'if' in allof_item and 'then' in allof_item):
|
|
862
|
+
continue
|
|
863
|
+
|
|
864
|
+
# Extract the discriminator value from the if clause
|
|
865
|
+
if_clause = allof_item['if']
|
|
866
|
+
discriminator_value = None
|
|
867
|
+
if (isinstance(if_clause, dict) and
|
|
868
|
+
'properties' in if_clause and
|
|
869
|
+
discriminator_field in if_clause['properties']):
|
|
870
|
+
disc_prop = if_clause['properties'][discriminator_field]
|
|
871
|
+
if 'enum' in disc_prop and len(disc_prop['enum']) > 0:
|
|
872
|
+
discriminator_value = disc_prop['enum'][0]
|
|
873
|
+
|
|
874
|
+
if not discriminator_value:
|
|
875
|
+
continue
|
|
876
|
+
|
|
877
|
+
# Resolve the then clause reference
|
|
878
|
+
then_clause = allof_item['then']
|
|
879
|
+
if isinstance(then_clause, dict) and '$ref' in then_clause:
|
|
880
|
+
resolved_type, _ = self.resolve_reference(then_clause, base_uri, json_schema)
|
|
881
|
+
|
|
882
|
+
# Create a new type combining base properties and resolved type
|
|
883
|
+
variant_type = copy.deepcopy(resolved_type)
|
|
884
|
+
|
|
885
|
+
# Set the variant type name to the discriminator value
|
|
886
|
+
variant_type['title'] = discriminator_value
|
|
887
|
+
|
|
888
|
+
# Preserve description from base type if variant doesn't have one
|
|
889
|
+
if 'description' not in variant_type and 'description' in base_type:
|
|
890
|
+
variant_type['description'] = base_type['description']
|
|
891
|
+
|
|
892
|
+
# Merge base properties into the variant
|
|
893
|
+
if 'properties' not in variant_type:
|
|
894
|
+
variant_type['properties'] = {}
|
|
895
|
+
|
|
896
|
+
for prop_name, prop_def in base_props.items():
|
|
897
|
+
if prop_name not in variant_type['properties']:
|
|
898
|
+
# For non-discriminator fields, copy the property definition
|
|
899
|
+
if prop_name != discriminator_field:
|
|
900
|
+
variant_type['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
901
|
+
|
|
902
|
+
# Set discriminator field to reference the shared enum type
|
|
903
|
+
if shared_discriminator_enum:
|
|
904
|
+
variant_type['properties'][discriminator_field] = {
|
|
905
|
+
'type': shared_discriminator_enum,
|
|
906
|
+
'default': discriminator_value,
|
|
907
|
+
'const': discriminator_value,
|
|
908
|
+
'discriminator': True
|
|
909
|
+
}
|
|
910
|
+
else:
|
|
911
|
+
# Fallback if no enum was found
|
|
912
|
+
variant_type['properties'][discriminator_field] = {
|
|
913
|
+
'type': 'string',
|
|
914
|
+
'default': discriminator_value,
|
|
915
|
+
'const': discriminator_value,
|
|
916
|
+
'discriminator': True
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
# Add union annotation to indicate this is part of a discriminated union
|
|
920
|
+
variant_type['union'] = record_name
|
|
921
|
+
|
|
922
|
+
json_types.append(variant_type)
|
|
923
|
+
else:
|
|
924
|
+
# Original allOf merging logic for non-discriminated unions
|
|
925
|
+
type_list = [copy.deepcopy(base_type)]
|
|
926
|
+
for allof_option in json_type['allOf']:
|
|
927
|
+
while isinstance(allof_option, dict) and '$ref' in allof_option:
|
|
928
|
+
resolved_json_type, resolved_schema = self.resolve_reference(
|
|
929
|
+
allof_option, base_uri, json_schema)
|
|
930
|
+
del allof_option['$ref']
|
|
931
|
+
allof_option = self.merge_json_schemas(
|
|
932
|
+
[allof_option, resolved_json_type])
|
|
933
|
+
type_list.append(copy.deepcopy(allof_option))
|
|
934
|
+
merged_type = self.merge_json_schemas(
|
|
935
|
+
type_list, intersect=False)
|
|
936
|
+
json_types.append(merged_type)
|
|
623
937
|
|
|
624
938
|
if 'oneOf' in json_type:
|
|
625
939
|
# if the json type is a oneOf, we create a type union of all types
|
|
@@ -692,8 +1006,13 @@ class JsonToAvroConverter:
|
|
|
692
1006
|
continue
|
|
693
1007
|
|
|
694
1008
|
subtype_deps: List[str] = []
|
|
695
|
-
|
|
696
|
-
|
|
1009
|
+
# Use title from discriminated union if available, otherwise generate numbered name
|
|
1010
|
+
if isinstance(json_type_option, dict) and 'title' in json_type_option:
|
|
1011
|
+
sub_field_name = avro_name(json_type_option['title'])
|
|
1012
|
+
elif not isinstance(json_type_option, dict) or not '$ref' in json_type_option:
|
|
1013
|
+
sub_field_name = avro_name(local_name + '_' + str(count))
|
|
1014
|
+
else:
|
|
1015
|
+
sub_field_name = None
|
|
697
1016
|
avro_subtype = self.json_type_to_avro_type(
|
|
698
1017
|
json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
699
1018
|
if not avro_subtype:
|
|
@@ -949,7 +1268,7 @@ class JsonToAvroConverter:
|
|
|
949
1268
|
[avro_type, self.create_array_type(generic_type())], avro_schema, '')
|
|
950
1269
|
elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
|
|
951
1270
|
avro_record_type = self.json_schema_object_to_avro_record(
|
|
952
|
-
local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack)
|
|
1271
|
+
local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
953
1272
|
if isinstance(avro_record_type, list):
|
|
954
1273
|
for record_entry in avro_record_type:
|
|
955
1274
|
self.lift_dependencies_from_type(
|
|
@@ -958,16 +1277,44 @@ class JsonToAvroConverter:
|
|
|
958
1277
|
'name', local_name) if isinstance(avro_type, dict) else local_name)
|
|
959
1278
|
self.lift_dependencies_from_type(
|
|
960
1279
|
avro_type, dependencies)
|
|
961
|
-
elif 'enum' in json_type
|
|
962
|
-
#
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
1280
|
+
elif 'enum' in json_type:
|
|
1281
|
+
# Handle enums with proper type handling for mixed string/int enums
|
|
1282
|
+
enum_values = json_type['enum']
|
|
1283
|
+
schema_type = json_type.get('type', 'string')
|
|
1284
|
+
|
|
1285
|
+
# For pure string enums with valid symbols, use simple enum without suffix
|
|
1286
|
+
string_values = [v for v in enum_values if isinstance(v, str) and v]
|
|
1287
|
+
int_values = [v for v in enum_values if isinstance(v, int)]
|
|
1288
|
+
|
|
1289
|
+
if not int_values and string_values:
|
|
1290
|
+
# Pure string enum
|
|
1291
|
+
if not self.enum_symbols_need_string_fallback(string_values):
|
|
1292
|
+
# Simple case: valid symbols, just create enum
|
|
1293
|
+
avro_type = self.create_enum_type(
|
|
1294
|
+
local_name,
|
|
1295
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1296
|
+
string_values
|
|
1297
|
+
)
|
|
1298
|
+
else:
|
|
1299
|
+
# Symbols need prefixing, use helper with string fallback
|
|
1300
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
1301
|
+
local_name,
|
|
1302
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1303
|
+
enum_values,
|
|
1304
|
+
schema_type
|
|
1305
|
+
)
|
|
1306
|
+
# Register any embedded enum types in the union
|
|
1307
|
+
self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
|
|
1308
|
+
else:
|
|
1309
|
+
# Mixed or int-only enum, use helper
|
|
1310
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
1311
|
+
local_name + '_1',
|
|
1312
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1313
|
+
enum_values,
|
|
1314
|
+
schema_type
|
|
1315
|
+
)
|
|
1316
|
+
# Register any embedded enum types in the union
|
|
1317
|
+
self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
|
|
971
1318
|
else:
|
|
972
1319
|
avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
|
|
973
1320
|
'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
|
|
@@ -1028,6 +1375,23 @@ class JsonToAvroConverter:
|
|
|
1028
1375
|
else:
|
|
1029
1376
|
return True
|
|
1030
1377
|
|
|
1378
|
+
def register_embedded_types_in_union(self, avro_type, avro_schema, dependencies):
    """
    Register inline named types (enum, record, fixed) that appear as union members.

    Only list-typed avro_type values are processed; anything else is ignored.
    For each member that register_type accepts, the inline definition in the
    union is replaced in place by its qualified name, and that name is appended
    to dependencies if it is not already listed.
    """
    if not isinstance(avro_type, list):
        return

    named_kinds = ('enum', 'record', 'fixed')
    for position, member in enumerate(avro_type):
        if not isinstance(member, dict) or member.get('type') not in named_kinds:
            continue
        if not self.register_type(avro_schema, member):
            # Registration was declined; leave the inline definition untouched
            continue
        qualified = self.get_qualified_name(member)
        avro_type[position] = qualified
        if qualified not in dependencies:
            dependencies.append(qualified)
|
|
1394
|
+
|
|
1031
1395
|
def has_composition_keywords(self, json_object: dict) -> bool:
|
|
1032
1396
|
"""Check if the JSON object has any of the combining keywords: allOf, oneOf, anyOf."""
|
|
1033
1397
|
return isinstance(json_object, dict) and ('allOf' in json_object or 'oneOf' in json_object or 'anyOf' in json_object)
|
|
@@ -1089,6 +1453,102 @@ class JsonToAvroConverter:
|
|
|
1089
1453
|
'symbols': [avro_name(s) for s in symbols]
|
|
1090
1454
|
}
|
|
1091
1455
|
|
|
1456
|
+
def enum_symbols_need_string_fallback(self, symbols: list) -> bool:
    """
    Report whether avro_name() would alter any of the given enum symbols.

    Only non-empty strings are examined. A True result means at least one
    symbol differs from its avro_name() form, so the original JSON value would
    no longer match the generated enum symbol and a string fallback is needed
    in the union.
    """
    return any(
        isinstance(symbol, str) and symbol and avro_name(symbol) != symbol
        for symbol in symbols
    )
|
|
1467
|
+
|
|
1468
|
+
def create_enum_for_mixed_types(self, name: str, namespace: str, enum_values: list, json_types: list) -> dict | list:
|
|
1469
|
+
"""
|
|
1470
|
+
Create an Avro type for enums with mixed or special type requirements.
|
|
1471
|
+
|
|
1472
|
+
Handles:
|
|
1473
|
+
- Pure string enum with valid symbols -> enum
|
|
1474
|
+
- Pure string enum with prefixed symbols -> [enum, string]
|
|
1475
|
+
- Pure int enum -> int (with doc hint about allowed values)
|
|
1476
|
+
- Mixed string/int enum -> [enum, string, int]
|
|
1477
|
+
|
|
1478
|
+
Args:
|
|
1479
|
+
name: The enum type name
|
|
1480
|
+
namespace: The namespace for the enum
|
|
1481
|
+
enum_values: The list of enum values from JSON Schema
|
|
1482
|
+
json_types: The JSON Schema type(s), e.g., "string", "integer", or ["string", "integer"]
|
|
1483
|
+
|
|
1484
|
+
Returns:
|
|
1485
|
+
Avro type: either an enum dict, a primitive string, or a union list
|
|
1486
|
+
"""
|
|
1487
|
+
if not isinstance(json_types, list):
|
|
1488
|
+
json_types = [json_types]
|
|
1489
|
+
|
|
1490
|
+
# Normalize type names
|
|
1491
|
+
has_string = 'string' in json_types
|
|
1492
|
+
has_int = 'integer' in json_types or 'int' in json_types
|
|
1493
|
+
has_null = 'null' in json_types
|
|
1494
|
+
|
|
1495
|
+
# Separate string and int enum values
|
|
1496
|
+
string_values = [v for v in enum_values if isinstance(v, str) and v]
|
|
1497
|
+
int_values = [v for v in enum_values if isinstance(v, int)]
|
|
1498
|
+
|
|
1499
|
+
# Pure integer enum case
|
|
1500
|
+
if has_int and not has_string and not string_values:
|
|
1501
|
+
# Just use int - no enum type needed for pure int enums
|
|
1502
|
+
# The doc will contain the allowed values hint
|
|
1503
|
+
result = 'int'
|
|
1504
|
+
if has_null:
|
|
1505
|
+
result = ['null', result]
|
|
1506
|
+
return result
|
|
1507
|
+
|
|
1508
|
+
# Build the enum from string values (or string representations of all values)
|
|
1509
|
+
if string_values:
|
|
1510
|
+
enum_symbols = list(set(string_values))
|
|
1511
|
+
else:
|
|
1512
|
+
# No string values but has_string type - shouldn't happen normally
|
|
1513
|
+
enum_symbols = []
|
|
1514
|
+
|
|
1515
|
+
if not enum_symbols:
|
|
1516
|
+
# No valid enum symbols, fall back to primitive types
|
|
1517
|
+
union = []
|
|
1518
|
+
if has_null:
|
|
1519
|
+
union.append('null')
|
|
1520
|
+
if has_string:
|
|
1521
|
+
union.append('string')
|
|
1522
|
+
if has_int:
|
|
1523
|
+
union.append('int')
|
|
1524
|
+
return union if len(union) > 1 else (union[0] if union else 'string')
|
|
1525
|
+
|
|
1526
|
+
# Create the enum type
|
|
1527
|
+
avro_enum = self.create_enum_type(name, namespace, enum_symbols)
|
|
1528
|
+
|
|
1529
|
+
# Determine if we need additional types in union
|
|
1530
|
+
needs_string_fallback = self.enum_symbols_need_string_fallback(enum_symbols)
|
|
1531
|
+
|
|
1532
|
+
# Build the union
|
|
1533
|
+
union = []
|
|
1534
|
+
if has_null:
|
|
1535
|
+
union.append('null')
|
|
1536
|
+
union.append(avro_enum)
|
|
1537
|
+
|
|
1538
|
+
# Add string fallback if symbols were prefixed OR if this is a mixed type enum
|
|
1539
|
+
if needs_string_fallback or has_int:
|
|
1540
|
+
union.append('string')
|
|
1541
|
+
|
|
1542
|
+
# Add int if the schema allows integers
|
|
1543
|
+
if has_int:
|
|
1544
|
+
union.append('int')
|
|
1545
|
+
|
|
1546
|
+
# Return enum directly if no union needed
|
|
1547
|
+
if len(union) == 1:
|
|
1548
|
+
return union[0]
|
|
1549
|
+
|
|
1550
|
+
return union
|
|
1551
|
+
|
|
1092
1552
|
def create_array_type(self, items: list | dict | str) -> dict:
|
|
1093
1553
|
"""Create an Avro array type."""
|
|
1094
1554
|
return {
|
|
@@ -1141,7 +1601,7 @@ class JsonToAvroConverter:
|
|
|
1141
1601
|
"""Get the qualified name of an Avro type."""
|
|
1142
1602
|
return self.compose_namespace(avro_type.get('namespace', ''), avro_type.get('name', ''))
|
|
1143
1603
|
|
|
1144
|
-
def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list) -> dict | list | str | None:
|
|
1604
|
+
def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth: int = 1) -> dict | list | str | None:
|
|
1145
1605
|
"""Convert a JSON schema object declaration to an Avro record."""
|
|
1146
1606
|
dependencies: List[str] = []
|
|
1147
1607
|
avro_type: list | dict | str = {}
|
|
@@ -1150,7 +1610,7 @@ class JsonToAvroConverter:
|
|
|
1150
1610
|
if self.has_composition_keywords(json_object):
|
|
1151
1611
|
# we will merge allOf, oneOf, anyOf into a union record type
|
|
1152
1612
|
type = self.json_type_to_avro_type(
|
|
1153
|
-
json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
|
|
1613
|
+
json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1154
1614
|
if isinstance(type, str):
|
|
1155
1615
|
# we are skipping references and primitives
|
|
1156
1616
|
return None
|
|
@@ -1186,7 +1646,7 @@ class JsonToAvroConverter:
|
|
|
1186
1646
|
f'WARN: Standalone array type {name} will be wrapped in a record')
|
|
1187
1647
|
deps: List[str] = []
|
|
1188
1648
|
array_type = self.json_type_to_avro_type(json_object, name, avro_name(
|
|
1189
|
-
name), namespace, deps, json_schema, base_uri, avro_schema, record_stack)
|
|
1649
|
+
name), namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1190
1650
|
avro_array = self.create_wrapper_record(
|
|
1191
1651
|
avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
|
|
1192
1652
|
self.merge_description_into_doc(json_object, avro_array)
|
|
@@ -1206,6 +1666,9 @@ class JsonToAvroConverter:
|
|
|
1206
1666
|
namespace, record_stack[-1] + "_types")
|
|
1207
1667
|
# at this point we have a record type
|
|
1208
1668
|
avro_record = self.create_avro_record(record_name, namespace, [])
|
|
1669
|
+
# Check if this record has a 'union' annotation from discriminated union pattern
|
|
1670
|
+
if 'union' in json_object:
|
|
1671
|
+
avro_record['union'] = json_object['union']
|
|
1209
1672
|
# we need to prevent circular dependencies, so we will maintain a stack of the in-progress
|
|
1210
1673
|
# records and will resolve the cycle as we go. if this record is already in the stack, we will
|
|
1211
1674
|
# just return a reference to a record that contains this record
|
|
@@ -1236,6 +1699,7 @@ class JsonToAvroConverter:
|
|
|
1236
1699
|
const = None
|
|
1237
1700
|
default = None
|
|
1238
1701
|
description = None
|
|
1702
|
+
discriminator = None
|
|
1239
1703
|
for json_field_type in json_field_types:
|
|
1240
1704
|
# skip fields with an bad or empty type
|
|
1241
1705
|
if not isinstance(json_field_type, dict):
|
|
@@ -1249,9 +1713,11 @@ class JsonToAvroConverter:
|
|
|
1249
1713
|
default = default_value
|
|
1250
1714
|
# get the description from the field type
|
|
1251
1715
|
description = json_field_type.get('description', description)
|
|
1716
|
+
# check for discriminator annotation
|
|
1717
|
+
discriminator = json_field_type.get('discriminator', discriminator)
|
|
1252
1718
|
# convert the JSON-type field to an Avro-type field
|
|
1253
1719
|
avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
|
|
1254
|
-
json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack))
|
|
1720
|
+
json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
|
|
1255
1721
|
if isinstance(avro_field_type, list):
|
|
1256
1722
|
avro_field_type = self.flatten_union(
|
|
1257
1723
|
avro_field_type)
|
|
@@ -1289,6 +1755,8 @@ class JsonToAvroConverter:
|
|
|
1289
1755
|
avro_field['default'] = default
|
|
1290
1756
|
if description:
|
|
1291
1757
|
avro_field['doc'] = description
|
|
1758
|
+
if discriminator:
|
|
1759
|
+
avro_field['discriminator'] = discriminator
|
|
1292
1760
|
field_type_list.append(avro_field_type)
|
|
1293
1761
|
avro_field_ref = {
|
|
1294
1762
|
'name': avro_name(field_name),
|
|
@@ -1310,7 +1778,7 @@ class JsonToAvroConverter:
|
|
|
1310
1778
|
# we don't have any fields, but we have an array type, so we create a record with an 'items' field
|
|
1311
1779
|
avro_record = self.create_array_type(
|
|
1312
1780
|
self.json_type_to_avro_type(
|
|
1313
|
-
json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
|
|
1781
|
+
json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1314
1782
|
if 'items' in json_object
|
|
1315
1783
|
else generic_type())
|
|
1316
1784
|
else:
|
|
@@ -1324,7 +1792,7 @@ class JsonToAvroConverter:
|
|
|
1324
1792
|
for pattern_name, props in pattern_props.items():
|
|
1325
1793
|
deps = []
|
|
1326
1794
|
prop_type = self.ensure_type(self.json_type_to_avro_type(
|
|
1327
|
-
props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack))
|
|
1795
|
+
props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
|
|
1328
1796
|
if self.is_standalone_avro_type(prop_type):
|
|
1329
1797
|
self.lift_dependencies_from_type(prop_type, deps)
|
|
1330
1798
|
self.set_avro_type_value(
|
|
@@ -1350,7 +1818,7 @@ class JsonToAvroConverter:
|
|
|
1350
1818
|
additional_props = json_object['additionalProperties']
|
|
1351
1819
|
deps = []
|
|
1352
1820
|
values_type = self.json_type_to_avro_type(
|
|
1353
|
-
additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
|
|
1821
|
+
additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1354
1822
|
if self.is_standalone_avro_type(values_type):
|
|
1355
1823
|
self.lift_dependencies_from_type(values_type, deps)
|
|
1356
1824
|
self.set_avro_type_value(
|
|
@@ -1457,11 +1925,12 @@ class JsonToAvroConverter:
|
|
|
1457
1925
|
if 'unmerged_types' in merge_result:
|
|
1458
1926
|
del merge_result['unmerged_types']
|
|
1459
1927
|
if isinstance(merge_result, list):
|
|
1460
|
-
# unmerged field containers have fields
|
|
1461
|
-
|
|
1462
|
-
type, 'name', type['name'] + '_item')
|
|
1928
|
+
# unmerged field containers have fields - wrap the union in a record
|
|
1929
|
+
# Keep the original name since references expect it
|
|
1463
1930
|
self.set_avro_type_value(
|
|
1464
1931
|
type, 'fields', [{'name': 'value', 'type': merge_result}])
|
|
1932
|
+
if 'unmerged_types' in type:
|
|
1933
|
+
del type['unmerged_types']
|
|
1465
1934
|
merge_result = copy.deepcopy(type)
|
|
1466
1935
|
set_schema_node(find_fn, merge_result, avro_schema)
|
|
1467
1936
|
|