structurize 2.16.5__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/jsonstoavro.py CHANGED
@@ -87,6 +87,194 @@ class JsonToAvroConverter:
87
87
  return True
88
88
  return False
89
89
 
90
+ def detect_discriminated_union(self, json_type: dict):
91
+ """
92
+ Detect if a JSON schema is a discriminated union pattern using allOf with if/then conditionals.
93
+
94
+ A discriminated union pattern consists of:
95
+ - A base schema with a discriminator field (usually 'type') with an enum
96
+ - An allOf array containing if/then conditionals that add fields based on discriminator value
97
+
98
+ Parameters:
99
+ json_type (dict): The JSON schema object to check
100
+
101
+ Returns:
102
+ list | None: List of discriminator values if pattern detected, None otherwise
103
+ """
104
+ if not isinstance(json_type, dict) or 'allOf' not in json_type:
105
+ return None
106
+
107
+ # Check for discriminator field with enum values
108
+ properties = json_type.get('properties', {})
109
+ if 'type' not in properties or 'enum' not in properties.get('type', {}):
110
+ return None
111
+
112
+ discriminator_values = properties['type']['enum']
113
+
114
+ # Check if allOf contains if/then conditionals
115
+ has_if_then = any(
116
+ isinstance(item, dict) and 'if' in item and 'then' in item
117
+ for item in json_type['allOf']
118
+ )
119
+
120
+ if has_if_then and len(discriminator_values) > 0:
121
+ return discriminator_values
122
+
123
+ return None
124
+
125
def handle_inline_conditional_schema(self, json_type: dict) -> Tuple[bool, dict]:
    """
    Rewrite an inline if/then/else schema into a structure without conditionals.

    Two shapes of the 'if' clause are recognized:
    1. Type-based: if {properties: {type: {enum: [X]}}}
       - delegated to _convert_type_conditional_to_oneof
    2. Field presence: if {properties: {...}, required: [...]}
       - delegated to _merge_conditional_branches

    Any other input is handed back untouched.

    Parameters:
    json_type (dict): The JSON schema object to process

    Returns:
    Tuple[bool, dict]: (was_handled, modified_json_type); when was_handled is
    False the second element is the object that was passed in
    """
    if not isinstance(json_type, dict) or 'if' not in json_type:
        return (False, json_type)

    condition = json_type.get('if', {})
    on_match = json_type.get('then', {})
    on_miss = json_type.get('else', None)

    # Both supported patterns need an object-shaped 'if' that constrains properties.
    if not isinstance(condition, dict) or 'properties' not in condition:
        return (False, json_type)

    constrained = condition['properties']
    if 'type' in constrained:
        selector = constrained['type']
        if isinstance(selector, dict) and 'enum' in selector:
            # Type-based conditional: becomes a oneOf of variants.
            return self._convert_type_conditional_to_oneof(json_type, condition, on_match, on_miss)

    if 'required' in condition:
        # Field presence conditional: all branches are folded together.
        return self._merge_conditional_branches(json_type, on_match, on_miss)

    # Unsupported pattern
    return (False, json_type)
169
+
170
+ def _convert_type_conditional_to_oneof(self, json_type: dict, if_clause: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
171
+ """
172
+ Convert a type-based conditional schema to oneOf structure.
173
+
174
+ Example:
175
+ Input: {type: object, properties: {type: {enum: [image, host]}}, if: {...}, then: {...}, else: {...}}
176
+ Output: {oneOf: [then_merged_with_base, else_merged_with_base]}
177
+ """
178
+ # Create a base type without the conditional parts
179
+ base_type = {}
180
+ for key, value in json_type.items():
181
+ if key not in ('if', 'then', 'else'):
182
+ base_type[key] = copy.deepcopy(value)
183
+
184
+ oneof_variants = []
185
+
186
+ # Process then clause
187
+ if then_clause:
188
+ then_variant = self._merge_conditional_branch(base_type, then_clause)
189
+ oneof_variants.append(then_variant)
190
+
191
+ # Process else clause (which may contain nested if/then/else)
192
+ if else_clause:
193
+ if 'if' in else_clause:
194
+ # Recursive handling of nested conditional
195
+ handled, processed_else = self.handle_inline_conditional_schema(else_clause)
196
+ if handled and 'oneOf' in processed_else:
197
+ # Flatten nested oneOf
198
+ for variant in processed_else['oneOf']:
199
+ merged = self._merge_conditional_branch(base_type, variant)
200
+ oneof_variants.append(merged)
201
+ else:
202
+ else_variant = self._merge_conditional_branch(base_type, else_clause)
203
+ oneof_variants.append(else_variant)
204
+ else:
205
+ else_variant = self._merge_conditional_branch(base_type, else_clause)
206
+ oneof_variants.append(else_variant)
207
+
208
+ if len(oneof_variants) > 0:
209
+ result = copy.deepcopy(base_type)
210
+ # Remove properties since they'll be in the variants
211
+ if 'properties' in result:
212
+ del result['properties']
213
+ if 'additionalProperties' in result:
214
+ del result['additionalProperties']
215
+ if 'required' in result:
216
+ del result['required']
217
+ result['oneOf'] = oneof_variants
218
+ return (True, result)
219
+
220
+ return (False, json_type)
221
+
222
+ def _merge_conditional_branches(self, json_type: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
223
+ """
224
+ Merge conditional branches for field presence patterns.
225
+ Avro handles optional fields naturally, so we can merge all properties.
226
+ """
227
+ result = {}
228
+ for key, value in json_type.items():
229
+ if key not in ('if', 'then', 'else'):
230
+ result[key] = copy.deepcopy(value)
231
+
232
+ # Merge properties from then clause
233
+ if then_clause and 'properties' in then_clause:
234
+ if 'properties' not in result:
235
+ result['properties'] = {}
236
+ for prop_name, prop_def in then_clause['properties'].items():
237
+ if prop_name not in result['properties']:
238
+ result['properties'][prop_name] = copy.deepcopy(prop_def)
239
+
240
+ # Merge properties from else clause
241
+ if else_clause and 'properties' in else_clause:
242
+ if 'properties' not in result:
243
+ result['properties'] = {}
244
+ for prop_name, prop_def in else_clause['properties'].items():
245
+ if prop_name not in result['properties']:
246
+ result['properties'][prop_name] = copy.deepcopy(prop_def)
247
+
248
+ return (True, result)
249
+
250
def _merge_conditional_branch(self, base: dict, branch: dict) -> dict:
    """
    Merge a conditional branch with the base type.

    Works on a deep copy of base; neither argument is modified and the result
    shares no mutable state with either of them.

    Parameters:
    base (dict): The schema without its conditional keywords
    branch (dict): The 'then'/'else' sub-schema to layer on top; may be empty

    Returns:
    dict: base with the branch's 'properties' (branch wins on a name clash),
    'additionalProperties' (branch replaces base) and 'required' (union,
    base order first) applied
    """
    result = copy.deepcopy(base)

    if not branch:
        return result

    # Merge properties
    if 'properties' in branch:
        merged_props = result.setdefault('properties', {})
        for prop_name, prop_def in branch['properties'].items():
            merged_props[prop_name] = copy.deepcopy(prop_def)

    # Merge additionalProperties. Deep-copied like everything else so that a
    # later in-place edit of the result cannot leak back into the source schema.
    if 'additionalProperties' in branch:
        result['additionalProperties'] = copy.deepcopy(branch['additionalProperties'])

    # Merge required (union of required fields)
    if 'required' in branch:
        required = result.setdefault('required', [])
        for req in branch['required']:
            if req not in required:
                required.append(req)

    return result
277
+
90
278
  def flatten_union(self, type_list: list) -> list:
91
279
  """
92
280
  Flatten the list of types in a union into a single list.
@@ -367,7 +555,13 @@ class JsonToAvroConverter:
367
555
  """
368
556
  if isinstance(json_primitive, list):
369
557
  if enum:
370
- json_primitive = 'string'
558
+ # Handle mixed-type enums properly using the dedicated helper
559
+ return self.create_enum_for_mixed_types(
560
+ field_name + '_1',
561
+ self.compose_namespace(namespace, record_name + '_types'),
562
+ enum,
563
+ json_primitive
564
+ )
371
565
  else:
372
566
  union = []
373
567
  for item in json_primitive:
@@ -558,12 +752,34 @@ class JsonToAvroConverter:
558
752
  if isinstance(json_type, dict):
559
753
 
560
754
  json_object_type = json_type.get('type')
755
+ # Check if the type is already an Avro schema (e.g., shared discriminator enum)
756
+ # This happens when a discriminated union property was pre-set with an Avro type
757
+ if isinstance(json_object_type, dict) and 'type' in json_object_type and json_object_type.get('type') in ['enum', 'record', 'fixed', 'array', 'map']:
758
+ return self.post_check_avro_type(dependencies, json_object_type)
561
759
  if isinstance(json_object_type, list):
562
760
  # if the 'type' is a list, we map it back to a string
563
761
  # if the list has only one item or if the list has two items
564
762
  # and one of them is 'null'
565
763
  # otherwise, we will construct and inject a oneOf type
566
764
  # and split the type
765
+
766
+ # Special case: if we have a mixed-type enum (e.g., type: ["string", "integer"] with enum),
767
+ # handle it directly here to avoid duplicate processing
768
+ if 'enum' in json_type and any(t in json_object_type for t in ['string', 'integer', 'int']):
769
+ has_null = 'null' in json_object_type
770
+ avro_type = self.create_enum_for_mixed_types(
771
+ local_name + '_1',
772
+ self.compose_namespace(namespace, record_name + '_types'),
773
+ json_type['enum'],
774
+ json_object_type
775
+ )
776
+ if 'description' in json_type and isinstance(avro_type, dict):
777
+ avro_type['doc'] = json_type['description']
778
+ elif 'description' in json_type and isinstance(avro_type, list):
779
+ # For unions, we can't set doc directly - it will be set on the field
780
+ pass
781
+ return self.post_check_avro_type(dependencies, avro_type)
782
+
567
783
  if len(json_object_type) == 1:
568
784
  json_object_type = json_object_type[0]
569
785
  elif len(json_object_type) == 2 and 'null' in json_object_type:
@@ -583,18 +799,35 @@ class JsonToAvroConverter:
583
799
  json_type['oneOf'] = oneof
584
800
 
585
801
  if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
586
- print(
587
- 'WARNING: Conditional schema is not supported and will be ignored.')
802
+ # Try to handle the conditional schema pattern
803
+ conditional_handled = False
588
804
  if 'if' in json_type:
589
- del json_type['if']
590
- if 'then' in json_type:
591
- del json_type['then']
592
- if 'else' in json_type:
593
- del json_type['else']
594
- if 'dependentSchemas' in json_type:
595
- del json_type['dependentSchemas']
596
- if 'dependentRequired' in json_type:
597
- del json_type['dependentRequired']
805
+ conditional_handled, json_type = self.handle_inline_conditional_schema(json_type)
806
+
807
+ if not conditional_handled:
808
+ # Only warn for patterns we can't handle
809
+ remaining_conditionals = []
810
+ if 'if' in json_type:
811
+ remaining_conditionals.append('if/then/else')
812
+ if 'dependentSchemas' in json_type:
813
+ remaining_conditionals.append('dependentSchemas')
814
+ if 'dependentRequired' in json_type:
815
+ remaining_conditionals.append('dependentRequired')
816
+
817
+ if remaining_conditionals:
818
+ print(
819
+ f'WARNING: Conditional schema pattern ({", ".join(remaining_conditionals)}) is not fully supported and will be simplified.')
820
+
821
+ if 'if' in json_type:
822
+ del json_type['if']
823
+ if 'then' in json_type:
824
+ del json_type['then']
825
+ if 'else' in json_type:
826
+ del json_type['else']
827
+ if 'dependentSchemas' in json_type:
828
+ del json_type['dependentSchemas']
829
+ if 'dependentRequired' in json_type:
830
+ del json_type['dependentRequired']
598
831
 
599
832
  base_type = json_type.copy()
600
833
  if 'oneOf' in base_type:
@@ -606,20 +839,101 @@ class JsonToAvroConverter:
606
839
  json_types = []
607
840
 
608
841
  if 'allOf' in json_type:
609
- # if the json type is an allOf, we merge all types into one
610
- # this may be lossy if aspects of the types overlap but differ
611
- type_list = [copy.deepcopy(base_type)]
612
- for allof_option in json_type['allOf']:
613
- while isinstance(allof_option, dict) and '$ref' in allof_option:
614
- resolved_json_type, resolved_schema = self.resolve_reference(
615
- allof_option, base_uri, json_schema)
616
- del allof_option['$ref']
617
- allof_option = self.merge_json_schemas(
618
- [allof_option, resolved_json_type])
619
- type_list.append(copy.deepcopy(allof_option))
620
- merged_type = self.merge_json_schemas(
621
- type_list, intersect=False)
622
- json_types.append(merged_type)
842
+ # Check if this is a discriminated union pattern
843
+ discriminated_union_types = self.detect_discriminated_union(json_type)
844
+
845
+ if discriminated_union_types:
846
+ # Generate separate types for each discriminated variant
847
+ base_props = json_type.get('properties', {})
848
+ discriminator_field = 'type' # The discriminator field
849
+ discriminator_enum = base_props.get(discriminator_field, {}).get('enum', [])
850
+
851
+ # Create a shared enum type for the discriminator field that all variants will reference
852
+ shared_discriminator_enum = None
853
+ if discriminator_enum:
854
+ shared_discriminator_enum = self.create_enum_type(
855
+ discriminator_field,
856
+ self.compose_namespace(namespace, record_name + '_types'),
857
+ discriminator_enum
858
+ )
859
+
860
+ for allof_item in json_type['allOf']:
861
+ if not (isinstance(allof_item, dict) and 'if' in allof_item and 'then' in allof_item):
862
+ continue
863
+
864
+ # Extract the discriminator value from the if clause
865
+ if_clause = allof_item['if']
866
+ discriminator_value = None
867
+ if (isinstance(if_clause, dict) and
868
+ 'properties' in if_clause and
869
+ discriminator_field in if_clause['properties']):
870
+ disc_prop = if_clause['properties'][discriminator_field]
871
+ if 'enum' in disc_prop and len(disc_prop['enum']) > 0:
872
+ discriminator_value = disc_prop['enum'][0]
873
+
874
+ if not discriminator_value:
875
+ continue
876
+
877
+ # Resolve the then clause reference
878
+ then_clause = allof_item['then']
879
+ if isinstance(then_clause, dict) and '$ref' in then_clause:
880
+ resolved_type, _ = self.resolve_reference(then_clause, base_uri, json_schema)
881
+
882
+ # Create a new type combining base properties and resolved type
883
+ variant_type = copy.deepcopy(resolved_type)
884
+
885
+ # Set the variant type name to the discriminator value
886
+ variant_type['title'] = discriminator_value
887
+
888
+ # Preserve description from base type if variant doesn't have one
889
+ if 'description' not in variant_type and 'description' in base_type:
890
+ variant_type['description'] = base_type['description']
891
+
892
+ # Merge base properties into the variant
893
+ if 'properties' not in variant_type:
894
+ variant_type['properties'] = {}
895
+
896
+ for prop_name, prop_def in base_props.items():
897
+ if prop_name not in variant_type['properties']:
898
+ # For non-discriminator fields, copy the property definition
899
+ if prop_name != discriminator_field:
900
+ variant_type['properties'][prop_name] = copy.deepcopy(prop_def)
901
+
902
+ # Set discriminator field to reference the shared enum type
903
+ if shared_discriminator_enum:
904
+ variant_type['properties'][discriminator_field] = {
905
+ 'type': shared_discriminator_enum,
906
+ 'default': discriminator_value,
907
+ 'const': discriminator_value,
908
+ 'discriminator': True
909
+ }
910
+ else:
911
+ # Fallback if no enum was found
912
+ variant_type['properties'][discriminator_field] = {
913
+ 'type': 'string',
914
+ 'default': discriminator_value,
915
+ 'const': discriminator_value,
916
+ 'discriminator': True
917
+ }
918
+
919
+ # Add union annotation to indicate this is part of a discriminated union
920
+ variant_type['union'] = record_name
921
+
922
+ json_types.append(variant_type)
923
+ else:
924
+ # Original allOf merging logic for non-discriminated unions
925
+ type_list = [copy.deepcopy(base_type)]
926
+ for allof_option in json_type['allOf']:
927
+ while isinstance(allof_option, dict) and '$ref' in allof_option:
928
+ resolved_json_type, resolved_schema = self.resolve_reference(
929
+ allof_option, base_uri, json_schema)
930
+ del allof_option['$ref']
931
+ allof_option = self.merge_json_schemas(
932
+ [allof_option, resolved_json_type])
933
+ type_list.append(copy.deepcopy(allof_option))
934
+ merged_type = self.merge_json_schemas(
935
+ type_list, intersect=False)
936
+ json_types.append(merged_type)
623
937
 
624
938
  if 'oneOf' in json_type:
625
939
  # if the json type is a oneOf, we create a type union of all types
@@ -692,8 +1006,13 @@ class JsonToAvroConverter:
692
1006
  continue
693
1007
 
694
1008
  subtype_deps: List[str] = []
695
- sub_field_name = avro_name(local_name + '_' + str(count)) if not isinstance(
696
- json_type_option, dict) or not '$ref' in json_type_option else None
1009
+ # Use title from discriminated union if available, otherwise generate numbered name
1010
+ if isinstance(json_type_option, dict) and 'title' in json_type_option:
1011
+ sub_field_name = avro_name(json_type_option['title'])
1012
+ elif not isinstance(json_type_option, dict) or not '$ref' in json_type_option:
1013
+ sub_field_name = avro_name(local_name + '_' + str(count))
1014
+ else:
1015
+ sub_field_name = None
697
1016
  avro_subtype = self.json_type_to_avro_type(
698
1017
  json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
699
1018
  if not avro_subtype:
@@ -949,7 +1268,7 @@ class JsonToAvroConverter:
949
1268
  [avro_type, self.create_array_type(generic_type())], avro_schema, '')
950
1269
  elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
951
1270
  avro_record_type = self.json_schema_object_to_avro_record(
952
- local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack)
1271
+ local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
953
1272
  if isinstance(avro_record_type, list):
954
1273
  for record_entry in avro_record_type:
955
1274
  self.lift_dependencies_from_type(
@@ -958,16 +1277,44 @@ class JsonToAvroConverter:
958
1277
  'name', local_name) if isinstance(avro_type, dict) else local_name)
959
1278
  self.lift_dependencies_from_type(
960
1279
  avro_type, dependencies)
961
- elif 'enum' in json_type and (not 'type' in json_type or json_type['type'] == "string"):
962
- # we skip all enums that are not of implicit or explicit type 'string'
963
- enum = [avro_name(e) for e in json_type['enum'] if isinstance(
964
- e, str) and e != '']
965
- if len(enum) > 0:
966
- # if the enum ends up empty (only non-strings in the enum), we will skip it
967
- enum = list(set(enum))
968
- if len(enum) > 0:
969
- avro_type = self.create_enum_type(local_name, self.compose_namespace(
970
- namespace, record_name + '_types'), enum)
1280
+ elif 'enum' in json_type:
1281
+ # Handle enums with proper type handling for mixed string/int enums
1282
+ enum_values = json_type['enum']
1283
+ schema_type = json_type.get('type', 'string')
1284
+
1285
+ # For pure string enums with valid symbols, use simple enum without suffix
1286
+ string_values = [v for v in enum_values if isinstance(v, str) and v]
1287
+ int_values = [v for v in enum_values if isinstance(v, int)]
1288
+
1289
+ if not int_values and string_values:
1290
+ # Pure string enum
1291
+ if not self.enum_symbols_need_string_fallback(string_values):
1292
+ # Simple case: valid symbols, just create enum
1293
+ avro_type = self.create_enum_type(
1294
+ local_name,
1295
+ self.compose_namespace(namespace, record_name + '_types'),
1296
+ string_values
1297
+ )
1298
+ else:
1299
+ # Symbols need prefixing, use helper with string fallback
1300
+ avro_type = self.create_enum_for_mixed_types(
1301
+ local_name,
1302
+ self.compose_namespace(namespace, record_name + '_types'),
1303
+ enum_values,
1304
+ schema_type
1305
+ )
1306
+ # Register any embedded enum types in the union
1307
+ self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
1308
+ else:
1309
+ # Mixed or int-only enum, use helper
1310
+ avro_type = self.create_enum_for_mixed_types(
1311
+ local_name + '_1',
1312
+ self.compose_namespace(namespace, record_name + '_types'),
1313
+ enum_values,
1314
+ schema_type
1315
+ )
1316
+ # Register any embedded enum types in the union
1317
+ self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
971
1318
  else:
972
1319
  avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
973
1320
  'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
@@ -1028,6 +1375,23 @@ class JsonToAvroConverter:
1028
1375
  else:
1029
1376
  return True
1030
1377
 
1378
def register_embedded_types_in_union(self, avro_type, avro_schema, dependencies):
    """
    Register named types (enum, record, fixed) that are defined inline in a union.

    Only list-valued avro_type is processed; anything else is left alone.
    For every inline enum/record/fixed member that register_type accepts, the
    member is replaced in place by its qualified name, and that name is
    appended to dependencies if it is not already listed there.
    """
    if not isinstance(avro_type, list):
        return

    named_kinds = ('enum', 'record', 'fixed')
    for position, member in enumerate(avro_type):
        if not isinstance(member, dict):
            continue
        if 'type' not in member or member['type'] not in named_kinds:
            continue
        if not self.register_type(avro_schema, member):
            # Not registered: the inline definition stays where it is.
            continue
        # Swap the inline definition for a by-name reference.
        qualified = self.get_qualified_name(member)
        avro_type[position] = qualified
        if qualified not in dependencies:
            dependencies.append(qualified)
1394
+
1031
1395
  def has_composition_keywords(self, json_object: dict) -> bool:
1032
1396
  """Check if the JSON object has any of the combining keywords: allOf, oneOf, anyOf."""
1033
1397
  return isinstance(json_object, dict) and ('allOf' in json_object or 'oneOf' in json_object or 'anyOf' in json_object)
@@ -1089,6 +1453,102 @@ class JsonToAvroConverter:
1089
1453
  'symbols': [avro_name(s) for s in symbols]
1090
1454
  }
1091
1455
 
1456
def enum_symbols_need_string_fallback(self, symbols: list) -> bool:
    """
    Report whether avro_name() would alter at least one of the given symbols.

    A changed symbol (e.g., "1" -> "_1") no longer equals the original JSON
    value, so callers add a plain string alternative to the union in that case.
    Empty strings and non-string entries are ignored.
    """
    return any(
        isinstance(symbol, str) and bool(symbol) and avro_name(symbol) != symbol
        for symbol in symbols
    )
1467
+
1468
+ def create_enum_for_mixed_types(self, name: str, namespace: str, enum_values: list, json_types: list) -> dict | list:
1469
+ """
1470
+ Create an Avro type for enums with mixed or special type requirements.
1471
+
1472
+ Handles:
1473
+ - Pure string enum with valid symbols -> enum
1474
+ - Pure string enum with prefixed symbols -> [enum, string]
1475
+ - Pure int enum -> int (with doc hint about allowed values)
1476
+ - Mixed string/int enum -> [enum, string, int]
1477
+
1478
+ Args:
1479
+ name: The enum type name
1480
+ namespace: The namespace for the enum
1481
+ enum_values: The list of enum values from JSON Schema
1482
+ json_types: The JSON Schema type(s), e.g., "string", "integer", or ["string", "integer"]
1483
+
1484
+ Returns:
1485
+ Avro type: either an enum dict, a primitive string, or a union list
1486
+ """
1487
+ if not isinstance(json_types, list):
1488
+ json_types = [json_types]
1489
+
1490
+ # Normalize type names
1491
+ has_string = 'string' in json_types
1492
+ has_int = 'integer' in json_types or 'int' in json_types
1493
+ has_null = 'null' in json_types
1494
+
1495
+ # Separate string and int enum values
1496
+ string_values = [v for v in enum_values if isinstance(v, str) and v]
1497
+ int_values = [v for v in enum_values if isinstance(v, int)]
1498
+
1499
+ # Pure integer enum case
1500
+ if has_int and not has_string and not string_values:
1501
+ # Just use int - no enum type needed for pure int enums
1502
+ # The doc will contain the allowed values hint
1503
+ result = 'int'
1504
+ if has_null:
1505
+ result = ['null', result]
1506
+ return result
1507
+
1508
+ # Build the enum from string values (or string representations of all values)
1509
+ if string_values:
1510
+ enum_symbols = list(set(string_values))
1511
+ else:
1512
+ # No string values but has_string type - shouldn't happen normally
1513
+ enum_symbols = []
1514
+
1515
+ if not enum_symbols:
1516
+ # No valid enum symbols, fall back to primitive types
1517
+ union = []
1518
+ if has_null:
1519
+ union.append('null')
1520
+ if has_string:
1521
+ union.append('string')
1522
+ if has_int:
1523
+ union.append('int')
1524
+ return union if len(union) > 1 else (union[0] if union else 'string')
1525
+
1526
+ # Create the enum type
1527
+ avro_enum = self.create_enum_type(name, namespace, enum_symbols)
1528
+
1529
+ # Determine if we need additional types in union
1530
+ needs_string_fallback = self.enum_symbols_need_string_fallback(enum_symbols)
1531
+
1532
+ # Build the union
1533
+ union = []
1534
+ if has_null:
1535
+ union.append('null')
1536
+ union.append(avro_enum)
1537
+
1538
+ # Add string fallback if symbols were prefixed OR if this is a mixed type enum
1539
+ if needs_string_fallback or has_int:
1540
+ union.append('string')
1541
+
1542
+ # Add int if the schema allows integers
1543
+ if has_int:
1544
+ union.append('int')
1545
+
1546
+ # Return enum directly if no union needed
1547
+ if len(union) == 1:
1548
+ return union[0]
1549
+
1550
+ return union
1551
+
1092
1552
  def create_array_type(self, items: list | dict | str) -> dict:
1093
1553
  """Create an Avro array type."""
1094
1554
  return {
@@ -1141,7 +1601,7 @@ class JsonToAvroConverter:
1141
1601
  """Get the qualified name of an Avro type."""
1142
1602
  return self.compose_namespace(avro_type.get('namespace', ''), avro_type.get('name', ''))
1143
1603
 
1144
- def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list) -> dict | list | str | None:
1604
+ def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth: int = 1) -> dict | list | str | None:
1145
1605
  """Convert a JSON schema object declaration to an Avro record."""
1146
1606
  dependencies: List[str] = []
1147
1607
  avro_type: list | dict | str = {}
@@ -1150,7 +1610,7 @@ class JsonToAvroConverter:
1150
1610
  if self.has_composition_keywords(json_object):
1151
1611
  # we will merge allOf, oneOf, anyOf into a union record type
1152
1612
  type = self.json_type_to_avro_type(
1153
- json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
1613
+ json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
1154
1614
  if isinstance(type, str):
1155
1615
  # we are skipping references and primitives
1156
1616
  return None
@@ -1186,7 +1646,7 @@ class JsonToAvroConverter:
1186
1646
  f'WARN: Standalone array type {name} will be wrapped in a record')
1187
1647
  deps: List[str] = []
1188
1648
  array_type = self.json_type_to_avro_type(json_object, name, avro_name(
1189
- name), namespace, deps, json_schema, base_uri, avro_schema, record_stack)
1649
+ name), namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
1190
1650
  avro_array = self.create_wrapper_record(
1191
1651
  avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
1192
1652
  self.merge_description_into_doc(json_object, avro_array)
@@ -1206,6 +1666,9 @@ class JsonToAvroConverter:
1206
1666
  namespace, record_stack[-1] + "_types")
1207
1667
  # at this point we have a record type
1208
1668
  avro_record = self.create_avro_record(record_name, namespace, [])
1669
+ # Check if this record has a 'union' annotation from discriminated union pattern
1670
+ if 'union' in json_object:
1671
+ avro_record['union'] = json_object['union']
1209
1672
  # we need to prevent circular dependencies, so we will maintain a stack of the in-progress
1210
1673
  # records and will resolve the cycle as we go. if this record is already in the stack, we will
1211
1674
  # just return a reference to a record that contains this record
@@ -1236,6 +1699,7 @@ class JsonToAvroConverter:
1236
1699
  const = None
1237
1700
  default = None
1238
1701
  description = None
1702
+ discriminator = None
1239
1703
  for json_field_type in json_field_types:
1240
1704
  # skip fields with a bad or empty type
1241
1705
  if not isinstance(json_field_type, dict):
@@ -1249,9 +1713,11 @@ class JsonToAvroConverter:
1249
1713
  default = default_value
1250
1714
  # get the description from the field type
1251
1715
  description = json_field_type.get('description', description)
1716
+ # check for discriminator annotation
1717
+ discriminator = json_field_type.get('discriminator', discriminator)
1252
1718
  # convert the JSON-type field to an Avro-type field
1253
1719
  avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
1254
- json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack))
1720
+ json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
1255
1721
  if isinstance(avro_field_type, list):
1256
1722
  avro_field_type = self.flatten_union(
1257
1723
  avro_field_type)
@@ -1289,6 +1755,8 @@ class JsonToAvroConverter:
1289
1755
  avro_field['default'] = default
1290
1756
  if description:
1291
1757
  avro_field['doc'] = description
1758
+ if discriminator:
1759
+ avro_field['discriminator'] = discriminator
1292
1760
  field_type_list.append(avro_field_type)
1293
1761
  avro_field_ref = {
1294
1762
  'name': avro_name(field_name),
@@ -1310,7 +1778,7 @@ class JsonToAvroConverter:
1310
1778
  # we don't have any fields, but we have an array type, so we create a record with an 'items' field
1311
1779
  avro_record = self.create_array_type(
1312
1780
  self.json_type_to_avro_type(
1313
- json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
1781
+ json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
1314
1782
  if 'items' in json_object
1315
1783
  else generic_type())
1316
1784
  else:
@@ -1324,7 +1792,7 @@ class JsonToAvroConverter:
1324
1792
  for pattern_name, props in pattern_props.items():
1325
1793
  deps = []
1326
1794
  prop_type = self.ensure_type(self.json_type_to_avro_type(
1327
- props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack))
1795
+ props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
1328
1796
  if self.is_standalone_avro_type(prop_type):
1329
1797
  self.lift_dependencies_from_type(prop_type, deps)
1330
1798
  self.set_avro_type_value(
@@ -1350,7 +1818,7 @@ class JsonToAvroConverter:
1350
1818
  additional_props = json_object['additionalProperties']
1351
1819
  deps = []
1352
1820
  values_type = self.json_type_to_avro_type(
1353
- additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
1821
+ additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
1354
1822
  if self.is_standalone_avro_type(values_type):
1355
1823
  self.lift_dependencies_from_type(values_type, deps)
1356
1824
  self.set_avro_type_value(
@@ -1457,11 +1925,12 @@ class JsonToAvroConverter:
1457
1925
  if 'unmerged_types' in merge_result:
1458
1926
  del merge_result['unmerged_types']
1459
1927
  if isinstance(merge_result, list):
1460
- # unmerged field containers have fields.
1461
- self.set_avro_type_value(
1462
- type, 'name', type['name'] + '_item')
1928
+ # unmerged field containers have fields - wrap the union in a record
1929
+ # Keep the original name since references expect it
1463
1930
  self.set_avro_type_value(
1464
1931
  type, 'fields', [{'name': 'value', 'type': merge_result}])
1932
+ if 'unmerged_types' in type:
1933
+ del type['unmerged_types']
1465
1934
  merge_result = copy.deepcopy(type)
1466
1935
  set_schema_node(find_fn, merge_result, avro_schema)
1467
1936