structurize 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. avrotize/__init__.py +64 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp.py +483 -0
  7. avrotize/avrotocsharp.py +1075 -0
  8. avrotize/avrotocsv.py +121 -0
  9. avrotize/avrotodatapackage.py +173 -0
  10. avrotize/avrotodb.py +1383 -0
  11. avrotize/avrotogo.py +476 -0
  12. avrotize/avrotographql.py +197 -0
  13. avrotize/avrotoiceberg.py +210 -0
  14. avrotize/avrotojava.py +2156 -0
  15. avrotize/avrotojs.py +250 -0
  16. avrotize/avrotojsons.py +481 -0
  17. avrotize/avrotojstruct.py +345 -0
  18. avrotize/avrotokusto.py +364 -0
  19. avrotize/avrotomd.py +137 -0
  20. avrotize/avrotools.py +168 -0
  21. avrotize/avrotoparquet.py +208 -0
  22. avrotize/avrotoproto.py +359 -0
  23. avrotize/avrotopython.py +624 -0
  24. avrotize/avrotorust.py +435 -0
  25. avrotize/avrotots.py +598 -0
  26. avrotize/avrotoxsd.py +344 -0
  27. avrotize/cddltostructure.py +1841 -0
  28. avrotize/commands.json +3337 -0
  29. avrotize/common.py +834 -0
  30. avrotize/constants.py +72 -0
  31. avrotize/csvtoavro.py +132 -0
  32. avrotize/datapackagetoavro.py +76 -0
  33. avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
  34. avrotize/dependencies/typescript/node22/package.json +16 -0
  35. avrotize/dependency_resolver.py +348 -0
  36. avrotize/dependency_version.py +432 -0
  37. avrotize/jsonstoavro.py +2167 -0
  38. avrotize/jsonstostructure.py +2642 -0
  39. avrotize/jstructtoavro.py +878 -0
  40. avrotize/kstructtoavro.py +93 -0
  41. avrotize/kustotoavro.py +455 -0
  42. avrotize/parquettoavro.py +157 -0
  43. avrotize/proto2parser.py +498 -0
  44. avrotize/proto3parser.py +403 -0
  45. avrotize/prototoavro.py +382 -0
  46. avrotize/structuretocddl.py +597 -0
  47. avrotize/structuretocpp.py +697 -0
  48. avrotize/structuretocsharp.py +2295 -0
  49. avrotize/structuretocsv.py +365 -0
  50. avrotize/structuretodatapackage.py +659 -0
  51. avrotize/structuretodb.py +1125 -0
  52. avrotize/structuretogo.py +720 -0
  53. avrotize/structuretographql.py +502 -0
  54. avrotize/structuretoiceberg.py +355 -0
  55. avrotize/structuretojava.py +853 -0
  56. avrotize/structuretojsons.py +498 -0
  57. avrotize/structuretokusto.py +639 -0
  58. avrotize/structuretomd.py +322 -0
  59. avrotize/structuretoproto.py +764 -0
  60. avrotize/structuretopython.py +772 -0
  61. avrotize/structuretorust.py +714 -0
  62. avrotize/structuretots.py +653 -0
  63. avrotize/structuretoxsd.py +679 -0
  64. avrotize/xsdtoavro.py +413 -0
  65. structurize-2.19.0.dist-info/METADATA +107 -0
  66. structurize-2.19.0.dist-info/RECORD +70 -0
  67. structurize-2.19.0.dist-info/WHEEL +5 -0
  68. structurize-2.19.0.dist-info/entry_points.txt +2 -0
  69. structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
  70. structurize-2.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,878 @@
1
+ """
2
+ JSON Structure to Avro Schema Converter
3
+
4
+ Converts JSON Structure documents to Apache Avro schema format.
5
+ This is the reverse operation of avrotojstruct.py.
6
+ """
7
+
8
+ import json
9
+ from typing import Any, Dict, List, Union, Optional
10
+
11
+
12
class JsonStructureToAvro:
    """
    Convert JSON Structure documents to Avro schema format.

    Every definition becomes a *named* Avro type (record, enum or fixed).
    Definitions that have no named Avro counterpart (arrays, maps, sets,
    unions, primitives, ...) are wrapped in a single-field record.
    Temporal, uuid and decimal types are emitted as ``string`` carrying a
    ``logicalType`` annotation.
    """

    # Avro union used wherever JSON Structure says 'any'.
    _ANY_UNION = ('null', 'boolean', 'int', 'long', 'float', 'double', 'string', 'bytes')

    # (JSON Structure keyword, label used in the generated doc string),
    # in the order the annotations are rendered.
    _ANNOTATION_LABELS = (
        ('maxLength', 'maxLength'),
        ('minLength', 'minLength'),
        ('precision', 'precision'),
        ('scale', 'scale'),
        ('pattern', 'pattern'),
        ('minimum', 'minimum'),
        ('maximum', 'maximum'),
        ('contentEncoding', 'encoding'),
        ('contentMediaType', 'mediaType'),
        ('contentCompression', 'compression'),
    )

    # JSON Structure primitive -> Avro primitive (no logical type annotation).
    _SIMPLE_TYPES: Dict[str, str] = {
        'null': 'null',
        'boolean': 'boolean',
        # Integer types
        'int8': 'int',
        'int16': 'int',
        'int32': 'int',
        'int64': 'long',
        'uint8': 'int',
        'uint16': 'int',
        'uint32': 'long',
        'uint64': 'long',
        'int128': 'string',  # Too large for Avro numeric types
        'uint128': 'string',
        # Floating point types
        'float8': 'float',
        'float16': 'float',
        'float32': 'float',
        'float': 'float',
        'float64': 'double',
        'double': 'double',
        'number': 'double',  # Generic number -> double
        # String and binary types
        'string': 'string',
        'binary': 'bytes',
        'bytes': 'bytes',
        # Other types
        'uri': 'string',
        'jsonpointer': 'string',
    }

    # JSON Structure primitive -> logicalType carried on an Avro string.
    _STRING_LOGICAL_PRIMITIVES: Dict[str, str] = {
        # Temporal types
        'date': 'date',
        'datetime': 'timestamp-millis',
        'time': 'time-millis',
        'duration': 'duration',
        'timestamp': 'timestamp-millis',
        # Special types
        'uuid': 'uuid',
        'decimal': 'decimal',
    }

    # JSON Structure 'logicalType' spelling -> logicalType carried on an Avro string.
    _LOGICAL_TYPES: Dict[str, str] = {
        # Timestamps
        'timestampMicros': 'timestamp-micros',
        'timestampMillis': 'timestamp-millis',
        'timestamp-micros': 'timestamp-micros',
        'timestamp-millis': 'timestamp-millis',
        # Local timestamps (no timezone)
        'localTimestampMicros': 'local-timestamp-micros',
        'localTimestampMillis': 'local-timestamp-millis',
        'local-timestamp-micros': 'local-timestamp-micros',
        'local-timestamp-millis': 'local-timestamp-millis',
        # Date and time
        'date': 'date',
        'time-millis': 'time-millis',
        'time-micros': 'time-micros',
        'timeMillis': 'time-millis',
        'timeMicros': 'time-micros',
        # Duration
        'duration': 'duration',
        # UUID
        'uuid': 'uuid',
        # Decimal (on string)
        'decimal': 'decimal',
    }

    def __init__(self) -> None:
        """Initialize the converter with no document and no converted types."""
        # Document currently being converted; consulted when resolving $extends.
        self.structure_doc: Optional[Dict[str, Any]] = None
        # Definition path -> converted Avro schema, in conversion order.
        self.converted_types: Dict[str, Dict[str, Any]] = {}

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _named_schema(avro_type: str, name: str, namespace: Optional[str], **attributes: Any) -> Dict[str, Any]:
        """Build the skeleton of a named Avro type: type, name, extras, then namespace."""
        skeleton: Dict[str, Any] = {'type': avro_type, 'name': name}
        skeleton.update(attributes)
        if namespace:
            skeleton['namespace'] = namespace
        return skeleton

    @staticmethod
    def _apply_doc(target: Dict[str, Any], source: Dict[str, Any],
                   fallback: Optional[str] = None, suffix: str = '') -> None:
        """Copy ``source['description']`` (plus *suffix*) into ``target['doc']``, or use *fallback*."""
        if 'description' in source:
            description = source['description']
            target['doc'] = description + suffix if suffix else description
        elif fallback is not None:
            target['doc'] = fallback

    @staticmethod
    def _make_nullable(avro_type: Any) -> Any:
        """
        Return *avro_type* as a union whose first branch is ``'null'``.

        Avro matches a field default against the first branch of a union, so
        a ``None`` default is only valid when ``'null'`` leads the union.
        """
        if isinstance(avro_type, list):
            return ['null'] + [branch for branch in avro_type if branch != 'null']
        if avro_type == 'null':
            # Already nullable; ['null', 'null'] would be an invalid union.
            return avro_type
        return ['null', avro_type]

    def _convert_all_definitions(self, definitions: Dict[str, Any]) -> None:
        """Flatten *definitions* and convert each entry into ``converted_types``."""
        for def_path, def_schema in self._flatten_definitions(definitions).items():
            self._convert_definition(def_path, def_schema)

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def convert(self, structure_schema: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Convert a JSON Structure document to Avro schema.

        Args:
            structure_schema: The JSON Structure document

        Returns:
            Avro schema (dict or list of dicts). A list is returned when the
            document yields more than one named type.

        Raises:
            ValueError: If the document has neither 'type' nor '$root', if
                '$root' is used without definitions, or if the root type is
                not among the converted definitions.
        """
        self.structure_doc = structure_schema
        self.converted_types.clear()

        definitions = structure_schema.get('definitions', {})

        if 'type' in structure_schema:
            # Inline type at root: convert any definitions it may reference
            # first, then the root itself (root level has no namespace).
            if definitions:
                self._convert_all_definitions(definitions)

            root_schema = self._convert_type_from_schema(
                structure_schema, None, structure_schema.get('name', 'Root'))

            # Abstract definitions are never stored in converted_types, so
            # everything collected here is a concrete type.
            if self.converted_types:
                return [root_schema] + list(self.converted_types.values())
            return root_schema

        root_ref = structure_schema.get('$root')
        if not root_ref:
            raise ValueError("JSON Structure document must have either 'type' or '$root' property")

        if not definitions:
            raise ValueError("JSON Structure document with $root must have definitions")

        self._convert_all_definitions(definitions)

        root_path = root_ref.replace('#/definitions/', '')
        root_schema = self.converted_types.get(root_path)
        if not root_schema:
            raise ValueError(f"Root type {root_path} not found in converted types")

        # Single schema when only one type was defined, otherwise all of them.
        if len(self.converted_types) == 1:
            return root_schema
        return list(self.converted_types.values())

    # ------------------------------------------------------------------
    # Definition handling
    # ------------------------------------------------------------------

    def _flatten_definitions(self, definitions: Dict[str, Any], prefix: str = '') -> Dict[str, Dict[str, Any]]:
        """
        Flatten nested definitions into a flat dictionary with paths as keys.

        A dict holding 'type', 'oneOf' or 'allOf' is treated as a type
        definition; any other dict is treated as a namespace and recursed
        into. Non-dict values are ignored.

        Args:
            definitions: Nested definitions dictionary
            prefix: Current path prefix

        Returns:
            Flattened dictionary {path: definition_schema}
        """
        flattened: Dict[str, Dict[str, Any]] = {}

        for key, value in definitions.items():
            if not isinstance(value, dict):
                continue
            path = f"{prefix}/{key}" if prefix else key
            if 'type' in value or 'oneOf' in value or 'allOf' in value:
                flattened[path] = value
            else:
                flattened.update(self._flatten_definitions(value, path))

        return flattened

    def _resolve_base_schema(self, ref: str) -> Optional[Dict[str, Any]]:
        """
        Resolve a $ref to its schema definition.

        Args:
            ref: Reference string like "#/definitions/BaseEntity"

        Returns:
            The resolved schema or None if not found (including when *ref*
            is not a string or does not start with "#/definitions/")
        """
        if not isinstance(ref, str) or not ref.startswith('#/definitions/'):
            return None

        if not self.structure_doc:
            return None

        # Walk the nested definitions one path segment at a time.
        current: Any = self.structure_doc.get('definitions', {})
        for part in ref.replace('#/definitions/', '').split('/'):
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]

        return current if isinstance(current, dict) else None

    def _merge_base_properties(self, schema: Dict[str, Any],
                               _visiting: Optional[List[Any]] = None) -> Dict[str, Any]:
        """
        Merge properties from base type(s) via $extends.

        Base chains are followed recursively. Child properties override base
        properties of the same name; 'required' lists are concatenated
        (base first) with duplicates dropped and order preserved.

        Args:
            schema: Type schema that may have $extends
            _visiting: Internal - references already being expanded, used to
                detect circular $extends chains

        Returns:
            Schema with merged properties (the input itself when there is
            nothing to merge)

        Raises:
            ValueError: If the $extends chain is circular
        """
        extends_ref = schema.get('$extends')
        if not extends_ref:
            return schema

        visiting = _visiting or []
        if extends_ref in visiting:
            raise ValueError(f"Circular $extends chain detected at {extends_ref}")

        base_schema = self._resolve_base_schema(extends_ref)
        if not base_schema:
            return schema

        # Pull in the base's own bases first.
        base_schema = self._merge_base_properties(base_schema, visiting + [extends_ref])

        merged = dict(schema)

        base_properties = base_schema.get('properties', {})
        child_properties = schema.get('properties', {})
        if base_properties or child_properties:
            merged['properties'] = {**base_properties, **child_properties}

        base_required = base_schema.get('required', [])
        child_required = schema.get('required', [])
        if base_required or child_required:
            # Order-preserving de-duplication: deterministic output, and it
            # does not require the entries to be hashable.
            all_required: List[Any] = []
            for entry in list(base_required) + list(child_required):
                if entry not in all_required:
                    all_required.append(entry)
            merged['required'] = all_required

        # Note the inheritance in the description (only if not already present).
        if base_schema.get('abstract'):
            base_name = extends_ref.split('/')[-1]
            if 'description' in merged and merged['description']:
                if "extends abstract" not in merged['description'].lower():
                    merged['description'] = f"{merged['description']} (extends abstract {base_name})"
            else:
                merged['description'] = f"Extends abstract {base_name}"

        return merged

    def _build_doc_with_annotations(self, schema: Dict[str, Any], base_doc: Optional[str] = None) -> Optional[str]:
        """
        Build documentation string including constraint annotations.

        Args:
            schema: Property schema with possible annotations
            base_doc: Base documentation from description field

        Returns:
            Enhanced documentation string ("<base_doc> [key: value, ...]")
            or None when there is neither documentation nor annotations
        """
        parts: List[str] = []
        if base_doc:
            parts.append(base_doc)

        annotations = [
            f"{label}: {schema[keyword]}"
            for keyword, label in self._ANNOTATION_LABELS
            if keyword in schema
        ]
        if annotations:
            parts.append(f"[{', '.join(annotations)}]")

        return ' '.join(parts) if parts else None

    def _convert_definition(self, def_path: str, def_schema: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert a single type definition from JSON Structure to Avro.

        Args:
            def_path: The definition path (used as type name); leading path
                segments become the dotted Avro namespace
            def_schema: The JSON Structure type definition

        Returns:
            Avro schema for this type. Abstract types yield a 'null'
            placeholder that is NOT recorded in ``converted_types``.
        """
        # Abstract types are not directly instantiable: hand back a
        # placeholder without registering it.
        if def_schema.get('abstract'):
            return {'type': 'null', 'doc': f'Abstract type: {def_path}'}

        merged_schema = self._merge_base_properties(def_schema)

        if '/' in def_path:
            *namespace_parts, name = def_path.split('/')
            namespace: Optional[str] = '.'.join(namespace_parts)
        else:
            namespace, name = None, def_path

        avro_schema = self._convert_type_from_schema(merged_schema, namespace, name)
        self.converted_types[def_path] = avro_schema
        return avro_schema

    def _convert_type_from_schema(self, def_schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """
        Convert a type definition based on its schema properties.

        Args:
            def_schema: The JSON Structure type definition
            namespace: The namespace for the type
            name: The name of the type

        Returns:
            Avro schema for this type
        """
        type_value = def_schema.get('type')

        if type_value == 'object':
            return self._convert_object(def_schema, namespace, name)
        if type_value == 'string' and 'enum' in def_schema:
            return self._convert_enum(def_schema, namespace, name)
        if type_value == 'binary' and 'byteLength' in def_schema:
            return self._convert_fixed(def_schema, namespace, name)
        if 'oneOf' in def_schema:
            return self._convert_union(def_schema, namespace, name)
        if type_value == 'choice':
            return self._convert_choice(def_schema, namespace, name)
        if type_value == 'set':
            return self._convert_set(def_schema, namespace, name)
        if type_value == 'tuple':
            return self._convert_tuple(def_schema, namespace, name)
        if type_value == 'any':
            return self._convert_any(def_schema, namespace, name)

        if type_value == 'array':
            # Array as top-level type needs wrapping in a record
            wrapper = self._named_schema('record', name, namespace, fields=[{
                'name': 'items',
                'type': {
                    'type': 'array',
                    'items': self._convert_type_reference(def_schema.get('items', 'string'))
                }
            }])
            self._apply_doc(wrapper, def_schema)
            return wrapper

        if type_value == 'map':
            # Map as top-level type needs wrapping in a record
            wrapper = self._named_schema('record', name, namespace, fields=[{
                'name': 'values',
                'type': {
                    'type': 'map',
                    'values': self._convert_type_reference(def_schema.get('values', 'string'))
                }
            }])
            self._apply_doc(wrapper, def_schema)
            return wrapper

        # It might be a simple type alias or logical type
        return self._convert_simple_type(def_schema, namespace, name)

    # ------------------------------------------------------------------
    # Named-type converters
    # ------------------------------------------------------------------

    def _convert_object(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """
        Convert JSON Structure object to Avro record.

        Properties become fields. A property with an explicit 'default'
        keeps it; any other property not listed in 'required' is made
        nullable (with 'null' as the first union branch) and defaults to
        null.
        """
        merged_schema = self._merge_base_properties(schema)

        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, merged_schema)

        required = merged_schema.get('required', [])

        fields = []
        for prop_name, prop_schema in merged_schema.get('properties', {}).items():
            field: Dict[str, Any] = {
                'name': prop_name,
                'type': self._convert_type_reference(prop_schema)
            }

            # A property given as a bare type name has no metadata to read.
            prop_meta = prop_schema if isinstance(prop_schema, dict) else {}

            doc = self._build_doc_with_annotations(prop_meta, prop_meta.get('description'))
            if doc:
                field['doc'] = doc

            if 'default' in prop_meta:
                field['default'] = prop_meta['default']
            elif prop_name not in required:
                field['type'] = self._make_nullable(field['type'])
                field['default'] = None

            fields.append(field)

        avro_record['fields'] = fields
        return avro_record

    def _convert_enum(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure enum to Avro enum (symbols taken from 'enum')."""
        avro_enum = self._named_schema('enum', name, namespace, symbols=schema['enum'])
        self._apply_doc(avro_enum, schema)

        if 'default' in schema:
            avro_enum['default'] = schema['default']

        return avro_enum

    def _convert_fixed(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure fixed-length binary to Avro fixed (size from 'byteLength')."""
        avro_fixed = self._named_schema('fixed', name, namespace, size=schema['byteLength'])
        self._apply_doc(avro_fixed, schema)
        return avro_fixed

    def _convert_union(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """
        Convert JSON Structure oneOf to an Avro record holding a union.

        The record has a single 'value' field whose type is the union of the
        converted 'oneOf' alternatives.
        """
        # TODO: Implement proper discriminated union mapping
        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, schema)

        avro_record['fields'] = [{
            'name': 'value',
            'type': [self._convert_type_reference(alternative) for alternative in schema.get('oneOf', [])]
        }]
        return avro_record

    def _convert_choice(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure choice to Avro union with discriminator support.

        For tagged unions (no selector): Creates an enum discriminator field + union field.
        For inline unions ($extends and selector): Creates a record with a single union
        field; the choice types themselves are expected to carry the selector field,
        which is only documented here, not enforced.
        """
        choices = schema.get('choices', {})
        selector = schema.get('selector')

        union_types = [self._convert_type_reference(choice_schema) for choice_schema in choices.values()]
        avro_record = self._named_schema('record', name, namespace)

        if schema.get('$extends') and selector:
            # Inline union - selector field is part of the data
            self._apply_doc(avro_record, schema, fallback=f'Inline union with selector field: {selector}')
            avro_record['fields'] = [{
                'name': 'value',
                'type': union_types,
                'doc': f'Union of choice types. Each type includes "{selector}" field with its discriminator value.'
            }]
            return avro_record

        # Tagged union - discriminator is the choice key: an enum of the
        # choice names plus a union field for the value.
        discriminator_enum = self._named_schema('enum', f'{name}Type', namespace, symbols=list(choices.keys()))

        self._apply_doc(avro_record, schema, fallback='Tagged union with explicit discriminator')
        avro_record['fields'] = [
            {
                'name': 'choiceType',
                'type': discriminator_enum,
                'doc': 'Discriminator indicating which type is present in the value field'
            },
            {
                'name': 'value',
                'type': union_types,
                'doc': 'The actual value of the selected choice type'
            }
        ]
        return avro_record

    def _convert_set(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure set to a record wrapping an Avro array (sets are represented as arrays)."""
        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, schema,
                        fallback='Set - unique unordered elements',
                        suffix=' (Set - unique unordered elements)')

        avro_record['fields'] = [{
            'name': 'items',
            'type': {
                'type': 'array',
                'items': self._convert_type_reference(schema.get('items', 'string'))
            }
        }]
        return avro_record

    def _convert_tuple(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure tuple to Avro record with ordered fields named item0, item1, ..."""
        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, schema)

        avro_record['fields'] = [
            {'name': f'item{idx}', 'type': self._convert_type_reference(item_schema)}
            for idx, item_schema in enumerate(schema.get('tuple', []))
        ]
        return avro_record

    def _convert_any(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert JSON Structure 'any' type to a record wrapping a union of all basic Avro types."""
        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, schema, fallback='Any type', suffix=' (Any type)')

        avro_record['fields'] = [{
            'name': 'value',
            'type': list(self._ANY_UNION)
        }]
        return avro_record

    def _convert_simple_type(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
        """Convert a simple type (possibly with logical type) to an Avro record with a single 'value' field."""
        avro_record = self._named_schema('record', name, namespace)
        self._apply_doc(avro_record, schema)

        logical_type = schema.get('logicalType')
        if logical_type:
            field_type: Any = self._map_logical_type(logical_type, schema.get('type'))
        else:
            field_type = self._convert_type_reference(schema)

        avro_record['fields'] = [{
            'name': 'value',
            'type': field_type
        }]
        return avro_record

    # ------------------------------------------------------------------
    # Type references and primitive mapping
    # ------------------------------------------------------------------

    def _convert_type_reference(self, schema: Union[Dict[str, Any], str]) -> Union[str, Dict[str, Any], List]:
        """
        Convert a type reference or inline type definition.

        Args:
            schema: Type schema or reference. May be a primitive type name,
                a ``{"$ref": ...}`` object, or an inline schema whose 'type'
                is a name, a list of alternatives, or itself an object
                (for example ``{"type": {"$ref": ...}}``).

        Returns:
            Avro type (string, dict, or list for union)

        Raises:
            ValueError: If the schema is neither a string nor a dict, uses
                a reference outside "#/definitions/", or has no usable type
        """
        if isinstance(schema, str):
            return self._map_primitive_type(schema)

        if not isinstance(schema, dict):
            raise ValueError(f"Invalid type schema: {schema}")

        if '$ref' in schema:
            ref = schema['$ref']
            if ref.startswith('#/definitions/'):
                # Convert path format back to Avro namespace.name format
                return ref.replace('#/definitions/', '').replace('/', '.')
            raise ValueError(f"Unsupported reference format: {ref}")

        type_value = schema.get('type')

        # 'type' given as an object (typically a $ref): convert that object.
        if isinstance(type_value, dict):
            return self._convert_type_reference(type_value)

        # Union types (type is an array like ["string", "null"])
        if isinstance(type_value, list):
            return [self._convert_type_reference(member) for member in type_value]

        if type_value == 'choice':
            # Nested choices are converted fully, named after their choices.
            choices = schema.get('choices', {})
            choice_name = f"Choice_{'_'.join(choices.keys())}" if choices else "Choice"
            return self._convert_choice(schema, None, choice_name)

        if type_value in ('set', 'array'):
            # Sets are represented as arrays in Avro
            return {
                'type': 'array',
                'items': self._convert_type_reference(schema.get('items', 'string'))
            }

        if type_value == 'tuple':
            # Tuples need to be records - name derived from the item count
            tuple_name = f"Tuple_{len(schema.get('tuple', []))}_items"
            return self._convert_tuple(schema, None, tuple_name)

        if type_value == 'any':
            return list(self._ANY_UNION)

        if type_value == 'map':
            return {
                'type': 'map',
                'values': self._convert_type_reference(schema.get('values', 'string'))
            }

        logical_type = schema.get('logicalType')
        if logical_type:
            return self._map_logical_type(logical_type, type_value)

        if type_value:
            return self._map_primitive_type(type_value)

        raise ValueError(f"Cannot convert type schema: {schema}")

    def _map_primitive_type(self, struct_type: str) -> Union[str, Dict[str, Any]]:
        """Map JSON Structure primitive type to Avro primitive type.

        Temporal types, uuid and decimal are returned as a fresh
        ``{'type': 'string', 'logicalType': ...}`` dict. Unknown names are
        returned unchanged.
        """
        logical = self._STRING_LOGICAL_PRIMITIVES.get(struct_type)
        if logical is not None:
            return {'type': 'string', 'logicalType': logical}

        # Fallback to the type as-is
        return self._SIMPLE_TYPES.get(struct_type, struct_type)

    def _map_logical_type(self, logical_type: str, base_type: Optional[str]) -> Dict[str, Any]:
        """Map JSON Structure logical type to Avro/Avrotize logical type.

        Known logical types map to a string with a 'logicalType' annotation.
        Unknown ones fall back to the mapped base type, or to plain string
        when no base type is given.
        """
        avro_logical = self._LOGICAL_TYPES.get(logical_type)
        if avro_logical is not None:
            return {'type': 'string', 'logicalType': avro_logical}

        if base_type:
            mapped = self._map_primitive_type(base_type)
            return mapped if isinstance(mapped, dict) else {'type': mapped}

        return {'type': 'string'}
857
+
858
+
859
def convert_json_structure_to_avro(
    structure_file: str,
    avro_file: str
) -> None:
    """
    Read a JSON Structure document from disk and write its Avro schema.

    The input is parsed as UTF-8 JSON, converted with
    ``JsonStructureToAvro.convert`` and written as UTF-8 JSON indented by
    two spaces.

    Args:
        structure_file: Path to input JSON Structure file
        avro_file: Path to output Avro schema file
    """
    with open(structure_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    result = JsonStructureToAvro().convert(document)

    with open(avro_file, 'w', encoding='utf-8') as sink:
        json.dump(result, sink, indent=2)