structurize 3.1.1__tar.gz → 3.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. {structurize-3.1.1/structurize.egg-info → structurize-3.2.1}/PKG-INFO +1 -1
  2. {structurize-3.1.1 → structurize-3.2.1}/avrotize/_version.py +3 -3
  3. structurize-3.2.1/avrotize/choice_inference.py +443 -0
  4. {structurize-3.1.1 → structurize-3.2.1}/avrotize/commands.json +27 -3
  5. {structurize-3.1.1 → structurize-3.2.1}/avrotize/jsontoschema.py +8 -4
  6. {structurize-3.1.1 → structurize-3.2.1}/avrotize/schema_inference.py +516 -3
  7. {structurize-3.1.1 → structurize-3.2.1}/avrotize/sqltoavro.py +10 -4
  8. {structurize-3.1.1 → structurize-3.2.1/structurize.egg-info}/PKG-INFO +1 -1
  9. {structurize-3.1.1 → structurize-3.2.1}/structurize.egg-info/SOURCES.txt +2 -0
  10. {structurize-3.1.1 → structurize-3.2.1}/.gitignore +0 -0
  11. {structurize-3.1.1 → structurize-3.2.1}/LICENSE +0 -0
  12. {structurize-3.1.1 → structurize-3.2.1}/MANIFEST.in +0 -0
  13. {structurize-3.1.1 → structurize-3.2.1}/README.md +0 -0
  14. {structurize-3.1.1 → structurize-3.2.1}/avrotize/__init__.py +0 -0
  15. {structurize-3.1.1 → structurize-3.2.1}/avrotize/__main__.py +0 -0
  16. {structurize-3.1.1 → structurize-3.2.1}/avrotize/asn1toavro.py +0 -0
  17. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotize.py +0 -0
  18. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotocpp.py +0 -0
  19. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotocsharp.py +0 -0
  20. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotocsv.py +0 -0
  21. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotodatapackage.py +0 -0
  22. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotodb.py +0 -0
  23. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotogo.py +0 -0
  24. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotographql.py +0 -0
  25. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotoiceberg.py +0 -0
  26. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotojava.py +0 -0
  27. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotojs.py +0 -0
  28. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotojsons.py +0 -0
  29. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotojstruct.py +0 -0
  30. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotokusto.py +0 -0
  31. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotomd.py +0 -0
  32. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotools.py +0 -0
  33. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotoparquet.py +0 -0
  34. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotoproto.py +0 -0
  35. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotopython.py +0 -0
  36. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotorust.py +0 -0
  37. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotots.py +0 -0
  38. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrotoxsd.py +0 -0
  39. {structurize-3.1.1 → structurize-3.2.1}/avrotize/avrovalidator.py +0 -0
  40. {structurize-3.1.1 → structurize-3.2.1}/avrotize/cddltostructure.py +0 -0
  41. {structurize-3.1.1 → structurize-3.2.1}/avrotize/common.py +0 -0
  42. {structurize-3.1.1 → structurize-3.2.1}/avrotize/constants.py +0 -0
  43. {structurize-3.1.1 → structurize-3.2.1}/avrotize/csvtoavro.py +0 -0
  44. {structurize-3.1.1 → structurize-3.2.1}/avrotize/datapackagetoavro.py +0 -0
  45. {structurize-3.1.1 → structurize-3.2.1}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
  46. {structurize-3.1.1 → structurize-3.2.1}/avrotize/dependencies/typescript/node22/package.json +0 -0
  47. {structurize-3.1.1 → structurize-3.2.1}/avrotize/dependency_resolver.py +0 -0
  48. {structurize-3.1.1 → structurize-3.2.1}/avrotize/dependency_version.py +0 -0
  49. {structurize-3.1.1 → structurize-3.2.1}/avrotize/jsonstoavro.py +0 -0
  50. {structurize-3.1.1 → structurize-3.2.1}/avrotize/jsonstostructure.py +0 -0
  51. {structurize-3.1.1 → structurize-3.2.1}/avrotize/jstructtoavro.py +0 -0
  52. {structurize-3.1.1 → structurize-3.2.1}/avrotize/kstructtoavro.py +0 -0
  53. {structurize-3.1.1 → structurize-3.2.1}/avrotize/kustotoavro.py +0 -0
  54. {structurize-3.1.1 → structurize-3.2.1}/avrotize/openapitostructure.py +0 -0
  55. {structurize-3.1.1 → structurize-3.2.1}/avrotize/parquettoavro.py +0 -0
  56. {structurize-3.1.1 → structurize-3.2.1}/avrotize/proto2parser.py +0 -0
  57. {structurize-3.1.1 → structurize-3.2.1}/avrotize/proto3parser.py +0 -0
  58. {structurize-3.1.1 → structurize-3.2.1}/avrotize/prototoavro.py +0 -0
  59. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretocddl.py +0 -0
  60. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretocpp.py +0 -0
  61. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretocsharp.py +0 -0
  62. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretocsv.py +0 -0
  63. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretodatapackage.py +0 -0
  64. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretodb.py +0 -0
  65. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretogo.py +0 -0
  66. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretographql.py +0 -0
  67. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretoiceberg.py +0 -0
  68. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretojava.py +0 -0
  69. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretojs.py +0 -0
  70. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretojsons.py +0 -0
  71. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretokusto.py +0 -0
  72. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretomd.py +0 -0
  73. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretoproto.py +0 -0
  74. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretopython.py +0 -0
  75. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretorust.py +0 -0
  76. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretots.py +0 -0
  77. {structurize-3.1.1 → structurize-3.2.1}/avrotize/structuretoxsd.py +0 -0
  78. {structurize-3.1.1 → structurize-3.2.1}/avrotize/validate.py +0 -0
  79. {structurize-3.1.1 → structurize-3.2.1}/avrotize/xmltoschema.py +0 -0
  80. {structurize-3.1.1 → structurize-3.2.1}/avrotize/xsdtoavro.py +0 -0
  81. {structurize-3.1.1 → structurize-3.2.1}/build.ps1 +0 -0
  82. {structurize-3.1.1 → structurize-3.2.1}/build.sh +0 -0
  83. {structurize-3.1.1 → structurize-3.2.1}/pyproject.toml +0 -0
  84. {structurize-3.1.1 → structurize-3.2.1}/setup.cfg +0 -0
  85. {structurize-3.1.1 → structurize-3.2.1}/structurize.egg-info/dependency_links.txt +0 -0
  86. {structurize-3.1.1 → structurize-3.2.1}/structurize.egg-info/entry_points.txt +0 -0
  87. {structurize-3.1.1 → structurize-3.2.1}/structurize.egg-info/requires.txt +0 -0
  88. {structurize-3.1.1 → structurize-3.2.1}/structurize.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structurize
3
- Version: 3.1.1
3
+ Version: 3.2.1
4
4
  Summary: Tools to convert from and to JSON Structure from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '3.1.1'
32
- __version_tuple__ = version_tuple = (3, 1, 1)
31
+ __version__ = version = '3.2.1'
32
+ __version_tuple__ = version_tuple = (3, 2, 1)
33
33
 
34
- __commit_id__ = commit_id = 'ge20c22879'
34
+ __commit_id__ = commit_id = 'gfc8429a20'
@@ -0,0 +1,443 @@
1
+ """Discriminated Union (Choice) Inference for JSON data.
2
+
3
+ This module detects discriminated unions in JSON data, identifying:
4
+ 1. Discriminator fields that correlate with schema variants
5
+ 2. Nested discriminators in envelope patterns (e.g., CloudEvents with typed payload)
6
+ 3. Sparse data vs distinct types
7
+
8
+ The inference uses Jaccard similarity clustering on field signatures,
9
+ then detects fields whose values correlate strongly with cluster membership.
10
+ """
11
+
12
+ from collections import defaultdict
13
+ from dataclasses import dataclass, field
14
+ from typing import Any, Dict, List, Optional, Set, Tuple
15
+
16
+
17
@dataclass
class DocumentInfo:
    """Per-document facts collected for choice inference.

    Bundles a document's position in the sample, the raw object, the set of
    its top-level keys, and the stringified scalar values found under those
    keys.
    """
    index: int
    data: Dict[str, Any]
    # Names of the top-level fields present in this document.
    field_signature: frozenset
    # Scalar field values rendered as strings (potential discriminators).
    field_values: Dict[str, str]
24
+
25
+
26
@dataclass
class SchemaCluster:
    """A cluster of documents with similar schemas.

    The two signature sets are maintained incrementally by
    :meth:`add_document`.
    """
    id: int
    documents: List["DocumentInfo"] = field(default_factory=list)
    # Union of the field names of every document added so far.
    merged_signature: Set[str] = field(default_factory=set)
    # Field names present in every document added so far.
    required_fields: Set[str] = field(default_factory=set)

    def add_document(self, doc: "DocumentInfo") -> None:
        """Append ``doc`` and fold its field signature into the cluster.

        Args:
            doc: Document to add; only its ``field_signature`` is read.
        """
        # Decide "first member" from the document list rather than from an
        # empty merged_signature: a first document with no fields leaves
        # merged_signature empty, and the next document would otherwise
        # reset required_fields instead of intersecting with the empty set.
        is_first = not self.documents
        self.documents.append(doc)
        if is_first:
            self.merged_signature = set(doc.field_signature)
            self.required_fields = set(doc.field_signature)
        else:
            self.merged_signature |= doc.field_signature
            self.required_fields &= doc.field_signature
42
+
43
+
44
@dataclass
class DiscriminatorCandidate:
    """A field whose values may select between schema variants."""
    field_name: str
    # Distinct stringified values observed for the field.
    values: Set[str]
    # Maps each value to a cluster id.
    correlation: Dict[str, int]
    # Score in the 0-1 range; higher means values map to clusters more cleanly.
    correlation_score: float
51
+
52
+
53
@dataclass
class NestedDiscriminatorResult:
    """Outcome of looking for a discriminator inside a nested object."""
    # Dotted path to the discriminator, e.g. "payload.type" or "data.kind".
    field_path: str
    # Name of the discriminator field inside the nested object.
    discriminator_field: str
    # Distinct discriminator values that were observed.
    values: Set[str]
    # Clusters computed over the nested objects.
    nested_clusters: List[SchemaCluster]
60
+
61
+
62
@dataclass
class ChoiceInferenceResult:
    """Summary returned by choice type inference."""
    # True when the input was classified as a union of variants.
    is_choice: bool
    # Name of the top-level discriminator field, if one was selected.
    discriminator_field: Optional[str]
    # Distinct values of the selected discriminator (empty when none).
    discriminator_values: Set[str]
    # Schema clusters, one per detected variant.
    clusters: List[SchemaCluster]
    # Set for envelope patterns where the discriminator lives in a nested object.
    nested_discriminator: Optional[NestedDiscriminatorResult]
    # Human-readable description of the outcome.
    recommendation: str
71
+
72
+
73
def jaccard_similarity(set1: frozenset, set2: frozenset) -> float:
    """Return |set1 & set2| / |set1 | set2|, or 1.0 when both sets are empty."""
    combined = set1 | set2
    if not combined:
        # Both inputs are empty: treat them as identical.
        return 1.0
    shared = set1 & set2
    return len(shared) / len(combined)
80
+
81
+
82
def _extract_document_info(index: int, data: Dict[str, Any]) -> DocumentInfo:
    """Build the DocumentInfo record for one JSON object.

    Strings are kept as-is, numbers are rendered with ``str()``, and booleans
    become lowercase ``"true"`` / ``"false"``. Values of any other type are
    left out of ``field_values``.
    """
    scalars = {}
    for name, raw in data.items():
        # bool is tested first because it is a subclass of int.
        if isinstance(raw, bool):
            scalars[name] = str(raw).lower()
        elif isinstance(raw, str):
            scalars[name] = raw
        elif isinstance(raw, (int, float)):
            scalars[name] = str(raw)

    return DocumentInfo(
        index=index,
        data=data,
        field_signature=frozenset(data.keys()),
        field_values=scalars
    )
102
+
103
+
104
def _cluster_by_similarity(
    documents: List[DocumentInfo],
    similarity_threshold: float = 0.5
) -> List[SchemaCluster]:
    """Cluster documents by field signature similarity.

    Uses a two-pass approach:
    1. Initial greedy clustering against each cluster's first document.
    2. Refinement (at most three rounds) that moves a document to another
       cluster when that cluster's first document is a clearly better match.

    Args:
        documents: Documents to cluster.
        similarity_threshold: Minimum Jaccard similarity to the cluster's
            first document required to join an existing cluster.

    Returns:
        The non-empty clusters, with ids renumbered from 0 and with
        ``merged_signature`` / ``required_fields`` reflecting the final
        membership.
    """
    clusters: List[SchemaCluster] = []

    # First pass: greedy clustering
    for doc in documents:
        best_cluster = None
        best_similarity = 0.0

        for cluster in clusters:
            if cluster.documents:
                rep_sig = cluster.documents[0].field_signature
                sim = jaccard_similarity(doc.field_signature, rep_sig)
                if sim >= similarity_threshold and sim > best_similarity:
                    best_cluster = cluster
                    best_similarity = sim

        if best_cluster is not None:
            best_cluster.add_document(doc)
        else:
            new_cluster = SchemaCluster(id=len(clusters))
            new_cluster.add_document(doc)
            clusters.append(new_cluster)

    # Second pass: reassign documents to better-matching clusters
    changed = True
    max_iterations = 3
    iteration = 0

    while changed and iteration < max_iterations:
        changed = False
        iteration += 1

        for cluster in clusters:
            docs_to_move = []
            for doc in cluster.documents:
                best_cluster = cluster
                best_similarity = jaccard_similarity(
                    doc.field_signature,
                    cluster.documents[0].field_signature
                )

                for other_cluster in clusters:
                    if other_cluster.id == cluster.id or not other_cluster.documents:
                        continue

                    rep_sig = other_cluster.documents[0].field_signature
                    sim = jaccard_similarity(doc.field_signature, rep_sig)

                    if sim > best_similarity + 0.1:  # Must be significantly better
                        best_cluster = other_cluster
                        best_similarity = sim

                if best_cluster.id != cluster.id:
                    docs_to_move.append((doc, best_cluster))

            # Moves are applied after the scan so the list is not mutated
            # while it is being iterated.
            for doc, new_cluster in docs_to_move:
                cluster.documents.remove(doc)
                new_cluster.add_document(doc)
                changed = True

    # Remove empty clusters and renumber
    clusters = [c for c in clusters if c.documents]
    for i, c in enumerate(clusters):
        c.id = i
        # Removing a document above does not shrink the source cluster's
        # incremental signatures, so rebuild both sets from the members
        # that actually remain.
        signatures = [d.field_signature for d in c.documents]
        c.merged_signature = set().union(*signatures)
        c.required_fields = set(signatures[0]).intersection(*signatures[1:])

    return clusters
177
+
178
+
179
def _detect_discriminators(
    documents: List[DocumentInfo],
    clusters: List[SchemaCluster],
    min_correlation: float = 0.9
) -> List[DiscriminatorCandidate]:
    """Find scalar fields whose values line up with cluster membership.

    Returns the candidates ordered by descending ``correlation_score``.
    """
    cluster_of_doc: Dict[int, int] = {
        member.index: group.id
        for group in clusters
        for member in group.documents
    }

    occurrences: Dict[str, int] = defaultdict(int)
    values_per_field: Dict[str, Dict[int, str]] = defaultdict(dict)
    for doc in documents:
        for name, text in doc.field_values.items():
            occurrences[name] += 1
            values_per_field[name][doc.index] = text

    found = []
    doc_count = len(documents)
    cluster_count = len(clusters)

    for name, seen in occurrences.items():
        # The field must carry a scalar value in at least 80% of the documents.
        if seen < doc_count * 0.8:
            continue

        distinct = set(values_per_field[name].values())

        # Likely a unique id: >80% unique AND far more values than clusters.
        unique_share = len(distinct) / doc_count
        per_cluster = len(distinct) / cluster_count if cluster_count > 0 else len(distinct)
        if unique_share > 0.8 and per_cluster > 3:
            continue

        if len(distinct) < 2:
            continue

        if cluster_count == 1:
            # One cluster only: see whether grouping by value yields groups
            # whose first documents have clearly different field sets.
            docs_by_value: Dict[str, List[DocumentInfo]] = defaultdict(list)
            for doc in documents:
                if name in doc.field_values:
                    docs_by_value[doc.field_values[name]].append(doc)

            if len(docs_by_value) >= 2:
                ordered = list(docs_by_value.keys())
                pair_sims = [
                    jaccard_similarity(
                        docs_by_value[left][0].field_signature,
                        docs_by_value[right][0].field_signature
                    )
                    for pos, left in enumerate(ordered)
                    for right in ordered[pos + 1:]
                    if docs_by_value[left] and docs_by_value[right]
                ]
                mean_sim = sum(pair_sims) / len(pair_sims) if pair_sims else 1.0

                if mean_sim < 0.7:
                    found.append(DiscriminatorCandidate(
                        field_name=name,
                        values=distinct,
                        correlation={text: pos for pos, text in enumerate(ordered)},
                        correlation_score=1.0 - mean_sim
                    ))
            continue

        # Several clusters: record which clusters each value appears in.
        clusters_by_value: Dict[str, Set[int]] = defaultdict(set)
        for doc_index, text in values_per_field[name].items():
            clusters_by_value[text].add(cluster_of_doc[doc_index])

        if len(distinct) < cluster_count:
            continue

        exclusive = sum(1 for ids in clusters_by_value.values() if len(ids) == 1)
        score = exclusive / len(distinct) if distinct else 0

        if score >= min_correlation:
            found.append(DiscriminatorCandidate(
                field_name=name,
                values=distinct,
                correlation={text: list(ids)[0] for text, ids in clusters_by_value.items()},
                correlation_score=score
            ))

    return sorted(found, key=lambda cand: cand.correlation_score, reverse=True)
274
+
275
+
276
def _recluster_by_discriminator(
    documents: List[DocumentInfo],
    discriminator: DiscriminatorCandidate
) -> List[SchemaCluster]:
    """Re-cluster documents based on a detected discriminator field.

    Args:
        documents: Documents to group.
        discriminator: Candidate whose ``field_name`` selects the grouping
            value from each document's ``field_values``.

    Returns:
        One cluster per distinct discriminator value, in first-seen order.
        Documents with no scalar value for the field are left out.
    """
    clusters_by_value: Dict[str, SchemaCluster] = {}

    for doc in documents:
        value = doc.field_values.get(discriminator.field_name)
        # Compare against None rather than truthiness: an empty string is a
        # legitimate discriminator value and must not drop the document.
        if value is None:
            continue
        cluster = clusters_by_value.get(value)
        if cluster is None:
            cluster = SchemaCluster(id=len(clusters_by_value))
            clusters_by_value[value] = cluster
        cluster.add_document(doc)

    return list(clusters_by_value.values())
291
+
292
+
293
def _detect_nested_discriminator(
    documents: List[DocumentInfo],
    max_depth: int = 2
) -> Optional[NestedDiscriminatorResult]:
    """Look for a discriminated union inside a nested object field.

    Each field that holds a non-empty object in at least 80% of the documents
    is analysed on its own; the first hit is returned. ``max_depth`` bounds
    how many object levels are descended into.
    """
    if max_depth <= 0:
        return None

    doc_count = len(documents)
    objects_by_field: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for doc in documents:
        for name, child in doc.data.items():
            if isinstance(child, dict) and child:
                objects_by_field[name].append(child)

    for name, children in objects_by_field.items():
        if len(children) < doc_count * 0.8:
            continue

        child_docs = [_extract_document_info(pos, obj) for pos, obj in enumerate(children)]
        child_clusters = _cluster_by_similarity(child_docs, similarity_threshold=0.5)
        cluster_total = len(child_clusters)

        if cluster_total >= 2:
            # The nested objects already split into variants; require a
            # strongly correlated field.
            ranked = _detect_discriminators(child_docs, child_clusters)
            if ranked and ranked[0].correlation_score >= 0.8:
                top = ranked[0]
                return NestedDiscriminatorResult(
                    field_path=f"{name}.{top.field_name}",
                    discriminator_field=top.field_name,
                    values=top.values,
                    nested_clusters=child_clusters
                )
        elif cluster_total == 1:
            # A single cluster: accept a candidate only if grouping by its
            # values produces at least two clusters.
            ranked = _detect_discriminators(child_docs, child_clusters)
            if ranked and ranked[0].correlation_score > 0.3:
                top = ranked[0]
                regrouped = _recluster_by_discriminator(child_docs, top)
                if len(regrouped) >= 2:
                    return NestedDiscriminatorResult(
                        field_path=f"{name}.{top.field_name}",
                        discriminator_field=top.field_name,
                        values=top.values,
                        nested_clusters=regrouped
                    )

        if max_depth > 1:
            inner = _detect_nested_discriminator(child_docs, max_depth - 1)
            if inner:
                # Prefix the path with the field we descended through.
                inner.field_path = f"{name}.{inner.field_path}"
                return inner

    return None
349
+
350
+
351
def infer_choice_type(
    values: List[Dict[str, Any]],
    similarity_threshold: float = 0.5,
    detect_nested: bool = True
) -> ChoiceInferenceResult:
    """Classify a sample of JSON values as a single type or a union.

    Args:
        values: JSON values to analyse; entries that are not dicts are ignored.
        similarity_threshold: Jaccard similarity threshold for clustering (0-1).
        detect_nested: Whether to look for a discriminator inside nested
            objects when the top level forms a single cluster.

    Returns:
        ChoiceInferenceResult describing the outcome.
    """
    def _outcome(is_choice, recommendation, found_clusters, chosen=None, nested=None):
        # Single place that assembles the result record.
        return ChoiceInferenceResult(
            is_choice=is_choice,
            discriminator_field=None if chosen is None else chosen.field_name,
            discriminator_values=set() if chosen is None else chosen.values,
            clusters=found_clusters,
            nested_discriminator=nested,
            recommendation=recommendation
        )

    if not values:
        return _outcome(False, "Empty input", [])

    # Only object values take part in the analysis.
    objects = [item for item in values if isinstance(item, dict)]
    if not objects:
        return _outcome(False, "No object values found", [])

    documents = [_extract_document_info(pos, obj) for pos, obj in enumerate(objects)]
    clusters = _cluster_by_similarity(documents, similarity_threshold)
    discriminators = _detect_discriminators(documents, clusters)

    # One cluster but a usable discriminator: split by its values instead.
    if len(clusters) == 1 and discriminators and discriminators[0].correlation_score > 0.3:
        clusters = _recluster_by_discriminator(documents, discriminators[0])

    if len(clusters) == 1:
        # Still one cluster: either an envelope with a nested discriminator
        # or one type with optional fields.
        nested_result = (
            _detect_nested_discriminator(documents, max_depth=2) if detect_nested else None
        )
        if nested_result:
            return _outcome(
                True,
                f"Envelope with nested discriminator at '{nested_result.field_path}'",
                clusters,
                nested=nested_result
            )
        return _outcome(False, "Single type with optional fields", clusters)

    if discriminators:
        best = discriminators[0]
        return _outcome(
            True,
            f"Discriminated union on field '{best.field_name}' with {len(clusters)} variants",
            clusters,
            chosen=best
        )

    return _outcome(
        True,
        f"Undiscriminated union with {len(clusters)} distinct types",
        clusters
    )
@@ -1017,7 +1017,8 @@
1017
1017
  "infer_json_schema": "args.infer_json",
1018
1018
  "infer_xml_schema": "args.infer_xml",
1019
1019
  "username": "args.username",
1020
- "password": "args.password"
1020
+ "password": "args.password",
1021
+ "infer_choices": "args.infer_choices"
1021
1022
  }
1022
1023
  },
1023
1024
  "extensions": [],
@@ -1117,6 +1118,13 @@
1117
1118
  "help": "Infer schema for XML columns",
1118
1119
  "default": true,
1119
1120
  "required": false
1121
+ },
1122
+ {
1123
+ "name": "--infer-choices",
1124
+ "type": "bool",
1125
+ "action": "store_true",
1126
+ "help": "Detect discriminated unions in JSON/XML columns and emit as unions with discriminator field defaults",
1127
+ "required": false
1120
1128
  }
1121
1129
  ],
1122
1130
  "suggested_output_file_path": "{database}.avsc",
@@ -1154,7 +1162,8 @@
1154
1162
  "avro_schema_file": "output_file_path",
1155
1163
  "type_name": "args.type_name",
1156
1164
  "avro_namespace": "args.namespace",
1157
- "sample_size": "args.sample_size"
1165
+ "sample_size": "args.sample_size",
1166
+ "infer_choices": "args.infer_choices"
1158
1167
  }
1159
1168
  },
1160
1169
  "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1191,6 +1200,13 @@
1191
1200
  "help": "Maximum number of records to sample (0 = all)",
1192
1201
  "default": 0,
1193
1202
  "required": false
1203
+ },
1204
+ {
1205
+ "name": "--infer-choices",
1206
+ "type": "bool",
1207
+ "action": "store_true",
1208
+ "help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
1209
+ "required": false
1194
1210
  }
1195
1211
  ],
1196
1212
  "suggested_output_file_path": "{input_file_name}.avsc",
@@ -1222,7 +1238,8 @@
1222
1238
  "jstruct_schema_file": "output_file_path",
1223
1239
  "type_name": "args.type_name",
1224
1240
  "base_id": "args.base_id",
1225
- "sample_size": "args.sample_size"
1241
+ "sample_size": "args.sample_size",
1242
+ "infer_choices": "args.infer_choices"
1226
1243
  }
1227
1244
  },
1228
1245
  "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1260,6 +1277,13 @@
1260
1277
  "help": "Maximum number of records to sample (0 = all)",
1261
1278
  "default": 0,
1262
1279
  "required": false
1280
+ },
1281
+ {
1282
+ "name": "--infer-choices",
1283
+ "type": "bool",
1284
+ "action": "store_true",
1285
+ "help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
1286
+ "required": false
1263
1287
  }
1264
1288
  ],
1265
1289
  "suggested_output_file_path": "{input_file_name}.jstruct.json",
@@ -21,7 +21,8 @@ def convert_json_to_avro(
21
21
  avro_schema_file: str,
22
22
  type_name: str = 'Document',
23
23
  avro_namespace: str = '',
24
- sample_size: int = 0
24
+ sample_size: int = 0,
25
+ infer_choices: bool = False
25
26
  ) -> None:
26
27
  """Infers Avro schema from JSON files.
27
28
 
@@ -35,6 +36,7 @@ def convert_json_to_avro(
35
36
  type_name: Name for the root type
36
37
  avro_namespace: Namespace for generated Avro types
37
38
  sample_size: Maximum number of records to sample (0 = all)
39
+ infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
38
40
  """
39
41
  if not input_files:
40
42
  raise ValueError("At least one input file is required")
@@ -44,7 +46,7 @@ def convert_json_to_avro(
44
46
  if not values:
45
47
  raise ValueError("No valid JSON data found in input files")
46
48
 
47
- inferrer = AvroSchemaInferrer(namespace=avro_namespace)
49
+ inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
48
50
  schema = inferrer.infer_from_json_values(type_name, values)
49
51
 
50
52
  # Ensure output directory exists
@@ -61,7 +63,8 @@ def convert_json_to_jstruct(
61
63
  jstruct_schema_file: str,
62
64
  type_name: str = 'Document',
63
65
  base_id: str = 'https://example.com/',
64
- sample_size: int = 0
66
+ sample_size: int = 0,
67
+ infer_choices: bool = False
65
68
  ) -> None:
66
69
  """Infers JSON Structure schema from JSON files.
67
70
 
@@ -74,6 +77,7 @@ def convert_json_to_jstruct(
74
77
  type_name: Name for the root type
75
78
  base_id: Base URI for $id generation
76
79
  sample_size: Maximum number of records to sample (0 = all)
80
+ infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
77
81
  """
78
82
  if not input_files:
79
83
  raise ValueError("At least one input file is required")
@@ -83,7 +87,7 @@ def convert_json_to_jstruct(
83
87
  if not values:
84
88
  raise ValueError("No valid JSON data found in input files")
85
89
 
86
- inferrer = JsonStructureSchemaInferrer(base_id=base_id)
90
+ inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
87
91
  schema = inferrer.infer_from_json_values(type_name, values)
88
92
 
89
93
  # Ensure output directory exists