structurize 3.1.0__tar.gz → 3.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {structurize-3.1.0/structurize.egg-info → structurize-3.2.0}/PKG-INFO +1 -1
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/_version.py +3 -3
- structurize-3.2.0/avrotize/choice_inference.py +443 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/commands.json +28 -3
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/jsontoschema.py +8 -4
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/schema_inference.py +390 -1
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/sqltoavro.py +10 -4
- {structurize-3.1.0 → structurize-3.2.0/structurize.egg-info}/PKG-INFO +1 -1
- {structurize-3.1.0 → structurize-3.2.0}/structurize.egg-info/SOURCES.txt +2 -0
- {structurize-3.1.0 → structurize-3.2.0}/.gitignore +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/LICENSE +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/MANIFEST.in +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/README.md +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/__init__.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/__main__.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/asn1toavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotize.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotocpp.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotocsharp.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotocsv.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotodatapackage.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotodb.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotogo.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotographql.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotoiceberg.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotojava.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotojs.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotojsons.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotojstruct.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotokusto.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotomd.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotools.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotoparquet.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotoproto.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotopython.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotorust.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotots.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrotoxsd.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/avrovalidator.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/cddltostructure.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/common.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/constants.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/csvtoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/datapackagetoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/dependencies/typescript/node22/package.json +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/dependency_resolver.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/dependency_version.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/jsonstoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/jsonstostructure.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/jstructtoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/kstructtoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/kustotoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/openapitostructure.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/parquettoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/proto2parser.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/proto3parser.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/prototoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretocddl.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretocpp.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretocsharp.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretocsv.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretodatapackage.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretodb.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretogo.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretographql.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretoiceberg.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretojava.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretojs.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretojsons.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretokusto.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretomd.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretoproto.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretopython.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretorust.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretots.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/structuretoxsd.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/validate.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/xmltoschema.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/avrotize/xsdtoavro.py +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/build.ps1 +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/build.sh +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/pyproject.toml +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/setup.cfg +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/structurize.egg-info/dependency_links.txt +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/structurize.egg-info/entry_points.txt +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/structurize.egg-info/requires.txt +0 -0
- {structurize-3.1.0 → structurize-3.2.0}/structurize.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: structurize
|
|
3
|
-
Version: 3.1.0
|
|
3
|
+
Version: 3.2.0
|
|
4
4
|
Summary: Tools to convert from and to JSON Structure from various other schema languages.
|
|
5
5
|
Author-email: Clemens Vasters <clemensv@microsoft.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '3.1.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (3, 1, 0)
|
|
31
|
+
__version__ = version = '3.2.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (3, 2, 0)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'gb9324402c'
|
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""Discriminated Union (Choice) Inference for JSON data.
|
|
2
|
+
|
|
3
|
+
This module detects discriminated unions in JSON data, identifying:
|
|
4
|
+
1. Discriminator fields that correlate with schema variants
|
|
5
|
+
2. Nested discriminators in envelope patterns (e.g., CloudEvents with typed payload)
|
|
6
|
+
3. Sparse data vs distinct types
|
|
7
|
+
|
|
8
|
+
The inference uses Jaccard similarity clustering on field signatures,
|
|
9
|
+
then detects fields whose values correlate strongly with cluster membership.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class DocumentInfo:
    """Per-document facts collected once and reused by the inference passes.

    Attributes:
        index: Position of the document in the sequence being analyzed.
        data: The JSON object itself.
        field_signature: Names of the object's top-level fields.
        field_values: Top-level scalar values rendered as strings; these are
            the values examined as potential discriminators.
    """

    index: int
    data: Dict[str, Any]
    field_signature: frozenset
    field_values: Dict[str, str]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SchemaCluster:
    """A group of documents treated as one schema variant.

    Attributes:
        id: Numeric identifier of the cluster.
        documents: Documents assigned to the cluster, in insertion order.
        merged_signature: Union of the field names of all added documents.
        required_fields: Field names present in every added document.
    """

    id: int
    documents: List[DocumentInfo] = field(default_factory=list)
    merged_signature: Set[str] = field(default_factory=set)
    required_fields: Set[str] = field(default_factory=set)

    def add_document(self, doc: DocumentInfo) -> None:
        """Append ``doc`` and fold its field signature into the cluster.

        The first document seeds both signature sets; each later document
        widens ``merged_signature`` and narrows ``required_fields``.

        Args:
            doc: The document to add.
        """
        # Decide "first document" from the document list, not from whether
        # merged_signature is empty: a document without any fields leaves
        # merged_signature empty, and the next document must then intersect
        # with the (empty) required set rather than overwrite it.
        is_first = not self.documents
        self.documents.append(doc)
        if is_first:
            self.merged_signature = set(doc.field_signature)
            self.required_fields = set(doc.field_signature)
        else:
            self.merged_signature |= doc.field_signature
            self.required_fields &= doc.field_signature
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class DiscriminatorCandidate:
    """A field that may act as a discriminator.

    Attributes:
        field_name: Name of the candidate field.
        values: Distinct values observed for the field.
        correlation: Maps each observed value to a cluster id.
        correlation_score: Score in the range 0-1 expressing how well the
            values map to clusters.
    """

    field_name: str
    values: Set[str]
    correlation: Dict[str, int]
    correlation_score: float
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class NestedDiscriminatorResult:
    """Outcome of looking for a discriminator inside a nested object.

    Attributes:
        field_path: Dotted path to the discriminator, e.g. "payload.type"
            or "data.kind".
        discriminator_field: Name of the discriminator within the nested
            object.
        values: The discriminator values.
        nested_clusters: Clusters found among the nested objects.
    """

    field_path: str
    discriminator_field: str
    values: Set[str]
    nested_clusters: List[SchemaCluster]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class ChoiceInferenceResult:
    """Outcome of choice type inference.

    Attributes:
        is_choice: True if the analyzed data is a discriminated union.
        discriminator_field: Name of the top-level discriminator field, if
            one was found.
        discriminator_values: The discriminator values.
        clusters: Schema clusters (the variants).
        nested_discriminator: Set for envelope patterns, where the
            discriminator sits inside a nested object.
        recommendation: Human-readable description of the finding.
    """

    is_choice: bool
    discriminator_field: Optional[str]
    discriminator_values: Set[str]
    clusters: List[SchemaCluster]
    nested_discriminator: Optional[NestedDiscriminatorResult]
    recommendation: str
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def jaccard_similarity(set1: frozenset, set2: frozenset) -> float:
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2|.

    Two empty sets are treated as identical and yield 1.0.
    """
    if set1 or set2:
        # At least one set has members, so the union cannot be empty.
        shared = len(set1 & set2)
        combined = len(set1 | set2)
        return shared / combined
    return 1.0
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _extract_document_info(index: int, data: Dict[str, Any]) -> DocumentInfo:
    """Build the DocumentInfo record for one JSON object.

    Args:
        index: Index to store on the record.
        data: The JSON object to describe.

    Returns:
        A DocumentInfo holding the object's top-level field names and the
        string form of every top-level string, number or boolean value.
    """
    scalar_texts: Dict[str, str] = {}
    for name, raw in data.items():
        if isinstance(raw, str):
            text = raw
        elif isinstance(raw, bool):
            # bool is tested before int because bool is an int subclass;
            # booleans are rendered as "true" / "false".
            text = str(raw).lower()
        elif isinstance(raw, (int, float)):
            text = str(raw)
        else:
            # None, lists and nested objects are not discriminator values.
            continue
        scalar_texts[name] = text

    return DocumentInfo(
        index=index,
        data=data,
        field_signature=frozenset(data.keys()),
        field_values=scalar_texts,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _rebuild_cluster_signatures(cluster: SchemaCluster) -> None:
    """Recompute a cluster's merged/required field sets from its current documents."""
    signatures = [doc.field_signature for doc in cluster.documents]
    cluster.merged_signature = set().union(*signatures)
    cluster.required_fields = (
        set(signatures[0]).intersection(*signatures[1:]) if signatures else set()
    )


def _cluster_by_similarity(
    documents: List[DocumentInfo],
    similarity_threshold: float = 0.5
) -> List[SchemaCluster]:
    """Cluster documents by field signature similarity.

    Uses a two-pass approach:
    1. Initial greedy clustering: each document joins the existing cluster
       whose representative (its first document) is most similar, provided
       the Jaccard similarity reaches ``similarity_threshold``; otherwise it
       starts a new cluster.
    2. Refinement: for at most three rounds, a document is moved to another
       cluster when that cluster's representative is more similar by a
       margin greater than 0.1.

    Args:
        documents: Documents to cluster.
        similarity_threshold: Minimum Jaccard similarity (0-1) for joining
            an existing cluster.

    Returns:
        The non-empty clusters, with ids renumbered from 0.
    """
    clusters: List[SchemaCluster] = []

    # First pass: greedy clustering
    for doc in documents:
        best_cluster: Optional[SchemaCluster] = None
        best_similarity = 0.0

        for cluster in clusters:
            if not cluster.documents:
                continue
            rep_sig = cluster.documents[0].field_signature
            sim = jaccard_similarity(doc.field_signature, rep_sig)
            if sim >= similarity_threshold and sim > best_similarity:
                best_cluster = cluster
                best_similarity = sim

        if best_cluster is None:
            best_cluster = SchemaCluster(id=len(clusters))
            clusters.append(best_cluster)
        best_cluster.add_document(doc)

    # Second pass: reassign documents to better-matching clusters
    max_iterations = 3
    for _ in range(max_iterations):
        changed = False

        for cluster in clusters:
            if not cluster.documents:
                continue

            own_rep_sig = cluster.documents[0].field_signature
            docs_to_move = []
            for doc in cluster.documents:
                best_cluster = cluster
                best_similarity = jaccard_similarity(doc.field_signature, own_rep_sig)

                for other_cluster in clusters:
                    if other_cluster.id == cluster.id or not other_cluster.documents:
                        continue

                    rep_sig = other_cluster.documents[0].field_signature
                    sim = jaccard_similarity(doc.field_signature, rep_sig)

                    if sim > best_similarity + 0.1:  # Must be significantly better
                        best_cluster = other_cluster
                        best_similarity = sim

                if best_cluster.id != cluster.id:
                    docs_to_move.append((doc, best_cluster))

            if not docs_to_move:
                continue

            # Drop the moved documents by identity (not dataclass equality),
            # then rebuild this cluster's signature sets so they no longer
            # reflect documents that have left.
            moved = {id(doc) for doc, _ in docs_to_move}
            cluster.documents = [d for d in cluster.documents if id(d) not in moved]
            _rebuild_cluster_signatures(cluster)

            for doc, new_cluster in docs_to_move:
                new_cluster.add_document(doc)
            changed = True

        if not changed:
            break

    # Remove empty clusters and renumber
    clusters = [c for c in clusters if c.documents]
    for i, c in enumerate(clusters):
        c.id = i

    return clusters
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _single_cluster_candidate(
    field_name: str,
    values: Set[str],
    documents: List[DocumentInfo]
) -> Optional[DiscriminatorCandidate]:
    """Score ``field_name`` as a discriminator when all documents share one cluster."""
    docs_by_value: Dict[str, List[DocumentInfo]] = defaultdict(list)
    for doc in documents:
        if field_name in doc.field_values:
            docs_by_value[doc.field_values[field_name]].append(doc)

    if len(docs_by_value) < 2:
        return None

    ordered_values = list(docs_by_value)
    # Compare the first document of each value group with the first document
    # of every other group.
    inter_sims = [
        jaccard_similarity(
            docs_by_value[v1][0].field_signature,
            docs_by_value[v2][0].field_signature,
        )
        for i, v1 in enumerate(ordered_values)
        for v2 in ordered_values[i + 1:]
    ]
    avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0

    # Value groups that look alike do not indicate distinct variants.
    if avg_inter_sim >= 0.7:
        return None

    return DiscriminatorCandidate(
        field_name=field_name,
        values=values,
        correlation={v: i for i, v in enumerate(ordered_values)},
        correlation_score=1.0 - avg_inter_sim,
    )


def _detect_discriminators(
    documents: List[DocumentInfo],
    clusters: List[SchemaCluster],
    min_correlation: float = 0.9
) -> List[DiscriminatorCandidate]:
    """Detect fields that correlate strongly with cluster membership.

    A field is considered only if it has a scalar value in at least 80% of
    the documents, has at least two distinct values, and does not look like
    a unique id (more than 80% unique values and more than three values per
    cluster).

    With a single cluster, a field qualifies when the documents carrying
    different values have dissimilar field signatures (average pairwise
    Jaccard similarity below 0.7). With several clusters, a field qualifies
    when the share of its values that occur in exactly one cluster is at
    least ``min_correlation``.

    Args:
        documents: The analyzed documents.
        clusters: Clusters built from ``documents``.
        min_correlation: Minimum share of single-cluster values required in
            the multi-cluster case.

    Returns:
        Candidates sorted by descending correlation score.
    """
    doc_to_cluster: Dict[int, int] = {
        doc.index: cluster.id
        for cluster in clusters
        for doc in cluster.documents
    }

    field_presence: Dict[str, int] = defaultdict(int)
    field_values_by_doc: Dict[str, Dict[int, str]] = defaultdict(dict)

    for doc in documents:
        for field_name, value in doc.field_values.items():
            field_presence[field_name] += 1
            field_values_by_doc[field_name][doc.index] = value

    candidates: List[DiscriminatorCandidate] = []
    total_docs = len(documents)
    num_clusters = len(clusters)

    for field_name, presence_count in field_presence.items():
        if presence_count < total_docs * 0.8:
            continue

        values = set(field_values_by_doc[field_name].values())

        # Skip likely unique IDs (>80% unique AND much more values than clusters)
        uniqueness_ratio = len(values) / total_docs
        values_vs_clusters = len(values) / num_clusters if num_clusters > 0 else len(values)
        if uniqueness_ratio > 0.8 and values_vs_clusters > 3:
            continue

        if len(values) < 2:
            continue

        # Single cluster with multiple values - check if values create distinct groups
        if num_clusters == 1:
            candidate = _single_cluster_candidate(field_name, values, documents)
            if candidate is not None:
                candidates.append(candidate)
            continue

        # Multiple clusters - a field with fewer values than clusters cannot
        # tell the clusters apart, so skip it before doing any mapping work.
        if len(values) < num_clusters:
            continue

        value_to_clusters: Dict[str, Set[int]] = defaultdict(set)
        for doc_idx, value in field_values_by_doc[field_name].items():
            value_to_clusters[value].add(doc_to_cluster[doc_idx])

        perfect_mappings = sum(1 for ids in value_to_clusters.values() if len(ids) == 1)
        correlation_score = perfect_mappings / len(values)

        if correlation_score >= min_correlation:
            candidates.append(DiscriminatorCandidate(
                field_name=field_name,
                values=values,
                # A value seen in several clusters is mapped to the lowest
                # cluster id so the result does not depend on set ordering.
                correlation={value: min(ids) for value, ids in value_to_clusters.items()},
                correlation_score=correlation_score,
            ))

    candidates.sort(key=lambda c: c.correlation_score, reverse=True)
    return candidates
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _recluster_by_discriminator(
    documents: List[DocumentInfo],
    discriminator: DiscriminatorCandidate
) -> List[SchemaCluster]:
    """Re-cluster documents based on a detected discriminator field.

    Args:
        documents: Documents to group.
        discriminator: Candidate whose ``field_name`` selects the grouping
            value.

    Returns:
        One cluster per distinct discriminator value, in order of first
        appearance. Documents with no scalar value for the field are not
        placed in any cluster.
    """
    clusters_by_value: Dict[str, SchemaCluster] = {}

    for doc in documents:
        value = doc.field_values.get(discriminator.field_name)
        # Test for absence explicitly: an empty string is a real value that
        # is also counted in the candidate's value set, so it needs its own
        # cluster rather than being dropped by a truthiness check.
        if value is None:
            continue

        cluster = clusters_by_value.get(value)
        if cluster is None:
            cluster = SchemaCluster(id=len(clusters_by_value))
            clusters_by_value[value] = cluster
        cluster.add_document(doc)

    return list(clusters_by_value.values())
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _detect_nested_discriminator(
    documents: List[DocumentInfo],
    max_depth: int = 2
) -> Optional[NestedDiscriminatorResult]:
    """Look for a discriminated union inside a nested object field.

    Only fields holding a non-empty object in at least 80% of the documents
    are examined. For each such field the nested objects are clustered and
    checked for a discriminator; if none is found and depth remains, the
    search continues one level deeper.

    Args:
        documents: Documents whose object-valued fields are inspected.
        max_depth: Number of nesting levels still allowed.

    Returns:
        The first nested discriminator found, with ``field_path`` expressed
        relative to ``documents``, or None.
    """
    if max_depth <= 0:
        return None

    doc_count = len(documents)
    objects_by_field: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for document in documents:
        for name, value in document.data.items():
            if isinstance(value, dict) and value:
                objects_by_field[name].append(value)

    for name, objects in objects_by_field.items():
        if len(objects) < doc_count * 0.8:
            continue

        nested_docs = [_extract_document_info(pos, obj) for pos, obj in enumerate(objects)]
        nested_clusters = _cluster_by_similarity(nested_docs, similarity_threshold=0.5)

        candidates = (
            _detect_discriminators(nested_docs, nested_clusters)
            if nested_clusters else []
        )
        top = candidates[0] if candidates else None

        if top is not None:
            path = f"{name}.{top.field_name}"
            if len(nested_clusters) >= 2:
                # Several shapes already: accept a strongly correlated field.
                if top.correlation_score >= 0.8:
                    return NestedDiscriminatorResult(
                        field_path=path,
                        discriminator_field=top.field_name,
                        values=top.values,
                        nested_clusters=nested_clusters,
                    )
            elif top.correlation_score > 0.3:
                # One shape only: split by the candidate's values and keep
                # the result if that yields at least two groups.
                regrouped = _recluster_by_discriminator(nested_docs, top)
                if len(regrouped) >= 2:
                    return NestedDiscriminatorResult(
                        field_path=path,
                        discriminator_field=top.field_name,
                        values=top.values,
                        nested_clusters=regrouped,
                    )

        if max_depth > 1:
            deeper = _detect_nested_discriminator(nested_docs, max_depth - 1)
            if deeper is not None:
                # Prefix the path so it is relative to this level.
                deeper.field_path = f"{name}.{deeper.field_path}"
                return deeper

    return None
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def infer_choice_type(
    values: List[Dict[str, Any]],
    similarity_threshold: float = 0.5,
    detect_nested: bool = True
) -> ChoiceInferenceResult:
    """Analyze JSON values to detect discriminated unions (choice types).

    Non-object entries in ``values`` are ignored. The objects are clustered
    by field signature, candidate discriminator fields are scored, and the
    outcome is classified as a single type, an envelope with a nested
    discriminator, a discriminated union, or an undiscriminated union.

    Args:
        values: List of JSON objects to analyze
        similarity_threshold: Jaccard similarity threshold for clustering (0-1)
        detect_nested: Whether to detect nested discriminators (e.g., in CloudEvents payload)

    Returns:
        ChoiceInferenceResult with analysis results
    """
    def build(is_choice, recommendation, found_clusters,
              discriminator=None, nested=None):
        # Every outcome shares this shape; only a top-level discriminator
        # contributes a field name and value set.
        return ChoiceInferenceResult(
            is_choice=is_choice,
            discriminator_field=None if discriminator is None else discriminator.field_name,
            discriminator_values=set() if discriminator is None else discriminator.values,
            clusters=found_clusters,
            nested_discriminator=nested,
            recommendation=recommendation,
        )

    if not values:
        return build(False, "Empty input", [])

    # Filter to dict values only
    objects = [item for item in values if isinstance(item, dict)]
    if not objects:
        return build(False, "No object values found", [])

    documents = [_extract_document_info(pos, obj) for pos, obj in enumerate(objects)]
    clusters = _cluster_by_similarity(documents, similarity_threshold)
    discriminators = _detect_discriminators(documents, clusters)
    top = discriminators[0] if discriminators else None

    # Re-cluster by discriminator if found in single-cluster scenario
    if len(clusters) == 1 and top is not None and top.correlation_score > 0.3:
        clusters = _recluster_by_discriminator(documents, top)

    # Single cluster = check for nested discriminator or sparse data
    if len(clusters) == 1:
        nested = (
            _detect_nested_discriminator(documents, max_depth=2)
            if detect_nested else None
        )
        if nested:
            return build(
                True,
                f"Envelope with nested discriminator at '{nested.field_path}'",
                clusters,
                nested=nested,
            )
        return build(False, "Single type with optional fields", clusters)

    # Multiple clusters with discriminator = discriminated union
    if top is not None:
        return build(
            True,
            f"Discriminated union on field '{top.field_name}' with {len(clusters)} variants",
            clusters,
            discriminator=top,
        )

    # Multiple clusters without discriminator = undiscriminated union
    return build(
        True,
        f"Undiscriminated union with {len(clusters)} distinct types",
        clusters,
    )
|
|
@@ -1001,6 +1001,7 @@
|
|
|
1001
1001
|
"command": "sql2a",
|
|
1002
1002
|
"description": "Convert SQL schema to Avrotize schema (connects to live database)",
|
|
1003
1003
|
"group": "5_SQL",
|
|
1004
|
+
"skip_input_file_handling": true,
|
|
1004
1005
|
"function": {
|
|
1005
1006
|
"name": "avrotize.sqltoavro.convert_sql_to_avro",
|
|
1006
1007
|
"args": {
|
|
@@ -1016,7 +1017,8 @@
|
|
|
1016
1017
|
"infer_json_schema": "args.infer_json",
|
|
1017
1018
|
"infer_xml_schema": "args.infer_xml",
|
|
1018
1019
|
"username": "args.username",
|
|
1019
|
-
"password": "args.password"
|
|
1020
|
+
"password": "args.password",
|
|
1021
|
+
"infer_choices": "args.infer_choices"
|
|
1020
1022
|
}
|
|
1021
1023
|
},
|
|
1022
1024
|
"extensions": [],
|
|
@@ -1116,6 +1118,13 @@
|
|
|
1116
1118
|
"help": "Infer schema for XML columns",
|
|
1117
1119
|
"default": true,
|
|
1118
1120
|
"required": false
|
|
1121
|
+
},
|
|
1122
|
+
{
|
|
1123
|
+
"name": "--infer-choices",
|
|
1124
|
+
"type": "bool",
|
|
1125
|
+
"action": "store_true",
|
|
1126
|
+
"help": "Detect discriminated unions in JSON/XML columns and emit as unions with discriminator field defaults",
|
|
1127
|
+
"required": false
|
|
1119
1128
|
}
|
|
1120
1129
|
],
|
|
1121
1130
|
"suggested_output_file_path": "{database}.avsc",
|
|
@@ -1153,7 +1162,8 @@
|
|
|
1153
1162
|
"avro_schema_file": "output_file_path",
|
|
1154
1163
|
"type_name": "args.type_name",
|
|
1155
1164
|
"avro_namespace": "args.namespace",
|
|
1156
|
-
"sample_size": "args.sample_size"
|
|
1165
|
+
"sample_size": "args.sample_size",
|
|
1166
|
+
"infer_choices": "args.infer_choices"
|
|
1157
1167
|
}
|
|
1158
1168
|
},
|
|
1159
1169
|
"extensions": [".json", ".jsonl", ".ndjson"],
|
|
@@ -1190,6 +1200,13 @@
|
|
|
1190
1200
|
"help": "Maximum number of records to sample (0 = all)",
|
|
1191
1201
|
"default": 0,
|
|
1192
1202
|
"required": false
|
|
1203
|
+
},
|
|
1204
|
+
{
|
|
1205
|
+
"name": "--infer-choices",
|
|
1206
|
+
"type": "bool",
|
|
1207
|
+
"action": "store_true",
|
|
1208
|
+
"help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
|
|
1209
|
+
"required": false
|
|
1193
1210
|
}
|
|
1194
1211
|
],
|
|
1195
1212
|
"suggested_output_file_path": "{input_file_name}.avsc",
|
|
@@ -1221,7 +1238,8 @@
|
|
|
1221
1238
|
"jstruct_schema_file": "output_file_path",
|
|
1222
1239
|
"type_name": "args.type_name",
|
|
1223
1240
|
"base_id": "args.base_id",
|
|
1224
|
-
"sample_size": "args.sample_size"
|
|
1241
|
+
"sample_size": "args.sample_size",
|
|
1242
|
+
"infer_choices": "args.infer_choices"
|
|
1225
1243
|
}
|
|
1226
1244
|
},
|
|
1227
1245
|
"extensions": [".json", ".jsonl", ".ndjson"],
|
|
@@ -1259,6 +1277,13 @@
|
|
|
1259
1277
|
"help": "Maximum number of records to sample (0 = all)",
|
|
1260
1278
|
"default": 0,
|
|
1261
1279
|
"required": false
|
|
1280
|
+
},
|
|
1281
|
+
{
|
|
1282
|
+
"name": "--infer-choices",
|
|
1283
|
+
"type": "bool",
|
|
1284
|
+
"action": "store_true",
|
|
1285
|
+
"help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
|
|
1286
|
+
"required": false
|
|
1262
1287
|
}
|
|
1263
1288
|
],
|
|
1264
1289
|
"suggested_output_file_path": "{input_file_name}.jstruct.json",
|
|
@@ -21,7 +21,8 @@ def convert_json_to_avro(
|
|
|
21
21
|
avro_schema_file: str,
|
|
22
22
|
type_name: str = 'Document',
|
|
23
23
|
avro_namespace: str = '',
|
|
24
|
-
sample_size: int = 0
|
|
24
|
+
sample_size: int = 0,
|
|
25
|
+
infer_choices: bool = False
|
|
25
26
|
) -> None:
|
|
26
27
|
"""Infers Avro schema from JSON files.
|
|
27
28
|
|
|
@@ -35,6 +36,7 @@ def convert_json_to_avro(
|
|
|
35
36
|
type_name: Name for the root type
|
|
36
37
|
avro_namespace: Namespace for generated Avro types
|
|
37
38
|
sample_size: Maximum number of records to sample (0 = all)
|
|
39
|
+
infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
|
|
38
40
|
"""
|
|
39
41
|
if not input_files:
|
|
40
42
|
raise ValueError("At least one input file is required")
|
|
@@ -44,7 +46,7 @@ def convert_json_to_avro(
|
|
|
44
46
|
if not values:
|
|
45
47
|
raise ValueError("No valid JSON data found in input files")
|
|
46
48
|
|
|
47
|
-
inferrer = AvroSchemaInferrer(namespace=avro_namespace)
|
|
49
|
+
inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
|
|
48
50
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
49
51
|
|
|
50
52
|
# Ensure output directory exists
|
|
@@ -61,7 +63,8 @@ def convert_json_to_jstruct(
|
|
|
61
63
|
jstruct_schema_file: str,
|
|
62
64
|
type_name: str = 'Document',
|
|
63
65
|
base_id: str = 'https://example.com/',
|
|
64
|
-
sample_size: int = 0
|
|
66
|
+
sample_size: int = 0,
|
|
67
|
+
infer_choices: bool = False
|
|
65
68
|
) -> None:
|
|
66
69
|
"""Infers JSON Structure schema from JSON files.
|
|
67
70
|
|
|
@@ -74,6 +77,7 @@ def convert_json_to_jstruct(
|
|
|
74
77
|
type_name: Name for the root type
|
|
75
78
|
base_id: Base URI for $id generation
|
|
76
79
|
sample_size: Maximum number of records to sample (0 = all)
|
|
80
|
+
infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
|
|
77
81
|
"""
|
|
78
82
|
if not input_files:
|
|
79
83
|
raise ValueError("At least one input file is required")
|
|
@@ -83,7 +87,7 @@ def convert_json_to_jstruct(
|
|
|
83
87
|
if not values:
|
|
84
88
|
raise ValueError("No valid JSON data found in input files")
|
|
85
89
|
|
|
86
|
-
inferrer = JsonStructureSchemaInferrer(base_id=base_id)
|
|
90
|
+
inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
|
|
87
91
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
88
92
|
|
|
89
93
|
# Ensure output directory exists
|