structurize 3.2.1__tar.gz → 3.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {structurize-3.2.1/structurize.egg-info → structurize-3.3.0}/PKG-INFO +1 -1
  2. {structurize-3.2.1 → structurize-3.3.0}/avrotize/_version.py +3 -3
  3. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotize.py +4 -1
  4. {structurize-3.2.1 → structurize-3.3.0}/avrotize/choice_inference.py +121 -2
  5. {structurize-3.2.1 → structurize-3.3.0}/avrotize/commands.json +26 -2
  6. {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsontoschema.py +11 -4
  7. {structurize-3.2.1 → structurize-3.3.0}/avrotize/schema_inference.py +788 -8
  8. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocsharp.py +59 -19
  9. {structurize-3.2.1 → structurize-3.3.0/structurize.egg-info}/PKG-INFO +1 -1
  10. {structurize-3.2.1 → structurize-3.3.0}/.gitignore +0 -0
  11. {structurize-3.2.1 → structurize-3.3.0}/LICENSE +0 -0
  12. {structurize-3.2.1 → structurize-3.3.0}/MANIFEST.in +0 -0
  13. {structurize-3.2.1 → structurize-3.3.0}/README.md +0 -0
  14. {structurize-3.2.1 → structurize-3.3.0}/avrotize/__init__.py +0 -0
  15. {structurize-3.2.1 → structurize-3.3.0}/avrotize/__main__.py +0 -0
  16. {structurize-3.2.1 → structurize-3.3.0}/avrotize/asn1toavro.py +0 -0
  17. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocpp.py +0 -0
  18. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocsharp.py +0 -0
  19. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocsv.py +0 -0
  20. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotodatapackage.py +0 -0
  21. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotodb.py +0 -0
  22. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotogo.py +0 -0
  23. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotographql.py +0 -0
  24. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoiceberg.py +0 -0
  25. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojava.py +0 -0
  26. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojs.py +0 -0
  27. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojsons.py +0 -0
  28. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojstruct.py +0 -0
  29. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotokusto.py +0 -0
  30. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotomd.py +0 -0
  31. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotools.py +0 -0
  32. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoparquet.py +0 -0
  33. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoproto.py +0 -0
  34. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotopython.py +0 -0
  35. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotorust.py +0 -0
  36. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotots.py +0 -0
  37. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoxsd.py +0 -0
  38. {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrovalidator.py +0 -0
  39. {structurize-3.2.1 → structurize-3.3.0}/avrotize/cddltostructure.py +0 -0
  40. {structurize-3.2.1 → structurize-3.3.0}/avrotize/common.py +0 -0
  41. {structurize-3.2.1 → structurize-3.3.0}/avrotize/constants.py +0 -0
  42. {structurize-3.2.1 → structurize-3.3.0}/avrotize/csvtoavro.py +0 -0
  43. {structurize-3.2.1 → structurize-3.3.0}/avrotize/datapackagetoavro.py +0 -0
  44. {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
  45. {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependencies/typescript/node22/package.json +0 -0
  46. {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependency_resolver.py +0 -0
  47. {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependency_version.py +0 -0
  48. {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsonstoavro.py +0 -0
  49. {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsonstostructure.py +0 -0
  50. {structurize-3.2.1 → structurize-3.3.0}/avrotize/jstructtoavro.py +0 -0
  51. {structurize-3.2.1 → structurize-3.3.0}/avrotize/kstructtoavro.py +0 -0
  52. {structurize-3.2.1 → structurize-3.3.0}/avrotize/kustotoavro.py +0 -0
  53. {structurize-3.2.1 → structurize-3.3.0}/avrotize/openapitostructure.py +0 -0
  54. {structurize-3.2.1 → structurize-3.3.0}/avrotize/parquettoavro.py +0 -0
  55. {structurize-3.2.1 → structurize-3.3.0}/avrotize/proto2parser.py +0 -0
  56. {structurize-3.2.1 → structurize-3.3.0}/avrotize/proto3parser.py +0 -0
  57. {structurize-3.2.1 → structurize-3.3.0}/avrotize/prototoavro.py +0 -0
  58. {structurize-3.2.1 → structurize-3.3.0}/avrotize/sqltoavro.py +0 -0
  59. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocddl.py +0 -0
  60. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocpp.py +0 -0
  61. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocsv.py +0 -0
  62. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretodatapackage.py +0 -0
  63. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretodb.py +0 -0
  64. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretogo.py +0 -0
  65. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretographql.py +0 -0
  66. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoiceberg.py +0 -0
  67. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojava.py +0 -0
  68. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojs.py +0 -0
  69. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojsons.py +0 -0
  70. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretokusto.py +0 -0
  71. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretomd.py +0 -0
  72. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoproto.py +0 -0
  73. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretopython.py +0 -0
  74. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretorust.py +0 -0
  75. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretots.py +0 -0
  76. {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoxsd.py +0 -0
  77. {structurize-3.2.1 → structurize-3.3.0}/avrotize/validate.py +0 -0
  78. {structurize-3.2.1 → structurize-3.3.0}/avrotize/xmltoschema.py +0 -0
  79. {structurize-3.2.1 → structurize-3.3.0}/avrotize/xsdtoavro.py +0 -0
  80. {structurize-3.2.1 → structurize-3.3.0}/build.ps1 +0 -0
  81. {structurize-3.2.1 → structurize-3.3.0}/build.sh +0 -0
  82. {structurize-3.2.1 → structurize-3.3.0}/pyproject.toml +0 -0
  83. {structurize-3.2.1 → structurize-3.3.0}/setup.cfg +0 -0
  84. {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/SOURCES.txt +0 -0
  85. {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/dependency_links.txt +0 -0
  86. {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/entry_points.txt +0 -0
  87. {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/requires.txt +0 -0
  88. {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structurize
3
- Version: 3.2.1
3
+ Version: 3.3.0
4
4
  Summary: Tools to convert from and to JSON Structure from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '3.2.1'
32
- __version_tuple__ = version_tuple = (3, 2, 1)
31
+ __version__ = version = '3.3.0'
32
+ __version_tuple__ = version_tuple = (3, 3, 0)
33
33
 
34
- __commit_id__ = commit_id = 'gfc8429a20'
34
+ __commit_id__ = commit_id = 'g09d8d822a'
@@ -67,7 +67,10 @@ def dynamic_import(module, func):
67
67
  def main():
68
68
  """Main function for the command line utility."""
69
69
  commands = load_commands()
70
- parser = argparse.ArgumentParser(description='Convert a variety of schema formats to Avrotize schema and vice versa.')
70
+ parser = argparse.ArgumentParser(
71
+ description='Convert a variety of schema formats to Avrotize schema and vice versa.',
72
+ fromfile_prefix_chars='@'
73
+ )
71
74
  parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
72
75
 
73
76
  subparsers = parser.add_subparsers(dest='command')
@@ -222,6 +222,23 @@ def _detect_discriminators(
222
222
  value_to_docs[doc.field_values[field_name]].append(doc)
223
223
 
224
224
  if len(value_to_docs) >= 2:
225
+ # Check 1: Does each value map to a consistent signature?
226
+ # A perfect discriminator has each value producing identical signatures
227
+ value_to_sigs: Dict[str, Set[tuple]] = {}
228
+ for val, val_docs in value_to_docs.items():
229
+ sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
230
+ value_to_sigs[val] = sigs
231
+
232
+ # Count values with perfectly consistent signatures (all docs same sig)
233
+ consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
234
+ consistency_ratio = consistent_values / len(value_to_sigs)
235
+
236
+ # Check 2: Are signatures distinct across values?
237
+ all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
238
+ distinct_sigs = set(all_primary_sigs)
239
+ distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
240
+
241
+ # Check 3: Original inter-similarity check (relaxed to 0.85)
225
242
  all_values = list(value_to_docs.keys())
226
243
  inter_sims = []
227
244
  for i, v1 in enumerate(all_values):
@@ -234,13 +251,97 @@ def _detect_discriminators(
234
251
 
235
252
  avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
236
253
 
237
- if avg_inter_sim < 0.7:
254
+ # Check 4: Discriminator-field correlation (envelope pattern)
255
+ # If discriminator value matches a unique payload field name, it's legitimate
256
+ # e.g., _subtype: "play" -> has field "play" that only appears for this value
257
+ discriminator_field_matches = 0
258
+ unique_fields_per_value: Dict[str, Set[str]] = {}
259
+ for disc_val, val_docs in value_to_docs.items():
260
+ if not val_docs:
261
+ continue
262
+ # Get fields unique to this discriminator value
263
+ this_sig = val_docs[0].field_signature
264
+ other_sigs = [d.field_signature for v, docs in value_to_docs.items()
265
+ if v != disc_val for d in docs[:1]]
266
+ if other_sigs:
267
+ common_with_others = this_sig.intersection(*other_sigs)
268
+ unique_fields = this_sig - common_with_others - {field_name}
269
+ unique_fields_per_value[disc_val] = unique_fields
270
+ # Check if discriminator value matches any unique field (case-insensitive)
271
+ disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
272
+ if any(uf.lower() == disc_val_lower for uf in unique_fields):
273
+ discriminator_field_matches += 1
274
+
275
+ has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
276
+
277
+ # Check 5: Structural quality - detect sparse data false positives
278
+ # If unique fields are very few AND overlap across variants, it's likely sparse data
279
+ should_reject_sparse = False
280
+ if not has_envelope_pattern and unique_fields_per_value:
281
+ all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
282
+ if all_unique_fields:
283
+ # Count total unique fields across all variants
284
+ total_unique = set().union(*all_unique_fields)
285
+ avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
286
+
287
+ # Check for "sparse optional field" pattern:
288
+ # - Very few unique fields per variant (1-2)
289
+ # - Moderate to high similarity (>0.6)
290
+ # - No envelope pattern
291
+ # - Few total samples per variant (could be sample noise)
292
+ # - Only 2 variants (binary split is more likely to be accidental)
293
+ # - NOT a subset pattern (where one variant is just base + extras)
294
+ min_samples = min(len(docs) for docs in value_to_docs.values())
295
+ num_variants = len(value_to_docs)
296
+
297
+ # Check for subset pattern (inheritance-like polymorphism)
298
+ # If one variant's signature is a subset of another, it's likely real
299
+ is_subset_pattern = False
300
+ all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
301
+ if len(all_sigs) >= 2:
302
+ for i, sig1 in enumerate(all_sigs):
303
+ for sig2 in all_sigs[i+1:]:
304
+ # Check if one is subset of the other (ignoring discriminator)
305
+ sig1_set = set(sig1) - {field_name}
306
+ sig2_set = set(sig2) - {field_name}
307
+ if sig1_set < sig2_set or sig2_set < sig1_set:
308
+ is_subset_pattern = True
309
+ break
310
+ if is_subset_pattern:
311
+ break
312
+
313
+ # Binary splits with few samples and minimal structural difference
314
+ # are most likely sparse data artifacts (unless it's a subset pattern)
315
+ if (num_variants == 2 and
316
+ avg_unique <= 1.5 and
317
+ avg_inter_sim > 0.6 and
318
+ min_samples < 5 and
319
+ not is_subset_pattern):
320
+ should_reject_sparse = True
321
+
322
+ if should_reject_sparse:
323
+ continue
324
+
325
+ # Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
326
+ # OR (c) envelope pattern detected
327
+ is_discriminator = (
328
+ avg_inter_sim < 0.7 or
329
+ (consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
330
+ has_envelope_pattern
331
+ )
332
+
333
+ if is_discriminator:
334
+ # Use distinctness as correlation score when similarity is high
335
+ # Boost score if envelope pattern detected
336
+ score = max(1.0 - avg_inter_sim, distinctness_ratio)
337
+ if has_envelope_pattern:
338
+ score = max(score, 0.95)
238
339
  correlation = {v: i for i, v in enumerate(all_values)}
239
340
  candidates.append(DiscriminatorCandidate(
240
341
  field_name=field_name,
241
342
  values=values,
242
343
  correlation=correlation,
243
- correlation_score=1.0 - avg_inter_sim
344
+ correlation_score=score
244
345
  ))
245
346
  continue
246
347
 
@@ -395,6 +496,24 @@ def infer_choice_type(
395
496
  if best.correlation_score > 0.3:
396
497
  clusters = _recluster_by_discriminator(documents, best)
397
498
 
499
+ # Fallback: if multi-cluster but no discriminator, try single-cluster analysis
500
+ # This handles cases where clustering threshold merged distinct types
501
+ if len(clusters) > 1 and not discriminators:
502
+ # Treat all documents as one cluster for discriminator detection
503
+ all_sigs = [d.field_signature for d in documents]
504
+ merged_sig = set().union(*all_sigs) if all_sigs else set()
505
+ required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
506
+ single_cluster = [SchemaCluster(
507
+ id=0,
508
+ documents=documents,
509
+ merged_signature=merged_sig,
510
+ required_fields=required_sig
511
+ )]
512
+ fallback_discriminators = _detect_discriminators(documents, single_cluster)
513
+ if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
514
+ discriminators = fallback_discriminators
515
+ clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
516
+
398
517
  # Single cluster = check for nested discriminator or sparse data
399
518
  if len(clusters) == 1:
400
519
  nested_result = None
@@ -1163,7 +1163,8 @@
1163
1163
  "type_name": "args.type_name",
1164
1164
  "avro_namespace": "args.namespace",
1165
1165
  "sample_size": "args.sample_size",
1166
- "infer_choices": "args.infer_choices"
1166
+ "infer_choices": "args.infer_choices",
1167
+ "choice_depth": "args.choice_depth"
1167
1168
  }
1168
1169
  },
1169
1170
  "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1207,6 +1208,13 @@
1207
1208
  "action": "store_true",
1208
1209
  "help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
1209
1210
  "required": false
1211
+ },
1212
+ {
1213
+ "name": "--choice-depth",
1214
+ "type": "int",
1215
+ "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
1216
+ "default": 1,
1217
+ "required": false
1210
1218
  }
1211
1219
  ],
1212
1220
  "suggested_output_file_path": "{input_file_name}.avsc",
@@ -1239,7 +1247,9 @@
1239
1247
  "type_name": "args.type_name",
1240
1248
  "base_id": "args.base_id",
1241
1249
  "sample_size": "args.sample_size",
1242
- "infer_choices": "args.infer_choices"
1250
+ "infer_choices": "args.infer_choices",
1251
+ "choice_depth": "args.choice_depth",
1252
+ "infer_enums": "args.infer_enums"
1243
1253
  }
1244
1254
  },
1245
1255
  "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1284,6 +1294,20 @@
1284
1294
  "action": "store_true",
1285
1295
  "help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
1286
1296
  "required": false
1297
+ },
1298
+ {
1299
+ "name": "--choice-depth",
1300
+ "type": "int",
1301
+ "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
1302
+ "default": 1,
1303
+ "required": false
1304
+ },
1305
+ {
1306
+ "name": "--infer-enums",
1307
+ "type": "bool",
1308
+ "action": "store_true",
1309
+ "help": "Detect enum types from repeated string values with low cardinality",
1310
+ "required": false
1287
1311
  }
1288
1312
  ],
1289
1313
  "suggested_output_file_path": "{input_file_name}.jstruct.json",
@@ -22,7 +22,8 @@ def convert_json_to_avro(
22
22
  type_name: str = 'Document',
23
23
  avro_namespace: str = '',
24
24
  sample_size: int = 0,
25
- infer_choices: bool = False
25
+ infer_choices: bool = False,
26
+ choice_depth: int = 1
26
27
  ) -> None:
27
28
  """Infers Avro schema from JSON files.
28
29
 
@@ -37,6 +38,7 @@ def convert_json_to_avro(
37
38
  avro_namespace: Namespace for generated Avro types
38
39
  sample_size: Maximum number of records to sample (0 = all)
39
40
  infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
41
+ choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
40
42
  """
41
43
  if not input_files:
42
44
  raise ValueError("At least one input file is required")
@@ -46,7 +48,7 @@ def convert_json_to_avro(
46
48
  if not values:
47
49
  raise ValueError("No valid JSON data found in input files")
48
50
 
49
- inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
51
+ inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
50
52
  schema = inferrer.infer_from_json_values(type_name, values)
51
53
 
52
54
  # Ensure output directory exists
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
64
66
  type_name: str = 'Document',
65
67
  base_id: str = 'https://example.com/',
66
68
  sample_size: int = 0,
67
- infer_choices: bool = False
69
+ infer_choices: bool = False,
70
+ choice_depth: int = 1,
71
+ infer_enums: bool = False
68
72
  ) -> None:
69
73
  """Infers JSON Structure schema from JSON files.
70
74
 
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
78
82
  base_id: Base URI for $id generation
79
83
  sample_size: Maximum number of records to sample (0 = all)
80
84
  infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
85
+ choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
86
+ infer_enums: Detect enum types from repeated string values with low cardinality
81
87
  """
82
88
  if not input_files:
83
89
  raise ValueError("At least one input file is required")
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
87
93
  if not values:
88
94
  raise ValueError("No valid JSON data found in input files")
89
95
 
90
- inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
96
+ inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
97
+ choice_depth=choice_depth, infer_enums=infer_enums)
91
98
  schema = inferrer.infer_from_json_values(type_name, values)
92
99
 
93
100
  # Ensure output directory exists