PyPI - structurize - Versions diffs - 3.2.1__tar.gz → 3.3.0__tar.gz - Mend

structurize 3.2.1__tar.gz → 3.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

{structurize-3.2.1/structurize.egg-info → structurize-3.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: structurize
-Version: 3.2.1
+Version: 3.3.0
 Summary: Tools to convert from and to JSON Structure from various other schema languages.
 Author-email: Clemens Vasters <clemensv@microsoft.com>
 Classifier: Programming Language :: Python :: 3

{structurize-3.2.1 → structurize-3.3.0}/avrotize/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '3.2.1'
-__version_tuple__ = version_tuple = (3, 2, 1)
+__version__ = version = '3.3.0'
+__version_tuple__ = version_tuple = (3, 3, 0)
-__commit_id__ = commit_id = 'gfc8429a20'
+__commit_id__ = commit_id = 'g09d8d822a'

{structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotize.py RENAMED Viewed

@@ -67,7 +67,10 @@ def dynamic_import(module, func):
 def main():
     """Main function for the command line utility."""
     commands = load_commands()
-    parser = argparse.ArgumentParser(description='Convert a variety of schema formats to Avrotize schema and vice versa.')
+    parser = argparse.ArgumentParser(
+        description='Convert a variety of schema formats to Avrotize schema and vice versa.',
+        fromfile_prefix_chars='@'
+    )
     parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
     subparsers = parser.add_subparsers(dest='command')

{structurize-3.2.1 → structurize-3.3.0}/avrotize/choice_inference.py RENAMED Viewed

@@ -222,6 +222,23 @@ def _detect_discriminators(
                     value_to_docs[doc.field_values[field_name]].append(doc)
             if len(value_to_docs) >= 2:
+                # Check 1: Does each value map to a consistent signature?
+                # A perfect discriminator has each value producing identical signatures
+                value_to_sigs: Dict[str, Set[tuple]] = {}
+                for val, val_docs in value_to_docs.items():
+                    sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
+                    value_to_sigs[val] = sigs
+                # Count values with perfectly consistent signatures (all docs same sig)
+                consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
+                consistency_ratio = consistent_values / len(value_to_sigs)
+                # Check 2: Are signatures distinct across values?
+                all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
+                distinct_sigs = set(all_primary_sigs)
+                distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
+                # Check 3: Original inter-similarity check (relaxed to 0.85)
                 all_values = list(value_to_docs.keys())
                 inter_sims = []
                 for i, v1 in enumerate(all_values):
@@ -234,13 +251,97 @@ def _detect_discriminators(
                 avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
-                if avg_inter_sim < 0.7:
+                # Check 4: Discriminator-field correlation (envelope pattern)
+                # If discriminator value matches a unique payload field name, it's legitimate
+                # e.g., _subtype: "play" -> has field "play" that only appears for this value
+                discriminator_field_matches = 0
+                unique_fields_per_value: Dict[str, Set[str]] = {}
+                for disc_val, val_docs in value_to_docs.items():
+                    if not val_docs:
+                        continue
+                    # Get fields unique to this discriminator value
+                    this_sig = val_docs[0].field_signature
+                    other_sigs = [d.field_signature for v, docs in value_to_docs.items()
+                                  if v != disc_val for d in docs[:1]]
+                    if other_sigs:
+                        common_with_others = this_sig.intersection(*other_sigs)
+                        unique_fields = this_sig - common_with_others - {field_name}
+                        unique_fields_per_value[disc_val] = unique_fields
+                        # Check if discriminator value matches any unique field (case-insensitive)
+                        disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
+                        if any(uf.lower() == disc_val_lower for uf in unique_fields):
+                            discriminator_field_matches += 1
+                has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
+                # Check 5: Structural quality - detect sparse data false positives
+                # If unique fields are very few AND overlap across variants, it's likely sparse data
+                should_reject_sparse = False
+                if not has_envelope_pattern and unique_fields_per_value:
+                    all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
+                    if all_unique_fields:
+                        # Count total unique fields across all variants
+                        total_unique = set().union(*all_unique_fields)
+                        avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
+                        # Check for "sparse optional field" pattern:
+                        # - Very few unique fields per variant (1-2)
+                        # - Moderate to high similarity (>0.6)
+                        # - No envelope pattern
+                        # - Few total samples per variant (could be sample noise)
+                        # - Only 2 variants (binary split is more likely to be accidental)
+                        # - NOT a subset pattern (where one variant is just base + extras)
+                        min_samples = min(len(docs) for docs in value_to_docs.values())
+                        num_variants = len(value_to_docs)
+                        # Check for subset pattern (inheritance-like polymorphism)
+                        # If one variant's signature is a subset of another, it's likely real
+                        is_subset_pattern = False
+                        all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
+                        if len(all_sigs) >= 2:
+                            for i, sig1 in enumerate(all_sigs):
+                                for sig2 in all_sigs[i+1:]:
+                                    # Check if one is subset of the other (ignoring discriminator)
+                                    sig1_set = set(sig1) - {field_name}
+                                    sig2_set = set(sig2) - {field_name}
+                                    if sig1_set < sig2_set or sig2_set < sig1_set:
+                                        is_subset_pattern = True
+                                        break
+                                if is_subset_pattern:
+                                    break
+                        # Binary splits with few samples and minimal structural difference
+                        # are most likely sparse data artifacts (unless it's a subset pattern)
+                        if (num_variants == 2 and
+                            avg_unique <= 1.5 and
+                            avg_inter_sim > 0.6 and
+                            min_samples < 5 and
+                            not is_subset_pattern):
+                            should_reject_sparse = True
+                if should_reject_sparse:
+                    continue
+                # Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
+                # OR (c) envelope pattern detected
+                is_discriminator = (
+                    avg_inter_sim < 0.7 or
+                    (consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
+                    has_envelope_pattern
+                )
+                if is_discriminator:
+                    # Use distinctness as correlation score when similarity is high
+                    # Boost score if envelope pattern detected
+                    score = max(1.0 - avg_inter_sim, distinctness_ratio)
+                    if has_envelope_pattern:
+                        score = max(score, 0.95)
                     correlation = {v: i for i, v in enumerate(all_values)}
                     candidates.append(DiscriminatorCandidate(
                         field_name=field_name,
                         values=values,
                         correlation=correlation,
-                        correlation_score=1.0 - avg_inter_sim
+                        correlation_score=score
                     ))
             continue
@@ -395,6 +496,24 @@ def infer_choice_type(
         if best.correlation_score > 0.3:
             clusters = _recluster_by_discriminator(documents, best)
+    # Fallback: if multi-cluster but no discriminator, try single-cluster analysis
+    # This handles cases where clustering threshold merged distinct types
+    if len(clusters) > 1 and not discriminators:
+        # Treat all documents as one cluster for discriminator detection
+        all_sigs = [d.field_signature for d in documents]
+        merged_sig = set().union(*all_sigs) if all_sigs else set()
+        required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
+        single_cluster = [SchemaCluster(
+            id=0,
+            documents=documents,
+            merged_signature=merged_sig,
+            required_fields=required_sig
+        )]
+        fallback_discriminators = _detect_discriminators(documents, single_cluster)
+        if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
+            discriminators = fallback_discriminators
+            clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
     # Single cluster = check for nested discriminator or sparse data
     if len(clusters) == 1:
         nested_result = None

{structurize-3.2.1 → structurize-3.3.0}/avrotize/commands.json RENAMED Viewed

@@ -1163,7 +1163,8 @@
         "type_name": "args.type_name",
         "avro_namespace": "args.namespace",
         "sample_size": "args.sample_size",
-        "infer_choices": "args.infer_choices"
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth"
       }
     },
     "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1207,6 +1208,13 @@
         "action": "store_true",
         "help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
         "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
+        "default": 1,
+        "required": false
       }
     ],
     "suggested_output_file_path": "{input_file_name}.avsc",
@@ -1239,7 +1247,9 @@
         "type_name": "args.type_name",
         "base_id": "args.base_id",
         "sample_size": "args.sample_size",
-        "infer_choices": "args.infer_choices"
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth",
+        "infer_enums": "args.infer_enums"
       }
     },
     "extensions": [".json", ".jsonl", ".ndjson"],
@@ -1284,6 +1294,20 @@
         "action": "store_true",
         "help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
         "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
+        "default": 1,
+        "required": false
+      },
+      {
+        "name": "--infer-enums",
+        "type": "bool",
+        "action": "store_true",
+        "help": "Detect enum types from repeated string values with low cardinality",
+        "required": false
       }
     ],
     "suggested_output_file_path": "{input_file_name}.jstruct.json",

{structurize-3.2.1 → structurize-3.3.0}/avrotize/jsontoschema.py RENAMED Viewed

@@ -22,7 +22,8 @@ def convert_json_to_avro(
     type_name: str = 'Document',
     avro_namespace: str = '',
     sample_size: int = 0,
-    infer_choices: bool = False
+    infer_choices: bool = False,
+    choice_depth: int = 1
 ) -> None:
     """Infers Avro schema from JSON files.
@@ -37,6 +38,7 @@ def convert_json_to_avro(
         avro_namespace: Namespace for generated Avro types
         sample_size: Maximum number of records to sample (0 = all)
         infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
+        choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
     """
     if not input_files:
         raise ValueError("At least one input file is required")
@@ -46,7 +48,7 @@ def convert_json_to_avro(
     if not values:
         raise ValueError("No valid JSON data found in input files")
-    inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
+    inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
     schema = inferrer.infer_from_json_values(type_name, values)
     # Ensure output directory exists
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
     type_name: str = 'Document',
     base_id: str = 'https://example.com/',
     sample_size: int = 0,
-    infer_choices: bool = False
+    infer_choices: bool = False,
+    choice_depth: int = 1,
+    infer_enums: bool = False
 ) -> None:
     """Infers JSON Structure schema from JSON files.
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
         base_id: Base URI for $id generation
         sample_size: Maximum number of records to sample (0 = all)
         infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
+        choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
+        infer_enums: Detect enum types from repeated string values with low cardinality
     """
     if not input_files:
         raise ValueError("At least one input file is required")
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
     if not values:
         raise ValueError("No valid JSON data found in input files")
-    inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
+    inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
+                                            choice_depth=choice_depth, infer_enums=infer_enums)
     schema = inferrer.infer_from_json_values(type_name, values)
     # Ensure output directory exists

structurize 3.2.1__tar.gz → 3.3.0__tar.gz

structurize 3.2.1__tar.gz → 3.3.0__tar.gz