PyPI - structurize - Versions diffs - 3.2.2__tar.gz → 3.3.1__tar.gz - Mend

structurize 3.2.2__tar.gz → 3.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

{structurize-3.2.2/structurize.egg-info → structurize-3.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: structurize
-Version: 3.2.2
+Version: 3.3.1
 Summary: Tools to convert from and to JSON Structure from various other schema languages.
 Author-email: Clemens Vasters <clemensv@microsoft.com>
 Classifier: Programming Language :: Python :: 3

{structurize-3.2.2 → structurize-3.3.1}/avrotize/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '3.2.2'
-__version_tuple__ = version_tuple = (3, 2, 2)
+__version__ = version = '3.3.1'
+__version_tuple__ = version_tuple = (3, 3, 1)
-__commit_id__ = commit_id = 'g4fc2eb920'
+__commit_id__ = commit_id = 'g670e64099'

{structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotize.py RENAMED Viewed

@@ -67,7 +67,10 @@ def dynamic_import(module, func):
 def main():
     """Main function for the command line utility."""
     commands = load_commands()
-    parser = argparse.ArgumentParser(description='Convert a variety of schema formats to Avrotize schema and vice versa.')
+    parser = argparse.ArgumentParser(
+        description='Convert a variety of schema formats to Avrotize schema and vice versa.',
+        fromfile_prefix_chars='@'
+    )
     parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
     subparsers = parser.add_subparsers(dest='command')

{structurize-3.2.2 → structurize-3.3.1}/avrotize/choice_inference.py RENAMED Viewed

@@ -214,6 +214,13 @@ def _detect_discriminators(
         if len(values) < 2:
             continue
+        # Skip boolean-like string values - these are flags, not discriminators
+        # A field with only "true"/"false" (or similar) values is not a type discriminator
+        normalized_values = {v.lower() if isinstance(v, str) else str(v).lower() for v in values}
+        boolean_values = {'true', 'false', 'yes', 'no', '0', '1'}
+        if normalized_values <= boolean_values:
+            continue
         # Single cluster with multiple values - check if values create distinct groups
         if len(clusters) == 1:
             value_to_docs: Dict[str, List[DocumentInfo]] = defaultdict(list)
@@ -222,6 +229,23 @@ def _detect_discriminators(
                     value_to_docs[doc.field_values[field_name]].append(doc)
             if len(value_to_docs) >= 2:
+                # Check 1: Does each value map to a consistent signature?
+                # A perfect discriminator has each value producing identical signatures
+                value_to_sigs: Dict[str, Set[tuple]] = {}
+                for val, val_docs in value_to_docs.items():
+                    sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
+                    value_to_sigs[val] = sigs
+                # Count values with perfectly consistent signatures (all docs same sig)
+                consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
+                consistency_ratio = consistent_values / len(value_to_sigs)
+                # Check 2: Are signatures distinct across values?
+                all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
+                distinct_sigs = set(all_primary_sigs)
+                distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
+                # Check 3: Original inter-similarity check (relaxed to 0.85)
                 all_values = list(value_to_docs.keys())
                 inter_sims = []
                 for i, v1 in enumerate(all_values):
@@ -234,13 +258,97 @@ def _detect_discriminators(
                 avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
-                if avg_inter_sim < 0.7:
+                # Check 4: Discriminator-field correlation (envelope pattern)
+                # If discriminator value matches a unique payload field name, it's legitimate
+                # e.g., _subtype: "play" -> has field "play" that only appears for this value
+                discriminator_field_matches = 0
+                unique_fields_per_value: Dict[str, Set[str]] = {}
+                for disc_val, val_docs in value_to_docs.items():
+                    if not val_docs:
+                        continue
+                    # Get fields unique to this discriminator value
+                    this_sig = val_docs[0].field_signature
+                    other_sigs = [d.field_signature for v, docs in value_to_docs.items()
+                                  if v != disc_val for d in docs[:1]]
+                    if other_sigs:
+                        common_with_others = this_sig.intersection(*other_sigs)
+                        unique_fields = this_sig - common_with_others - {field_name}
+                        unique_fields_per_value[disc_val] = unique_fields
+                        # Check if discriminator value matches any unique field (case-insensitive)
+                        disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
+                        if any(uf.lower() == disc_val_lower for uf in unique_fields):
+                            discriminator_field_matches += 1
+                has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
+                # Check 5: Structural quality - detect sparse data false positives
+                # If unique fields are very few AND overlap across variants, it's likely sparse data
+                should_reject_sparse = False
+                if not has_envelope_pattern and unique_fields_per_value:
+                    all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
+                    if all_unique_fields:
+                        # Count total unique fields across all variants
+                        total_unique = set().union(*all_unique_fields)
+                        avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
+                        # Check for "sparse optional field" pattern:
+                        # - Very few unique fields per variant (1-2)
+                        # - Moderate to high similarity (>0.6)
+                        # - No envelope pattern
+                        # - Few total samples per variant (could be sample noise)
+                        # - Only 2 variants (binary split is more likely to be accidental)
+                        # - NOT a subset pattern (where one variant is just base + extras)
+                        min_samples = min(len(docs) for docs in value_to_docs.values())
+                        num_variants = len(value_to_docs)
+                        # Check for subset pattern (inheritance-like polymorphism)
+                        # If one variant's signature is a subset of another, it's likely real
+                        is_subset_pattern = False
+                        all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
+                        if len(all_sigs) >= 2:
+                            for i, sig1 in enumerate(all_sigs):
+                                for sig2 in all_sigs[i+1:]:
+                                    # Check if one is subset of the other (ignoring discriminator)
+                                    sig1_set = set(sig1) - {field_name}
+                                    sig2_set = set(sig2) - {field_name}
+                                    if sig1_set < sig2_set or sig2_set < sig1_set:
+                                        is_subset_pattern = True
+                                        break
+                                if is_subset_pattern:
+                                    break
+                        # Binary splits with few samples and minimal structural difference
+                        # are most likely sparse data artifacts (unless it's a subset pattern)
+                        if (num_variants == 2 and
+                            avg_unique <= 1.5 and
+                            avg_inter_sim > 0.6 and
+                            min_samples < 5 and
+                            not is_subset_pattern):
+                            should_reject_sparse = True
+                if should_reject_sparse:
+                    continue
+                # Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
+                # OR (c) envelope pattern detected
+                is_discriminator = (
+                    avg_inter_sim < 0.7 or
+                    (consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
+                    has_envelope_pattern
+                )
+                if is_discriminator:
+                    # Use distinctness as correlation score when similarity is high
+                    # Boost score if envelope pattern detected
+                    score = max(1.0 - avg_inter_sim, distinctness_ratio)
+                    if has_envelope_pattern:
+                        score = max(score, 0.95)
                     correlation = {v: i for i, v in enumerate(all_values)}
                     candidates.append(DiscriminatorCandidate(
                         field_name=field_name,
                         values=values,
                         correlation=correlation,
-                        correlation_score=1.0 - avg_inter_sim
+                        correlation_score=score
                     ))
             continue
@@ -395,6 +503,24 @@ def infer_choice_type(
         if best.correlation_score > 0.3:
             clusters = _recluster_by_discriminator(documents, best)
+    # Fallback: if multi-cluster but no discriminator, try single-cluster analysis
+    # This handles cases where clustering threshold merged distinct types
+    if len(clusters) > 1 and not discriminators:
+        # Treat all documents as one cluster for discriminator detection
+        all_sigs = [d.field_signature for d in documents]
+        merged_sig = set().union(*all_sigs) if all_sigs else set()
+        required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
+        single_cluster = [SchemaCluster(
+            id=0,
+            documents=documents,
+            merged_signature=merged_sig,
+            required_fields=required_sig
+        )]
+        fallback_discriminators = _detect_discriminators(documents, single_cluster)
+        if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
+            discriminators = fallback_discriminators
+            clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
     # Single cluster = check for nested discriminator or sparse data
     if len(clusters) == 1:
         nested_result = None

{structurize-3.2.2 → structurize-3.3.1}/avrotize/commands.json RENAMED Viewed

@@ -764,7 +764,10 @@
         "avro_namespace": "args.namespace",
         "avro_schema_file": "output_file_path",
         "emit_cloudevents": "args.emit_cloudevents",
-        "emit_cloudevents_xregistry": "args.emit_xregistry"
+        "emit_cloudevents_xregistry": "args.emit_xregistry",
+        "sample_size": "args.sample_size",
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth"
       }
     },
     "extensions": [
@@ -819,6 +822,26 @@
         "type": "bool",
         "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single Avrotize schema",
         "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
+        "default": 100,
+        "required": false
+      },
+      {
+        "name": "--infer-choices",
+        "type": "bool",
+        "help": "Detect discriminated unions in dynamic fields and emit as Avro unions with discriminator defaults",
+        "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
+        "default": 1,
+        "required": false
       }
     ],
     "suggested_output_file_path": "{kusto_database}.avsc",
@@ -846,6 +869,131 @@
     ],
     "skip_input_file_handling": true
   },
+  {
+    "command": "k2s",
+    "description": "Convert Kusto schema to JSON Structure schema",
+    "group": "1_Schemas",
+    "function": {
+      "name": "avrotize.kustotojstruct.convert_kusto_to_jstruct",
+      "args": {
+        "kusto_uri": "args.kusto_uri",
+        "kusto_database": "args.kusto_database",
+        "table_name": "args.table_name",
+        "base_id": "args.base_id",
+        "jstruct_schema_file": "output_file_path",
+        "emit_cloudevents": "args.emit_cloudevents",
+        "emit_cloudevents_xregistry": "args.emit_xregistry",
+        "sample_size": "args.sample_size",
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth",
+        "infer_enums": "args.infer_enums"
+      }
+    },
+    "extensions": [
+      ".kusto"
+    ],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "?",
+        "help": "Kusto file",
+        "required": false
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the JSON Structure schema file",
+        "required": false
+      },
+      {
+        "name": "--kusto-uri",
+        "type": "str",
+        "help": "Kusto Cluster URI",
+        "required": false
+      },
+      {
+        "name": "--kusto-database",
+        "type": "str",
+        "help": "Kusto database",
+        "required": false
+      },
+      {
+        "name": "--table-name",
+        "type": "str",
+        "help": "Kusto table name",
+        "required": false
+      },
+      {
+        "name": "--base-id",
+        "type": "str",
+        "help": "Base URI for $id generation",
+        "required": false
+      },
+      {
+        "name": "--emit-cloudevents",
+        "type": "bool",
+        "help": "Emit CloudEvents declarations for each table",
+        "required": false
+      },
+      {
+        "name": "--emit-xregistry",
+        "type": "bool",
+        "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single JSON Structure schema",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
+        "default": 100,
+        "required": false
+      },
+      {
+        "name": "--infer-choices",
+        "type": "bool",
+        "help": "Detect discriminated unions in dynamic fields and emit as choice types with discriminator defaults",
+        "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
+        "default": 1,
+        "required": false
+      },
+      {
+        "name": "--infer-enums",
+        "type": "bool",
+        "help": "Detect enum types from repeated string values with low cardinality in dynamic fields",
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{kusto_database}.jstruct.json",
+    "prompts": [
+      {
+        "name": "--base-id",
+        "message": "Enter the base URI for $id generation",
+        "type": "str",
+        "required": false
+      },
+      {
+        "name": "--emit-cloudevents",
+        "message": "Emit CloudEvents declarations for each table?",
+        "type": "bool",
+        "default": false,
+        "required": false
+      },
+      {
+        "name": "--emit-xregistry",
+        "message": "Emit an xRegistry manifest with CloudEvents declarations?",
+        "type": "bool",
+        "default": false,
+        "required": false
+      }
+    ],
+    "skip_input_file_handling": true
+  },
   {
     "command": "a2sql",
     "description": "Convert Avrotize schema to SQL schema",
@@ -1163,10 +1311,15 @@
         "type_name": "args.type_name",
         "avro_namespace": "args.namespace",
         "sample_size": "args.sample_size",
-        "infer_choices": "args.infer_choices"
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth"
       }
     },
-    "extensions": [".json", ".jsonl", ".ndjson"],
+    "extensions": [
+      ".json",
+      ".jsonl",
+      ".ndjson"
+    ],
     "args": [
       {
         "name": "input",
@@ -1207,6 +1360,13 @@
         "action": "store_true",
         "help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
         "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
+        "default": 1,
+        "required": false
       }
     ],
     "suggested_output_file_path": "{input_file_name}.avsc",
@@ -1239,10 +1399,16 @@
         "type_name": "args.type_name",
         "base_id": "args.base_id",
         "sample_size": "args.sample_size",
-        "infer_choices": "args.infer_choices"
+        "infer_choices": "args.infer_choices",
+        "choice_depth": "args.choice_depth",
+        "infer_enums": "args.infer_enums"
       }
     },
-    "extensions": [".json", ".jsonl", ".ndjson"],
+    "extensions": [
+      ".json",
+      ".jsonl",
+      ".ndjson"
+    ],
     "args": [
       {
         "name": "input",
@@ -1284,6 +1450,20 @@
         "action": "store_true",
         "help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
         "required": false
+      },
+      {
+        "name": "--choice-depth",
+        "type": "int",
+        "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
+        "default": 1,
+        "required": false
+      },
+      {
+        "name": "--infer-enums",
+        "type": "bool",
+        "action": "store_true",
+        "help": "Detect enum types from repeated string values with low cardinality",
+        "required": false
       }
     ],
     "suggested_output_file_path": "{input_file_name}.jstruct.json",
@@ -1319,7 +1499,9 @@
         "sample_size": "args.sample_size"
       }
     },
-    "extensions": [".xml"],
+    "extensions": [
+      ".xml"
+    ],
     "args": [
       {
         "name": "input",
@@ -1387,7 +1569,9 @@
         "sample_size": "args.sample_size"
       }
     },
-    "extensions": [".xml"],
+    "extensions": [
+      ".xml"
+    ],
     "args": [
       {
         "name": "input",
@@ -1456,7 +1640,10 @@
         "quiet": "args.quiet"
       }
     },
-    "extensions": [".json", ".jsonl"],
+    "extensions": [
+      ".json",
+      ".jsonl"
+    ],
     "args": [
       {
         "name": "input",
@@ -1643,7 +1830,10 @@
         "name": "--format",
         "type": "str",
         "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
-        "choices": ["schema", "arrow"],
+        "choices": [
+          "schema",
+          "arrow"
+        ],
         "default": "arrow",
         "required": false
       }
@@ -1707,7 +1897,10 @@
         "name": "--format",
         "type": "str",
         "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
-        "choices": ["schema", "arrow"],
+        "choices": [
+          "schema",
+          "arrow"
+        ],
         "default": "arrow",
         "required": false
       }
@@ -2676,7 +2869,10 @@
         "avro_annotation": "args.avro_annotation"
       }
     },
-    "extensions": [".struct.json", ".json"],
+    "extensions": [
+      ".struct.json",
+      ".json"
+    ],
     "args": [
       {
         "name": "input",

{structurize-3.2.2 → structurize-3.3.1}/avrotize/jsontoschema.py RENAMED Viewed

@@ -22,7 +22,8 @@ def convert_json_to_avro(
     type_name: str = 'Document',
     avro_namespace: str = '',
     sample_size: int = 0,
-    infer_choices: bool = False
+    infer_choices: bool = False,
+    choice_depth: int = 1
 ) -> None:
     """Infers Avro schema from JSON files.
@@ -37,6 +38,7 @@ def convert_json_to_avro(
         avro_namespace: Namespace for generated Avro types
         sample_size: Maximum number of records to sample (0 = all)
         infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
+        choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
     """
     if not input_files:
         raise ValueError("At least one input file is required")
@@ -46,7 +48,7 @@ def convert_json_to_avro(
     if not values:
         raise ValueError("No valid JSON data found in input files")
-    inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
+    inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
     schema = inferrer.infer_from_json_values(type_name, values)
     # Ensure output directory exists
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
     type_name: str = 'Document',
     base_id: str = 'https://example.com/',
     sample_size: int = 0,
-    infer_choices: bool = False
+    infer_choices: bool = False,
+    choice_depth: int = 1,
+    infer_enums: bool = False
 ) -> None:
     """Infers JSON Structure schema from JSON files.
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
         base_id: Base URI for $id generation
         sample_size: Maximum number of records to sample (0 = all)
         infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
+        choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
+        infer_enums: Detect enum types from repeated string values with low cardinality
     """
     if not input_files:
         raise ValueError("At least one input file is required")
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
     if not values:
         raise ValueError("No valid JSON data found in input files")
-    inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
+    inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
+                                            choice_depth=choice_depth, infer_enums=infer_enums)
     schema = inferrer.infer_from_json_values(type_name, values)
     # Ensure output directory exists
@@ -103,7 +110,7 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
     """Loads JSON values from files.
     Handles both single JSON documents and JSON Lines (JSONL) files.
-    Arrays at the root level are flattened into individual values.
+    Top-level arrays are treated as single array values, not flattened.
     Args:
         input_files: List of file paths
@@ -127,14 +134,8 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
         # Try parsing as a single JSON document first
         try:
             data = json.loads(content)
-            if isinstance(data, list):
-                # Root-level array: each element is a separate value
-                for item in data:
-                    values.append(item)
-                    if sample_size > 0 and len(values) >= sample_size:
-                        break
-            else:
-                values.append(data)
+            # Treat any valid JSON (including arrays) as a single value
+            values.append(data)
             continue
         except json.JSONDecodeError:
             pass

{structurize-3.2.2 → structurize-3.3.1}/avrotize/kustotoavro.py RENAMED Viewed

@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Tuple
 from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
 from avrotize.common import get_tree_hash
 from avrotize.constants import AVRO_VERSION
+from avrotize.schema_inference import AvroSchemaInferrer
 JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
@@ -14,7 +15,7 @@ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
 class KustoToAvro:
     """ Converts Kusto table schemas to Avro schema format."""
-    def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None):
+    def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
         """ Initializes the KustoToAvro class with the Kusto URI and database name. """
         kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
         self.client = KustoClient(kcsb)
@@ -24,6 +25,9 @@ class KustoToAvro:
         self.avro_schema_path = avro_schema_path
         self.emit_xregistry = emit_cloudevents_xregistry
         self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
+        self.sample_size = sample_size if sample_size > 0 else 100
+        self.infer_choices = infer_choices
+        self.choice_depth = choice_depth
         if self.emit_xregistry:
             if not self.avro_namespace:
                 raise ValueError(
@@ -172,20 +176,19 @@ class KustoToAvro:
             type_value: The value of the type column (if any)
         """
         type_column_name = type_column['Name'] if type_column else None
-        query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take 100"
+        query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
         rows = self.client.execute(self.kusto_database, query)
         values = [row[column_name] for row in rows.primary_results[0]]
         type_name = type_value if type_value else f"{table_name}.{column_name}"
-        unique_types = self.consolidated_type_list(type_name, values)
-        if len(unique_types) > 1:
-            # Using a union of inferred types
-            return unique_types
-        elif len(unique_types) == 1:
-            # Single type, no need for union
-            return unique_types[0]
-        else:
-            # No values, default to string
-            return "string"
+        # Use the new AvroSchemaInferrer for consistent inference
+        inferrer = AvroSchemaInferrer(
+            namespace=self.avro_namespace,
+            altnames_key='kql',
+            infer_choices=self.infer_choices,
+            choice_depth=self.choice_depth
+        )
+        return inferrer.infer_from_json_values(type_name, values)
     type_map : Dict[str, JsonNode] = {
             "int": "int",
@@ -440,7 +443,7 @@ class KustoToAvro:
             json.dump(output, avro_file, indent=4)
-def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None):
+def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
     """ Converts Kusto table schemas to Avro schema format."""
     if not kusto_uri:
@@ -451,5 +454,5 @@ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str |
         avro_namespace = kusto_database
     kusto_to_avro = KustoToAvro(
-        kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file,emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider)
+        kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth)
     return kusto_to_avro.process_all_tables()

structurize 3.2.2__tar.gz → 3.3.1__tar.gz

structurize 3.2.2__tar.gz → 3.3.1__tar.gz