tellaro-query-language 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tql/stats_evaluator.py CHANGED
@@ -6,7 +6,7 @@ aggregation queries against data records in memory.
 
 import statistics
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from .exceptions import TQLError
 
@@ -58,6 +58,354 @@ class TQLStatsEvaluator:
     def __init__(self):
         """Initialize the stats evaluator."""
 
+    def evaluate_stats_streaming(
+        self,
+        record_iterator: Any,
+        stats_ast: Dict[str, Any],
+        field_mappings: Optional[Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]]] = None,
+    ) -> Dict[str, Any]:
+        """Evaluate stats query against streaming records.
+
+        This method processes records incrementally using accumulators to minimize
+        memory usage for large datasets.
+
+        Args:
+            record_iterator: Iterator/generator yielding records
+            stats_ast: Stats AST from parser
+            field_mappings: Optional field type mappings
+
+        Returns:
+            Aggregated results in UI-friendly format
+        """
+        aggregations = stats_ast.get("aggregations", [])
+        group_by_fields = stats_ast.get("group_by", [])
+
+        # Validate aggregation types against field mappings if provided
+        if field_mappings:
+            self._validate_aggregations(aggregations, field_mappings)
+
+        if not group_by_fields:
+            # Simple aggregation without grouping (streaming accumulators)
+            return self._streaming_simple_aggregation(record_iterator, aggregations)
+        else:
+            # Grouped aggregation (still needs to track groups)
+            return self._streaming_grouped_aggregation(record_iterator, aggregations, group_by_fields)
+
+    def _streaming_simple_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Perform streaming aggregation without grouping.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+
+        Returns:
+            Aggregated results
+        """
+        # Initialize accumulators for each aggregation
+        accumulators = {}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            accumulators[key] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],  # For unique, values, percentiles
+                "unique_set": set(),  # For unique_count
+            }
+
+        # Process records
+        for record in record_iterator:
+            for agg in aggregations:
+                func = agg["function"]
+                field = agg["field"]
+                key = f"{func}_{field}"
+                acc = accumulators[key]
+
+                # Handle count(*)
+                if func == "count" and field == "*":
+                    acc["count"] += 1
+                    continue
+
+                # Get field value
+                value = self._get_field_value(record, field)
+                if value is None:
+                    continue
+
+                # Update accumulator based on function
+                if func == "count":
+                    acc["count"] += 1
+                elif func == "unique_count":
+                    try:
+                        acc["unique_set"].add(value)
+                    except TypeError:
+                        # Unhashable type, use string representation
+                        acc["unique_set"].add(str(value))
+                elif func in ["sum", "min", "max", "average", "avg"]:
+                    numeric_value = self._to_numeric(value)
+                    acc["sum"] += numeric_value
+                    acc["count"] += 1
+                    if acc["min"] is None or numeric_value < acc["min"]:
+                        acc["min"] = numeric_value
+                    if acc["max"] is None or numeric_value > acc["max"]:
+                        acc["max"] = numeric_value
+                elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                    # Need to store all values for these
+                    acc["values"].append(self._to_numeric(value))
+                elif func in ["values", "unique", "cardinality"]:
+                    # Store unique values
+                    if value not in acc["values"]:
+                        acc["values"].append(value)
+
+        # Calculate final results
+        if len(aggregations) == 1:
+            agg = aggregations[0]
+            value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+            return {
+                "type": "simple_aggregation",
+                "function": agg["function"],
+                "field": agg["field"],
+                "alias": agg.get("alias"),
+                "value": value,
+            }
+        else:
+            results = {}
+            for agg in aggregations:
+                value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+                key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                results[key] = value
+            return {"type": "multiple_aggregations", "results": results}
+
+    def _streaming_grouped_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
+    ) -> Dict[str, Any]:
+        """Perform streaming grouped aggregation.
+
+        For grouped aggregations, we still need to track groups in memory,
+        but we process records one at a time.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+            group_by_fields: Fields to group by
+
+        Returns:
+            Grouped aggregation results
+        """
+        # Normalize group_by_fields
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append({"field": field, "bucket_size": None})
+            elif isinstance(field, dict):
+                normalized_fields.append(field)
+            else:
+                normalized_fields.append({"field": str(field), "bucket_size": None})
+
+        # Track groups with accumulators
+        groups: Dict[Tuple[Any, ...], Dict[str, Any]] = defaultdict(
+            lambda: self._create_group_accumulators(aggregations)
+        )
+        key_mapping: Dict[Tuple[Any, ...], List[Tuple[str, Any]]] = {}
+
+        # Process records
+        for record in record_iterator:
+            # Build group key
+            key_parts = []
+            for field_spec in normalized_fields:
+                field_name = field_spec.get("field")
+                if field_name is None:
+                    continue
+                value = self._get_field_value(record, field_name)
+                key_parts.append((field_name, value))
+
+            hashable_key = self._make_hashable_key(key_parts)
+
+            # Store key mapping
+            if hashable_key not in key_mapping:
+                key_mapping[hashable_key] = key_parts
+
+            # Update accumulators for this group
+            group_accs = groups[hashable_key]
+            self._update_group_accumulators(group_accs, record, aggregations)
+
+        # Finalize results
+        results = []
+        for hashable_key, group_accs in groups.items():
+            original_key = key_mapping[hashable_key]
+            group_result = {"key": dict(original_key), "doc_count": group_accs["doc_count"]}
+
+            if len(aggregations) == 1:
+                agg = aggregations[0]
+                value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                agg_key = agg.get("alias") or agg["function"]
+                group_result[agg_key] = value
+            else:
+                group_result["aggregations"] = {}
+                for agg in aggregations:
+                    value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                    agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                    group_result["aggregations"][agg_key] = value
+
+            results.append(group_result)
+
+        # Apply modifiers and bucket limits
+        results = self._apply_modifiers(results, aggregations)
+        results = self._apply_bucket_limits(results, normalized_fields)
+
+        # Extract field names for response
+        group_by_field_names = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                group_by_field_names.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                group_by_field_names.append(field["field"])
+            else:
+                group_by_field_names.append(str(field))
+
+        return {"type": "grouped_aggregation", "group_by": group_by_field_names, "results": results}
+
+    def _create_group_accumulators(self, aggregations: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Create accumulator structure for a single group.
+
+        Args:
+            aggregations: Aggregation specifications
+
+        Returns:
+            Dictionary of accumulators
+        """
+        accumulators: Dict[str, Any] = {"doc_count": 0}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            acc_value: Dict[str, Any] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],
+                "unique_set": set(),
+            }
+            accumulators[key] = acc_value
+        return accumulators
+
+    def _update_group_accumulators(  # noqa: C901
+        self, group_accs: Dict[str, Any], record: Dict[str, Any], aggregations: List[Dict[str, Any]]
+    ) -> None:
+        """Update group accumulators with a new record.
+
+        Args:
+            group_accs: Group accumulators dictionary
+            record: Record to process
+            aggregations: Aggregation specifications
+        """
+        group_accs["doc_count"] += 1
+
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+            acc = group_accs[key]
+
+            # Handle count(*)
+            if func == "count" and field == "*":
+                acc["count"] += 1
+                continue
+
+            # Get field value
+            value = self._get_field_value(record, field)
+            if value is None:
+                continue
+
+            # Update accumulator
+            if func == "count":
+                acc["count"] += 1
+            elif func == "unique_count":
+                try:
+                    acc["unique_set"].add(value)
+                except TypeError:
+                    acc["unique_set"].add(str(value))
+            elif func in ["sum", "min", "max", "average", "avg"]:
+                numeric_value = self._to_numeric(value)
+                acc["sum"] += numeric_value
+                acc["count"] += 1
+                if acc["min"] is None or numeric_value < acc["min"]:
+                    acc["min"] = numeric_value
+                if acc["max"] is None or numeric_value > acc["max"]:
+                    acc["max"] = numeric_value
+            elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                acc["values"].append(self._to_numeric(value))
+            elif func in ["values", "unique", "cardinality"]:
+                if value not in acc["values"]:
+                    acc["values"].append(value)
+
+    def _finalize_accumulator(self, acc: Dict[str, Any], agg_spec: Dict[str, Any]) -> Any:  # noqa: C901
+        """Finalize an accumulator to produce the final aggregation value.
+
+        Args:
+            acc: Accumulator dictionary
+            agg_spec: Aggregation specification
+
+        Returns:
+            Final aggregated value
+        """
+        func = agg_spec["function"]
+
+        if func == "count":
+            return acc["count"]
+        elif func == "unique_count":
+            return len(acc["unique_set"])
+        elif func == "sum":
+            return acc["sum"]
+        elif func == "min":
+            return acc["min"]
+        elif func == "max":
+            return acc["max"]
+        elif func in ["average", "avg"]:
+            return acc["sum"] / acc["count"] if acc["count"] > 0 else None
+        elif func in ["median", "med"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            return statistics.median(sorted_values)
+        elif func in ["std", "standard_deviation"]:
+            if len(acc["values"]) < 2:
+                return None
+            return statistics.stdev(acc["values"])
+        elif func in ["percentile", "percentiles", "p", "pct"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            percentile_values = agg_spec.get("percentile_values", [50])
+
+            if len(percentile_values) == 1:
+                return self._calculate_percentile(sorted_values, percentile_values[0])
+            else:
+                result = {}
+                for p in percentile_values:
+                    result[f"p{int(p)}"] = self._calculate_percentile(sorted_values, p)
+                return result
+        elif func in ["values", "unique", "cardinality"]:
+            unique_values = acc["values"]
+            try:
+                unique_values.sort()
+            except TypeError:
+                pass
+            return unique_values
+        else:
+            return None
+
     def evaluate_stats(
         self, records: List[Dict[str, Any]], stats_ast: Dict[str, Any], field_mappings: Optional[Dict[str, str]] = None
     ) -> Dict[str, Any]:
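
The new evaluate_stats_streaming entry point accepts any iterator or generator, so large datasets can be aggregated without first being materialized as a list. A minimal usage sketch (the stats AST is hand-built here with the "aggregations" and "group_by" keys the method reads; in practice it would come from the TQL parser, and the record shape and field names are hypothetical):

from tql.stats_evaluator import TQLStatsEvaluator

def read_records():
    # Stand-in for a large source (file, database cursor, message queue):
    # records are yielded one at a time, never held in memory together.
    for i in range(1_000_000):
        yield {"bytes": i % 4096}

stats_ast = {
    "aggregations": [{"function": "average", "field": "bytes", "alias": "avg_bytes"}],
    "group_by": [],
}

result = TQLStatsEvaluator().evaluate_stats_streaming(read_records(), stats_ast)
# With a single aggregation and no grouping this returns:
# {"type": "simple_aggregation", "function": "average", "field": "bytes",
#  "alias": "avg_bytes", "value": <sum / count over all records>}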
@@ -85,7 +433,11 @@ class TQLStatsEvaluator:
             # Grouped aggregation
             return self._grouped_aggregation(records, aggregations, group_by_fields)
 
-    def _validate_aggregations(self, aggregations: List[Dict[str, Any]], field_mappings: Dict[str, str]) -> None:
+    def _validate_aggregations(
+        self,
+        aggregations: List[Dict[str, Any]],
+        field_mappings: Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]],
+    ) -> None:
         """Validate that aggregation functions are compatible with field types.
 
         Args:
@@ -146,7 +498,7 @@ class TQLStatsEvaluator:
 
         return {"type": "multiple_aggregations", "results": results}
 
-    def _grouped_aggregation(
+    def _grouped_aggregation(  # noqa: C901
        self, records: List[Dict[str, Any]], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
     ) -> Dict[str, Any]:
         """Perform aggregation with grouping.
@@ -180,7 +532,9 @@ class TQLStatsEvaluator:
             # Build group key
             key_parts = []
             for field_spec in normalized_fields:
-                field_name = field_spec["field"]
+                field_name = field_spec.get("field")
+                if field_name is None:
+                    continue
                 value = self._get_field_value(record, field_name)
                 key_parts.append((field_name, value))
 
@@ -371,7 +725,7 @@ class TQLStatsEvaluator:
 
         return results
 
-    def _apply_bucket_limits(
+    def _apply_bucket_limits(  # noqa: C901
         self, results: List[Dict[str, Any]], normalized_fields: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
         """Apply per-field bucket size limits to results.
@@ -407,8 +761,8 @@ class TQLStatsEvaluator:
         filtered_results = []
 
         # Track unique values at each level
-        level_values = {}
-        for level, field_spec in enumerate(normalized_fields):
+        level_values: Dict[int, Dict[Any, Set[Any]]] = {}
+        for level, _field_spec in enumerate(normalized_fields):
             level_values[level] = {}
 
         for result in results:
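
For grouped queries the streaming path keeps one accumulator per distinct group key, so peak memory scales with the number of groups rather than the number of records. A sketch under the same assumptions as the example above (hypothetical host/bytes records):

from tql.stats_evaluator import TQLStatsEvaluator

events = iter([
    {"host": "web-1", "bytes": 100},
    {"host": "web-2", "bytes": 250},
    {"host": "web-1", "bytes": 300},
])

stats_ast = {
    "aggregations": [{"function": "sum", "field": "bytes"}],
    # Plain strings are normalized internally to {"field": "host", "bucket_size": None}.
    "group_by": ["host"],
}

result = TQLStatsEvaluator().evaluate_stats_streaming(events, stats_ast)
# result["results"] holds one bucket per distinct host, e.g.
# {"key": {"host": "web-1"}, "doc_count": 2, "sum": 400}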