tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,18 @@ class QueryConverter:
  return self._convert_geo_expr(node)
  elif node_type == "nslookup_expr":
  return self._convert_nslookup_expr(node)
+ elif node_type == "query_with_stats":
+ # For query_with_stats, only convert the filter part
+ # The stats part is handled by the stats engine
+ filter_node = node.get("filter")
+ if filter_node:
+ return self.convert_node(filter_node)
+ else:
+ return {"match_all": {}}
+ elif node_type == "stats_expr":
+ # Pure stats queries match all documents
+ # The aggregations are handled by the stats engine
+ return {"match_all": {}}

  raise TQLValidationError(f"Unknown node type: {node}")
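Illustrative sketch of the two new branches above (not output captured from the package; the node keys "type", "filter", and "aggregations" are assumptions for the example): a pure stats node converts to match_all, while a query_with_stats node only has its filter part converted and the aggregations are built separately by the stats engine.

    # Hypothetical parsed nodes, shown only to illustrate the new branches.
    stats_only = {"type": "stats_expr", "aggregations": [{"function": "count", "field": "*"}]}
    with_filter = {
        "type": "query_with_stats",
        "filter": {"field": "status", "operator": "eq", "value": "200"},
    }
    # converter.convert_node(stats_only)  -> {"match_all": {}}
    # converter.convert_node(with_filter) -> whatever the filter alone converts to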
@@ -87,7 +99,8 @@ class QueryConverter:
  """Convert a comparison operation to OpenSearch query."""
  field_name = node["field"]
  operator = node["operator"]
- value = node["value"]
+ # For exists/not_exists operators, value is None
+ value = node.get("value") if operator not in ["exists", "not_exists"] else None
  field_mutators = node.get("field_mutators", [])

  # Check if mutators change the field type
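A minimal runnable sketch of why this guard matters (the node dicts are invented for the example, but the "field"/"operator"/"value" keys match the code above): an exists comparison carries no value, so node["value"] would raise KeyError.

    node_eq = {"field": "src_ip", "operator": "eq", "value": "10.0.0.1"}
    node_exists = {"field": "src_ip", "operator": "exists"}  # no "value" key at all

    for node in (node_eq, node_exists):
        operator = node["operator"]
        value = node.get("value") if operator not in ["exists", "not_exists"] else None
        print(operator, value)  # -> "eq 10.0.0.1", then "exists None"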
@@ -127,10 +140,16 @@ class QueryConverter:
  # Note: ALL and NOT_ALL operators are handled with script queries and don't need post-processing
  requires_post_processing = node.get("post_process_value", False) or has_type_changing_mutators

- if requires_post_processing:
- # For operations that require post-processing, we need to query more broadly
- # to ensure we get all potentially matching documents
- if operator in [
+ # Also check if we have transform mutators with filtering operators
+ # Transform mutators change the field value, so we need to use exists query
+ has_transform_mutators_with_filter = node.get("has_transform_mutators_with_filter", False)
+
+ # Also check field_mutators directly in case the flag wasn't set
+ if (
+ not has_transform_mutators_with_filter
+ and field_mutators
+ and operator
+ in [
  "eq",
  "=",
  "ne",
@@ -151,10 +170,55 @@ class QueryConverter:
  "lte",
  "between",
  "not_between",
- ]:
- # For these operators, use exists query to get all docs with the field
- # The actual filtering will happen in post-processing
- return {"exists": {"field": opensearch_field}}
+ ]
+ ):
+ # Check if any of the mutators are transform mutators
+ TRANSFORM_MUTATORS = {
+ "lowercase",
+ "uppercase",
+ "trim",
+ "replace",
+ "refang",
+ "defang",
+ "b64encode",
+ "b64decode",
+ "urldecode",
+ }
+ for mutator in field_mutators:
+ if mutator.get("name", "").lower() in TRANSFORM_MUTATORS:
+ has_transform_mutators_with_filter = True
+ break
+
+ if requires_post_processing or has_transform_mutators_with_filter:
+ # For value mutators, type-changing field mutators, or transform mutators that require post-processing, use exists query
+ # But NOT for field mutators like any/all/none - those should not affect the query
+ if node.get("value_mutators") or has_type_changing_mutators or has_transform_mutators_with_filter:
+ # Only for these mutators do we need to broaden the search
+ if operator in [
+ "eq",
+ "=",
+ "ne",
+ "!=",
+ "contains",
+ "not_contains",
+ "startswith",
+ "endswith",
+ "not_startswith",
+ "not_endswith",
+ ">",
+ ">=",
+ "<",
+ "<=",
+ "gt",
+ "gte",
+ "lt",
+ "lte",
+ "between",
+ "not_between",
+ ]:
+ # For these operators with mutators, use exists query to get all docs with the field
+ # The actual filtering will happen in post-processing
+ return {"exists": {"field": opensearch_field}}

  # Handle special wildcard conversion for keyword fields
  if use_wildcard and operator == "contains":
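In short, when a filtering operator is paired with a transform mutator (lowercase, refang, b64decode, ...), the converted OpenSearch query only asserts that the field exists, and the real comparison is deferred to post-processing. A hedged sketch of that shape (field name and TQL phrasing invented for the example):

    # e.g. a TQL filter that lowercases a field before comparing it (syntax illustrative)
    # The broadened query returned by the branch above:
    broadened = {"exists": {"field": "user.name"}}
    # The eq-after-lowercase comparison is then re-applied in post-processing,
    # because OpenSearch cannot run the mutator server-side in this query.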
@@ -162,23 +226,38 @@ class QueryConverter:

  # Convert operator to OpenSearch query
  if operator in ["eq", "="]:
- # Check if we're using a text field
- is_text_field = self._is_text_field(field_name, opensearch_field)
-
- # Use match query for text fields, term for others
- if is_text_field:
- return {"match": {opensearch_field: value}}
+ # For fields with mappings, use the optimized query type
+ if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+ # Check if we're using a text field
+ is_text_field = self._is_text_field(field_name, opensearch_field)
+ if is_text_field:
+ return {"match": {opensearch_field: value}}
+ else:
+ return {"term": {opensearch_field: value}}
  else:
- return {"term": {opensearch_field: value}}
+ # For unmapped fields, use match_phrase for strings (safer default)
+ # This ensures compatibility with both text and keyword fields
+ if isinstance(value, str):
+ return {"match_phrase": {opensearch_field: value}}
+ else:
+ # For non-string values (numbers, booleans), use term query
+ return {"term": {opensearch_field: value}}
  elif operator in ["ne", "!="]:
- # Check if we're using a text field
- is_text_field = self._is_text_field(field_name, opensearch_field)
-
- # Use match query for text fields, term for others
- if is_text_field:
- return {"bool": {"must_not": {"match": {opensearch_field: value}}}}
+ # For fields with mappings, use the optimized query type
+ if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+ # Check if we're using a text field
+ is_text_field = self._is_text_field(field_name, opensearch_field)
+ if is_text_field:
+ return {"bool": {"must_not": {"match": {opensearch_field: value}}}}
+ else:
+ return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
  else:
- return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
+ # For unmapped fields, use match_phrase for strings (safer default)
+ if isinstance(value, str):
+ return {"bool": {"must_not": {"match_phrase": {opensearch_field: value}}}}
+ else:
+ # For non-string values (numbers, booleans), use term query
+ return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
  elif operator in ["gt", ">"]:
  return {"range": {opensearch_field: {"gt": value}}}
  elif operator in ["gte", ">="]:
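A sketch of the query shapes produced by the new mapped/unmapped split for equality (field names and values invented for the example):

    mapped_text     = {"match": {"message": "error"}}        # field in mappings, resolved to a text field
    mapped_keyword  = {"term": {"status.keyword": "error"}}  # field in mappings, non-text field
    unmapped_string = {"match_phrase": {"status": "error"}}  # unmapped field, string value (safer default)
    unmapped_number = {"term": {"retries": 3}}                # unmapped field, non-string value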
@@ -434,8 +513,25 @@ class QueryConverter:
  right_query = self.convert_node(node["right"])

  if operator == "and":
- return {"bool": {"must": [left_query, right_query]}}
+ # Collect all must clauses, flattening where appropriate
+ must_clauses = []
+
+ # Helper function to extract clauses
+ def extract_must_clauses(query):
+ if isinstance(query, dict) and "bool" in query:
+ bool_query = query["bool"]
+ # If it only has must clauses, extract them
+ if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
+ return bool_query["must"]
+ return [query]
+
+ # Extract and flatten must clauses
+ must_clauses.extend(extract_must_clauses(left_query))
+ must_clauses.extend(extract_must_clauses(right_query))
+
+ return {"bool": {"must": must_clauses}}
  elif operator == "or":
+ # OR still needs should clause
  return {"bool": {"should": [left_query, right_query], "minimum_should_match": 1}}
  else:
  raise TQLUnsupportedOperationError(f"Logical operator '{operator}' not supported for OpenSearch")
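A small runnable sketch of the flattening this change introduces: two queries whose bool body contains only a must list are merged into a single must list instead of being nested (the example queries are invented):

    def extract_must_clauses(query):
        # Same idea as the helper above: unwrap a bool query that only has a must list.
        if isinstance(query, dict) and "bool" in query:
            bool_query = query["bool"]
            if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
                return bool_query["must"]
        return [query]

    left = {"bool": {"must": [{"term": {"a": 1}}, {"term": {"b": 2}}]}}
    right = {"term": {"c": 3}}
    merged = {"bool": {"must": extract_must_clauses(left) + extract_must_clauses(right)}}
    # merged == {"bool": {"must": [{"term": {"a": 1}}, {"term": {"b": 2}}, {"term": {"c": 3}}]}}

So a chain like A and B and C now yields one bool.must with three clauses rather than nested bool queries.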
@@ -720,15 +816,28 @@ class QueryConverter:
  value: Value to convert

  Returns:
- Converted value (bool, None, or original)
+ Converted value (bool, None, numeric, or original)
  """
  if isinstance(value, str):
+ # Check for boolean values
  if value.lower() == "true":
  return True
  elif value.lower() == "false":
  return False
  elif value.lower() == "null":
  return None
+ # Check if it's a numeric string
+ elif value.isdigit() or (value.startswith("-") and value[1:].isdigit()):
+ # Convert to integer
+ return int(value)
+ else:
+ # Try to parse as float
+ try:
+ # Check if it has a decimal point
+ if "." in value:
+ return float(value)
+ except ValueError:
+ pass
  return value

  def _is_text_field(self, field_name: str, opensearch_field: str) -> bool:
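Based on the branches above, the extended value coercion should behave roughly as follows (illustrative, not captured from the package):

    # "true"  -> True        "false" -> False       "null" -> None
    # "42"    -> 42          "-7"    -> -7
    # "3.14"  -> 3.14        "1e6"   -> "1e6"   (no decimal point, left as a string)
    # "1.2.3" -> "1.2.3"     (float() raises ValueError, which is swallowed; original string returned)
    # "abc"   -> "abc"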
@@ -52,7 +52,7 @@ def extract_field_mappings_from_opensearch(
  try:
  # Extract field names from the TQL query
  field_names = tql_instance.extract_fields(tql_query)
- logger.info(f"Extracted {len(field_names)} fields from TQL query: {field_names}")
+ logger.debug(f"Extracted {len(field_names)} fields from TQL query: {field_names}")

  if not field_names:
  logger.warning("No fields found in TQL query")
@@ -68,7 +68,7 @@ def extract_field_mappings_from_opensearch(
  # Extract and convert mappings to TQL format
  tql_mappings = _convert_opensearch_mappings_to_tql_format(mapping_response, field_names)

- logger.info(f"Successfully converted mappings for {len(tql_mappings)} fields")
+ logger.debug(f"Successfully converted mappings for {len(tql_mappings)} fields")
  return tql_mappings

  except Exception as e:
tql/opensearch_stats.py CHANGED
@@ -4,6 +4,7 @@ This module translates TQL stats queries to OpenSearch aggregation DSL.
  """

  from typing import Any, Dict, List, Optional, Union
+ import json

  from .exceptions import TQLError

@@ -84,6 +85,9 @@ class OpenSearchStatsTranslator:
  if group_by_fields:
  # Build nested terms aggregations for grouping
  aggs_dsl = self._build_grouped_aggregations(aggregations, group_by_fields, field_mappings)
+ print(
+ f"\n=== OpenSearch Aggregation Query ===\nGroup by: {group_by_fields}\nAggregation DSL: {json.dumps(aggs_dsl, indent=2)}\n"
+ )
  else:
  # Simple aggregations without grouping
  aggs_dsl = self._build_simple_aggregations(aggregations, field_mappings)
@@ -163,14 +167,14 @@ class OpenSearchStatsTranslator:
  def _build_grouped_aggregations(
  self,
  aggregations: List[Dict[str, Any]],
- group_by_fields: List[str],
+ group_by_fields: List[Any],
  field_mappings: Optional[Dict[str, str]] = None,
  ) -> Dict[str, Any]:
  """Build aggregations with grouping.

  Args:
  aggregations: List of aggregation specifications
- group_by_fields: Fields to group by
+ group_by_fields: Fields to group by (can be strings or dicts with bucket_size)
  field_mappings: Optional field mappings

  Returns:
@@ -193,18 +197,38 @@ class OpenSearchStatsTranslator:
  size = agg.get("limit", 10)
  break

+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name, use default bucket size
+ normalized_fields.append({"field": field, "bucket_size": 10})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ bucket_size = field.get("bucket_size", 10) if field.get("bucket_size") is not None else 10
+ normalized_fields.append({"field": field["field"], "bucket_size": bucket_size})
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": 10})
+
  # Build nested terms aggregations for each group_by field
  current_aggs = inner_aggs

  # Process group_by fields in reverse order to build proper nesting
- for field in reversed(group_by_fields):
- terms_agg = {"terms": {"field": field, "size": size}}
+ for i, field_spec in enumerate(reversed(normalized_fields)):
+ field_name = field_spec["field"]
+ bucket_size = field_spec["bucket_size"]
+
+ # Always respect user-specified bucket sizes
+ # The user has explicitly set limits with "top N" syntax
+
+ terms_agg = {"terms": {"field": field_name, "size": bucket_size}}

  # Add ordering if this is the outermost aggregation and we have order field
- if field == group_by_fields[0] and order_field:
+ if field_name == normalized_fields[0]["field"] and order_field:
  # For nested aggregations, we need the full path
  order_path = order_field
- if len(group_by_fields) > 1:
+ if len(normalized_fields) > 1:
  # Multi-level grouping requires special handling
  # OpenSearch doesn't support ordering by sub-aggregations in nested terms
  # We'll need to handle this in post-processing
@@ -217,7 +241,7 @@ class OpenSearchStatsTranslator:
  terms_agg["aggs"] = current_aggs

  # Wrap for next level
- current_aggs = {f"group_by_{field}": terms_agg}
+ current_aggs = {f"group_by_{field_name}": terms_agg}
  return current_aggs
@@ -314,14 +338,14 @@ class OpenSearchStatsTranslator:
  return {"type": "multiple_aggregations", "results": results}

  def _transform_grouped_response(
- self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[str]
+ self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
  ) -> Dict[str, Any]:
  """Transform grouped aggregation response.

  Args:
  response: OpenSearch response
  aggregations: Aggregation specifications
- group_by_fields: Grouping fields
+ group_by_fields: Grouping fields (can be strings or dicts with bucket_size)

  Returns:
  Transformed response
@@ -329,61 +353,78 @@ class OpenSearchStatsTranslator:
  # Navigate to the grouped results
  aggs_data = response.get("aggregations", {})

- # Get the outermost grouping
- first_group_key = f"group_by_{group_by_fields[0]}"
+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name
+ normalized_fields.append({"field": field, "bucket_size": None})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ normalized_fields.append(field)
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": None})
+
+ # Get the outermost grouping - use the field name from the normalized structure
+ first_field_name = normalized_fields[0]["field"]
+ first_group_key = f"group_by_{first_field_name}"
  grouped_data = aggs_data.get(first_group_key, {})

  # Extract buckets
  buckets = grouped_data.get("buckets", [])

- # Transform buckets
+ # Transform buckets - handle multi-level grouping recursively
  results = []
+ print(
+ f"\n=== OpenSearch Response Debug ===\nTotal buckets at top level: {len(buckets)}\nGroup by fields: {group_by_fields}\n"
+ )
  for bucket in buckets:
- result = self._transform_bucket(bucket, aggregations, group_by_fields, 0)
+ result = self._transform_bucket_recursive(bucket, aggregations, normalized_fields, 0)
  if result:
- results.append(result)
+ # Handle the case where recursive transformation returns a list (multi-level)
+ if isinstance(result, list):
+ results.extend(result)
+ else:
+ results.append(result)

- return {"type": "grouped_aggregation", "group_by": group_by_fields, "results": results}
+ return {"type": "stats_grouped", "group_by": group_by_fields, "results": results}

  def _transform_bucket(
- self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[str], level: int
+ self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any], level: int
  ) -> Optional[Dict[str, Any]]:
  """Transform a single bucket from grouped aggregation.

  Args:
  bucket: OpenSearch bucket
  aggregations: Aggregation specifications
- group_by_fields: Grouping fields
+ group_by_fields: Grouping fields (can be strings or dicts with bucket_size)
  level: Current nesting level

  Returns:
  Transformed bucket or None
  """
+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name
+ normalized_fields.append({"field": field, "bucket_size": None})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ normalized_fields.append(field)
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": None})
+
  result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}

  # Add current level key
- if level < len(group_by_fields):
- field = group_by_fields[level]
- result["key"][field] = bucket.get("key")
-
- # Check if there are more levels
- if level + 1 < len(group_by_fields):
- # Navigate to next level
- next_field = group_by_fields[level + 1]
- next_group_key = f"group_by_{next_field}"
-
- if next_group_key in bucket:
- # This is a nested grouping, we need to aggregate the sub-buckets
- # For now, we'll just take the first sub-bucket
- # TODO: Handle proper multi-level grouping
- sub_buckets = bucket[next_group_key].get("buckets", [])
- if sub_buckets:
- sub_result = self._transform_bucket(sub_buckets[0], aggregations, group_by_fields, level + 1)
- if sub_result:
- # Merge keys
- result["key"].update(sub_result["key"])
-
- # Extract aggregation values
+ if level < len(normalized_fields):
+ field_name = normalized_fields[level]["field"]
+ result["key"][field_name] = bucket.get("key")
+
+ # Extract aggregation values at the innermost level
  if len(aggregations) == 1:
  # Single aggregation
  agg = aggregations[0]
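For orientation, a sketch of the transformed shape for a single-level grouping with one count aggregation (keys, counts, and the query that would produce it are all illustrative):

    example = {
        "type": "stats_grouped",
        "group_by": ["src_ip"],
        "results": [
            {"key": {"src_ip": "10.0.0.1"}, "doc_count": 42, "count": 42},
            {"key": {"src_ip": "10.0.0.2"}, "doc_count": 17, "count": 17},
        ],
    }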
@@ -402,6 +443,96 @@ class OpenSearchStatsTranslator:

  return result

+ def _transform_bucket_recursive(
+ self,
+ bucket: Dict[str, Any],
+ aggregations: List[Dict[str, Any]],
+ normalized_fields: List[Dict[str, Any]],
+ level: int,
+ ) -> Optional[Dict[str, Any]]:
+ """Transform a bucket recursively for multi-level grouping.
+
+ Args:
+ bucket: OpenSearch bucket
+ aggregations: Aggregation specifications
+ normalized_fields: Normalized group by fields with field names and bucket sizes
+ level: Current nesting level
+
+ Returns:
+ Transformed bucket or None
+ """
+ result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}
+
+ # Add current level key
+ if level < len(normalized_fields):
+ field_name = normalized_fields[level]["field"]
+ result["key"][field_name] = bucket.get("key")
+
+ # Check if we're at the deepest level (have aggregation values)
+ is_leaf_level = True
+ for agg in aggregations:
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+ if alias in bucket:
+ is_leaf_level = True
+ break
+
+ # Check if there are sub-buckets (nested grouping)
+ next_level_field = None
+ if level + 1 < len(normalized_fields):
+ next_level_field = f"group_by_{normalized_fields[level + 1]['field']}"
+ if next_level_field in bucket and "buckets" in bucket[next_level_field]:
+ is_leaf_level = False
+
+ if is_leaf_level:
+ # Extract aggregation values at the leaf level
+ if len(aggregations) == 1:
+ # Single aggregation
+ agg = aggregations[0]
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+ value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+ agg_key = agg.get("alias") or agg["function"]
+ result[agg_key] = value
+ else:
+ # Multiple aggregations
+ result["aggregations"] = {}
+ for i, agg in enumerate(aggregations):
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_{i}"
+ value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+ agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+ result["aggregations"][agg_key] = value
+ else:
+ # Handle nested buckets
+ sub_buckets = bucket[next_level_field].get("buckets", [])
+ sub_results = []
+ print(
+ f" Level {level}: Processing {len(sub_buckets)} sub-buckets for field {normalized_fields[level + 1]['field']}"
+ )
+ for sub_bucket in sub_buckets:
+ sub_result_data = self._transform_bucket_recursive(
+ sub_bucket, aggregations, normalized_fields, level + 1
+ )
+ if sub_result_data:
+ # Handle the case where sub_result_data might be a list (deeper nesting)
+ if isinstance(sub_result_data, list):
+ for sub_item in sub_result_data:
+ # Merge the keys from current level with sub-level keys
+ merged_key = dict(result["key"])
+ merged_key.update(sub_item["key"])
+ sub_item["key"] = merged_key
+ sub_results.append(sub_item)
+ else:
+ # Merge the keys from different levels
+ merged_key = dict(result["key"])
+ merged_key.update(sub_result_data["key"])
+ sub_result_data["key"] = merged_key
+ sub_results.append(sub_result_data)
+
+ # For multi-level grouping, we return the sub-results as separate entries
+ # This flattens the nested structure into a list of results
+ return sub_results if sub_results else None
+
+ return result
+
  def _extract_aggregation_value( # noqa: C901
  self, agg_result: Dict[str, Any], function: str
  ) -> Union[int, float, Dict[str, Any], List[Any], None]:
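A sketch of how the recursive transform flattens a two-level grouping: every leaf bucket becomes its own result row with the keys from both levels merged (bucket values invented for the example):

    # Hypothetical OpenSearch buckets for grouping by src_ip, then dest_port:
    #   src_ip=10.0.0.1 (doc_count=50)
    #     dest_port=443 (doc_count=30), dest_port=80 (doc_count=20)
    #
    # _transform_bucket_recursive returns one row per leaf bucket, keys merged:
    #   {"key": {"src_ip": "10.0.0.1", "dest_port": 443}, "doc_count": 30, ...}
    #   {"key": {"src_ip": "10.0.0.1", "dest_port": 80},  "doc_count": 20, ...}
    # The outer bucket's doc_count (50) is not emitted as a separate row.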