tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/METADATA +24 -1
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/RECORD +27 -27
- tql/core.py +225 -54
- tql/core_components/opensearch_operations.py +415 -99
- tql/core_components/stats_operations.py +11 -1
- tql/evaluator.py +39 -2
- tql/evaluator_components/special_expressions.py +25 -6
- tql/evaluator_components/value_comparison.py +31 -3
- tql/mutator_analyzer.py +640 -242
- tql/mutators/__init__.py +5 -1
- tql/mutators/dns.py +76 -53
- tql/mutators/security.py +101 -100
- tql/mutators/string.py +74 -0
- tql/opensearch_components/field_mapping.py +9 -3
- tql/opensearch_components/lucene_converter.py +12 -0
- tql/opensearch_components/query_converter.py +134 -25
- tql/opensearch_mappings.py +2 -2
- tql/opensearch_stats.py +170 -39
- tql/parser.py +92 -37
- tql/parser_components/ast_builder.py +37 -1
- tql/parser_components/field_extractor.py +9 -1
- tql/parser_components/grammar.py +32 -8
- tql/post_processor.py +489 -31
- tql/stats_evaluator.py +170 -12
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/LICENSE +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/WHEEL +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/entry_points.txt +0 -0
--- a/tql/core_components/opensearch_operations.py
+++ b/tql/core_components/opensearch_operations.py
@@ -45,11 +45,20 @@ class OpenSearchOperations:
         # Parse the query
         ast = self.parser.parse(query)
 
+        # Analyze the query for mutators
+        from ..mutator_analyzer import MutatorAnalyzer
+
+        analyzer = MutatorAnalyzer(self.field_mappings)
+        analysis_result = analyzer.analyze_ast(ast, context="opensearch")
+
+        # Use the optimized AST (with array operators removed)
+        optimized_ast = analysis_result.optimized_ast
+
         # Create OpenSearch backend
         backend = OpenSearchBackend(field_mappings=self.field_mappings)
 
-        # Convert to OpenSearch query
-        opensearch_query = backend.convert(
+        # Convert to OpenSearch query using the optimized AST
+        opensearch_query = backend.convert(optimized_ast)
 
         return opensearch_query
 
@@ -96,7 +105,7 @@ class OpenSearchOperations:
         }
 
         # Create analyzer
-        analyzer = MutatorAnalyzer(self.
+        analyzer = MutatorAnalyzer(self.field_mappings)
 
         # Analyze the AST
         return analyzer.analyze_ast(ast)
@@ -141,7 +150,7 @@ class OpenSearchOperations:
         query: str,
         index: Optional[str] = None,
         size: int = 10000,
-
+        from_: int = 0,
         sort: Optional[List[Dict[str, Any]]] = None,
         source_includes: Optional[List[str]] = None,
         source_excludes: Optional[List[str]] = None,
@@ -175,8 +184,8 @@ class OpenSearchOperations:
             query: TQL query string
             index: OpenSearch index name (uses environment variable if not provided)
             size: Maximum number of results to return (default: 10000)
-
-            sort: List of sort specifications
+            from_: Starting offset for pagination (max 10000 - size)
+            sort: List of sort specifications
             source_includes: Fields to include in response
             source_excludes: Fields to exclude from response
             track_total_hits: Whether to track total hit count
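The new `from_` parameter pages through results inside OpenSearch's default 10,000-document result window, so the usable offset is `10000 - size`. A minimal sketch of that clamping rule (a standalone illustration, not code from the package):

```python
def clamp_from(from_: int, size: int, max_window: int = 10000) -> int:
    """Keep from_ + size inside the result window (index.max_result_window defaults to 10000)."""
    max_allowed_from = max_window - size
    return max(0, min(from_, max_allowed_from))


# A page of 500 requested at offset 9800 is pulled back so 9500 + 500 == 10000
assert clamp_from(9800, 500) == 9500
assert clamp_from(200, 500) == 200  # small offsets pass through unchanged
```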
@@ -226,6 +235,13 @@ class OpenSearchOperations:
         is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
 
         if is_stats_query:
+            # Analyze the query to check for mutators
+            analysis_result = self.analyze_opensearch_query(query)
+            has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+            needs_post_processing_for_stats = (
+                has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+            )
+
             # Handle stats queries differently
             from ..opensearch_stats import OpenSearchStatsTranslator
 
@@ -240,7 +256,11 @@ class OpenSearchOperations:
                 # Convert filter to OpenSearch query
                 backend = OpenSearchBackend(field_mappings=self.field_mappings)
                 if filter_ast:
-
+                    # Use the optimized AST if we have mutators
+                    if has_mutators and needs_post_processing_for_stats:
+                        filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+                    else:
+                        filter_query = backend.convert(filter_ast)["query"]
                 else:
                     filter_query = {"match_all": {}}
             else:
@@ -248,19 +268,27 @@ class OpenSearchOperations:
                 stats_ast = ast
                 filter_query = {"match_all": {}}
 
-            #
-            if
-
+            # For stats queries with post-processing mutators, we need to handle them differently
+            if needs_post_processing_for_stats:
+                # We'll need to fetch all documents and aggregate in memory
+                opensearch_query = {"query": filter_query}
+                needs_phase2 = True
+                # Store the stats AST for later processing
+                stats_ast_for_post_processing = stats_ast
             else:
-
+                # Build aggregations for direct OpenSearch execution
+                if stats_ast:
+                    stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+                else:
+                    stats_result = {"aggs": {}}
 
-
-
+                # Extract the aggregations (translate_stats returns {"aggs": {...}})
+                aggregations = stats_result.get("aggs", {})
 
-
-
-
-
+                # Build the complete query
+                opensearch_query = {"query": filter_query, "aggs": aggregations}
+                needs_phase2 = False
+                stats_ast_for_post_processing = None
         else:
             # Parse and analyze the query normally
             analysis_result = self.analyze_opensearch_query(query)
@@ -316,19 +344,49 @@ class OpenSearchOperations:
             base_query = search_body.get("query", {})
             time_filter = {"range": {timestamp_field: time_range}}
 
-            # Wrap the existing query with time filter
+            # Wrap the existing query with time filter in filter context
             if base_query:
-
+                # If the base query is already a bool query, add to its filter array
+                if isinstance(base_query, dict) and base_query.get("bool"):
+                    bool_query = base_query["bool"]
+                    if "filter" in bool_query:
+                        # Add to existing filter array
+                        if isinstance(bool_query["filter"], list):
+                            bool_query["filter"].append(time_filter)
+                        else:
+                            # Convert single filter to array
+                            bool_query["filter"] = [bool_query["filter"], time_filter]
+                    else:
+                        # No filter array yet, create one
+                        bool_query["filter"] = [time_filter]
+                    search_body["query"] = base_query
+                else:
+                    # Wrap in bool query with filter
+                    search_body["query"] = {"bool": {"filter": [base_query, time_filter]}}
             else:
                 search_body["query"] = time_filter
 
-
+        # For stats queries, set size based on whether we need documents for post-processing
+        if is_stats_query:
+            if needs_phase2:
+                # Need all documents for post-processing
+                search_body.update({"size": 10000, "track_total_hits": track_total_hits})
+            else:
+                # Pure aggregation query - no documents needed
+                search_body.update({"size": 0, "track_total_hits": track_total_hits})
+        else:
+            search_body.update({"size": size, "track_total_hits": track_total_hits})
 
         # Add optional parameters
         if sort:
             search_body["sort"] = sort
-
-
+
+        # Add from parameter for pagination (limit to 10000 total)
+        if from_ > 0:
+            # Ensure we don't exceed the 10000 limit
+            max_allowed_from = 10000 - size
+            from_ = min(from_, max_allowed_from)
+            search_body["from"] = from_
         if source_includes or source_excludes:
             search_body["_source"] = {}
             if source_includes:
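The reworked time-range handling adds the `range` clause in filter (non-scoring) context, appending to an existing `bool.filter` instead of wrapping the whole query. The same merge logic as a standalone sketch (field names in the usage example are illustrative):

```python
from typing import Any, Dict


def add_time_filter(base_query: Dict[str, Any], time_filter: Dict[str, Any]) -> Dict[str, Any]:
    """Merge a range filter into an existing query using filter (non-scoring) context."""
    if not base_query:
        return time_filter
    if base_query.get("bool"):
        bool_query = base_query["bool"]
        existing = bool_query.get("filter")
        if existing is None:
            bool_query["filter"] = [time_filter]
        elif isinstance(existing, list):
            existing.append(time_filter)
        else:
            bool_query["filter"] = [existing, time_filter]
        return base_query
    # Non-bool query: wrap both clauses in a new bool filter
    return {"bool": {"filter": [base_query, time_filter]}}


query = {"bool": {"must": [{"term": {"event.category": "network"}}]}}
time_filter = {"range": {"@timestamp": {"gte": "now-1h"}}}
print(add_time_filter(query, time_filter))
```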
@@ -341,6 +399,9 @@ class OpenSearchOperations:
         # Add any additional parameters from kwargs
         search_body.update(kwargs)
 
+        # Store the complete search body for debugging
+        complete_opensearch_query = search_body.copy()
+
         # Build search parameters
         search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
 
@@ -408,43 +469,125 @@ class OpenSearchOperations:
 
         # Handle stats query results differently
         if is_stats_query:
-
-
+            if needs_phase2 and "stats_ast_for_post_processing" in locals():
+                # Stats query with post-processing - need to aggregate in memory
+                # First, get all documents and apply mutators
+                all_documents = []
+
+                # Handle scroll for large datasets
+                if scan_all or needs_phase2:
+                    # Use scroll to get all documents
+                    scroll_params = search_params.copy()
+                    scroll_params["scroll"] = scroll_timeout
+                    scroll_params["body"]["size"] = min(10000, scroll_size)
 
-
-
-
-
-
-
+                    try:
+                        # Initial search
+                        scroll_response = client.search(**scroll_params)
+                        scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        while scroll_hits:
+                            for hit in scroll_hits:
+                                all_documents.append(hit["_source"])
+
+                            scroll_id = scroll_response.get("_scroll_id")
+                            if not scroll_id:
+                                break
+
+                            scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+                            scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        # Clean up scroll
+                        if scroll_id:
+                            try:
+                                client.clear_scroll(scroll_id=scroll_id)
+                            except Exception:
+                                pass
+                    except Exception as e:
+                        raise TQLExecutionError(f"Failed to fetch documents for stats post-processing: {str(e)}")
+                else:
+                    # Fetch documents with regular pagination
+                    for hit in response.get("hits", {}).get("hits", []):
+                        all_documents.append(hit["_source"])
+
+                # Apply post-processing mutators
+                if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+                    processor = QueryPostProcessor()
+                    processed_docs = processor.process_results(
+                        all_documents, analysis_result.post_processing_requirements, track_enrichments=False
+                    )
+                    # Filter if needed
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    filtered_docs = all_documents
+
+                # Now perform in-memory aggregation
+                from ..stats_evaluator import TQLStatsEvaluator
+
+                stats_evaluator = TQLStatsEvaluator()
+
+                # Execute the stats aggregation in memory
+                stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
+
+                # Format response for stats-only (no documents)
+                result = {
+                    "stats": stats_results,
+                    "total": len(filtered_docs),
+                    "post_processing_applied": True,
+                    "health_status": "red",
+                    "health_reasons": [
+                        {
+                            "status": "red",
+                            "query_part": "stats with post-processing",
+                            "reason": f"Stats query required fetching {len(all_documents)} documents for post-processing",
+                        }
+                    ],
+                    "performance_impact": {
+                        "overhead_ms": 0,  # Would need timing to calculate
+                        "documents_processed": len(all_documents),
+                        "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+                    },
+                    "opensearch_query": complete_opensearch_query,
+                }
 
-
-            if stats_ast:
-                aggregations = stats_ast.get("aggregations", [])
-                group_by_fields = stats_ast.get("group_by", [])
+                return result
             else:
-
-
+                # Regular stats query using OpenSearch aggregations
+                aggs_response = response.get("aggregations", {})
 
-
-
-
+                # Format the stats results based on the test expectations
+                # Use the correct stats AST
+                if ast.get("type") == "query_with_stats":
+                    stats_ast = ast.get("stats")
+                else:
+                    stats_ast = ast
+
+                # Extract aggregation info
                 if stats_ast:
-
+                    aggregations = stats_ast.get("aggregations", [])
+                    group_by_fields = stats_ast.get("group_by", [])
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    aggregations = []
+                    group_by_fields = []
+
+                # Format results differently based on whether we have grouping
+                if group_by_fields:
+                    # Use the OpenSearchStatsTranslator to properly transform the response
+                    from ..opensearch_stats import OpenSearchStatsTranslator
+
+                    translator = OpenSearchStatsTranslator()
+
+                    # Transform the response using the translator
+                    transformed_response = translator.transform_response(response, stats_ast)
+
+                    # The transformed response already has the correct structure
+                    stats_results = transformed_response
+
+                    # Add viz_hint if present in stats AST
+                    if stats_ast and stats_ast.get("viz_hint"):
+                        stats_results["viz_hint"] = stats_ast["viz_hint"]
                 else:
                     # Simple aggregations without grouping
                     if aggregations:
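For stats queries whose mutators require post-processing, matching documents are now pulled back with the scroll API and aggregated in memory. A condensed sketch of that fetch loop with `opensearch-py` (connection details and index name are assumptions):

```python
from opensearchpy import OpenSearch

client = OpenSearch(hosts=["https://localhost:9200"])  # assumed connection details


def fetch_all_documents(index: str, query: dict, scroll_timeout: str = "2m", page_size: int = 1000) -> list:
    """Collect every matching _source via scroll, then release the scroll context."""
    documents = []
    response = client.search(index=index, body={"query": query, "size": page_size}, scroll=scroll_timeout)
    scroll_id = response.get("_scroll_id")
    hits = response["hits"]["hits"]
    while hits:
        documents.extend(hit["_source"] for hit in hits)
        response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = response.get("_scroll_id")
        hits = response["hits"]["hits"]
    if scroll_id:
        client.clear_scroll(scroll_id=scroll_id)
    return documents
```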
@@ -488,6 +631,10 @@ class OpenSearchOperations:
                             "values": value,
                             "group_by": [],
                         }
+
+                        # Add viz_hint if present in stats AST
+                        if stats_ast and stats_ast.get("viz_hint"):
+                            stats_results["viz_hint"] = stats_ast["viz_hint"]
                     else:
                         # Multiple aggregations
                         agg_results = {}
@@ -526,50 +673,143 @@ class OpenSearchOperations:
                             "type": "stats",
                             "results": agg_results,
                         }
+
+                        # Add viz_hint if present in stats AST
+                        if stats_ast and stats_ast.get("viz_hint"):
+                            stats_results["viz_hint"] = stats_ast["viz_hint"]
                 else:
                     stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
 
-            #
-
-
-
-
-                doc = hit["_source"].copy()
-                # Preserve metadata
-                if "_id" in hit:
-                    doc["_id"] = hit["_id"]
-                if "_score" in hit:
-                    doc["_score"] = hit["_score"]
-                documents.append(doc)
-
-            # Return in the expected format
+                # For stats queries, return only stats (no documents)
+                # Total from aggregation metadata or hit count
+                total_count = response.get("hits", {}).get("total", {}).get("value", 0)
+
+                # Return stats-only format
                 result = {
-                    "results": documents,
-                    "total": response.get("hits", {}).get("total", {}).get("value", 0),
                     "stats": stats_results,
+                    "total": total_count,
                     "post_processing_applied": False,
-                    "health_status": "
+                    "health_status": "green",
                     "health_reasons": [],
                     "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
-                    "
+                    "opensearch_query": complete_opensearch_query,
+                    "query_type": "stats",
                 }
 
-            # Add query_type if documents were requested
-            if size > 0:
-                result["query_type"] = "stats_with_docs"
-
                 return result
 
         # Extract hits for regular queries
-
+        initial_hits = response.get("hits", {}).get("hits", [])
         total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
 
         # Process results based on whether we need Phase 2
-        if needs_phase2:
-            #
+        if needs_phase2 and not scan_all:
+            # Pagination with post-processing - continue fetching pages until we get results
             processor = QueryPostProcessor()
+            results: List[Dict[str, Any]] = []
+            total_documents_before_filter = 0
+            total_documents_after_filter = 0
+            current_from = from_
+            pages_checked = 0
+            max_pages_to_check = min(10, (total_hits // size) + 1) if size > 0 else 1  # Limit to prevent infinite loops
+
+            while len(results) < size and pages_checked < max_pages_to_check and current_from < total_hits:
+                # Fetch current page
+                if pages_checked > 0:
+                    # Need to fetch next page
+                    search_params["body"]["from"] = current_from
+                    try:
+                        response = client.search(**search_params)
+                    except Exception as e:
+                        raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+                    current_hits = response.get("hits", {}).get("hits", [])
+                else:
+                    # Use initial hits for first page
+                    current_hits = initial_hits
+
+                if not current_hits:
+                    break  # No more results
+
+                # Process the hits from this page
+                documents = []
+                hit_metadata = []
+                for hit in current_hits:
+                    documents.append(hit["_source"])
+                    hit_metadata.append(
+                        {
+                            "_id": hit.get("_id"),
+                            "_score": hit.get("_score"),
+                            "_explanation": hit.get("_explanation") if explain else None,
+                        }
+                    )
+
+                total_documents_before_filter += len(documents)
+
+                # Apply post-processing
+                if isinstance(analysis_result, MutatorAnalysisResult):
+                    processed_docs = processor.process_results(
+                        documents,
+                        analysis_result.post_processing_requirements,
+                        track_enrichments=kwargs.get("save_enrichment", False),
+                    )
 
-
+                    # Filter results
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    processed_docs = documents
+                    filtered_docs = documents
+
+                # Add filtered results with metadata
+                for doc in filtered_docs:
+                    if len(results) >= size:
+                        break  # We have enough results
+
+                    # Find the original hit metadata
+                    for i, orig_doc in enumerate(documents):
+                        if orig_doc == doc or self._docs_match(orig_doc, doc):
+                            # Add metadata
+                            if hit_metadata[i]["_id"]:
+                                doc["_id"] = hit_metadata[i]["_id"]
+                            if hit_metadata[i]["_score"]:
+                                doc["_score"] = hit_metadata[i]["_score"]
+                            if hit_metadata[i]["_explanation"]:
+                                doc["_explanation"] = hit_metadata[i]["_explanation"]
+                            break
+                    results.append(doc)
+
+                total_documents_after_filter += len(filtered_docs)
+
+                # Move to next page
+                current_from += size
+                pages_checked += 1
+
+            # Store filtering stats
+            pagination_stats = {
+                "page_size": size,
+                "pages_checked": pages_checked,
+                "documents_retrieved": total_documents_before_filter,
+                "documents_returned": len(results),
+                "documents_filtered": total_documents_before_filter - total_documents_after_filter,
+                "filter_rate": (
+                    (
+                        (total_documents_before_filter - total_documents_after_filter)
+                        / total_documents_before_filter
+                        * 100
+                    )
+                    if total_documents_before_filter > 0
+                    else 0
+                ),
+                "actual_from": from_,  # Original from
+                "actual_to": current_from,  # Where we ended up searching to
+            }
+
+        elif needs_phase2 and scan_all:
+            # scan_all mode with post-processing - process all results
+            processor = QueryPostProcessor()
+
+            # Extract all documents from hits
             documents = []
             hit_metadata = []
             for hit in hits:
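When post-processing filters documents out of a page, the new pagination path keeps fetching further pages (up to a capped count) until the requested page is full, and records how much was filtered. A simplified sketch of that loop and its filter-rate bookkeeping (the helper names here are hypothetical):

```python
from typing import Callable, Dict, List


def fill_page(
    fetch_page: Callable[[int, int], List[dict]],
    post_filter: Callable[[List[dict]], List[dict]],
    from_: int,
    size: int,
    total_hits: int,
    max_pages: int = 10,
) -> Dict[str, object]:
    """Fetch pages until `size` documents survive post-filtering or the page cap is reached."""
    results: List[dict] = []
    retrieved = kept = pages = 0
    current_from = from_
    while len(results) < size and pages < max_pages and current_from < total_hits:
        page = fetch_page(current_from, size)
        if not page:
            break
        retrieved += len(page)
        survivors = post_filter(page)
        kept += len(survivors)
        results.extend(survivors[: size - len(results)])
        current_from += size
        pages += 1
    filter_rate = ((retrieved - kept) / retrieved * 100) if retrieved else 0.0
    return {
        "results": results,
        "pages_checked": pages,
        "documents_retrieved": retrieved,
        "documents_filtered": retrieved - kept,
        "filter_rate": filter_rate,
        "actual_to": current_from,
    }


# Toy usage: every other document survives the post-filter
pages = [[{"n": i} for i in range(start, start + 5)] for start in range(0, 50, 5)]
out = fill_page(
    fetch_page=lambda frm, sz: pages[frm // 5] if frm // 5 < len(pages) else [],
    post_filter=lambda docs: [d for d in docs if d["n"] % 2 == 0],
    from_=0,
    size=5,
    total_hits=50,
)
print(out["pages_checked"], f"{out['filter_rate']:.1f}%")  # 2 pages fetched, 50.0% filtered
```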
@@ -590,10 +830,9 @@ class OpenSearchOperations:
                     track_enrichments=kwargs.get("save_enrichment", False),
                 )
 
-            # Then filter results based on requirements
+                # Then filter results based on requirements
                 filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
             else:
-                # No post-processing needed
                 processed_docs = documents
                 filtered_docs = documents
 
@@ -601,7 +840,6 @@ class OpenSearchOperations:
             results = []
             for doc in filtered_docs:
                 # Find the original hit metadata for this document
-                # This is a simple approach - in production you might want to track IDs
                 for i, orig_doc in enumerate(documents):
                     if orig_doc == doc or self._docs_match(orig_doc, doc):
                         # Add metadata
@@ -613,9 +851,17 @@ class OpenSearchOperations:
                             doc["_explanation"] = hit_metadata[i]["_explanation"]
                         break
                 results.append(doc)
+
+            pagination_stats = {
+                "documents_scanned": len(documents),
+                "documents_passed": len(results),
+                "filter_rate": (len(results) / len(documents) * 100) if documents else 0,
+            }
+
         else:
             # No Phase 2 needed, just extract documents
             results = []
+            hits = initial_hits  # Use the initial hits
             for hit in hits:
                 doc = hit["_source"].copy()
                 # Preserve metadata
@@ -627,6 +873,8 @@ class OpenSearchOperations:
                     doc["_explanation"] = hit["explanation"]
                 results.append(doc)
 
+            pagination_stats = None
+
         # Return raw response if requested
         if kwargs.get("raw_response", False):
             return {
@@ -688,8 +936,8 @@ class OpenSearchOperations:
             "performance_impact": performance_impact,
             "optimizations_applied": [],  # TODO: Track actual optimizations  # noqa: W0511
             "opensearch_query": (
-
-            ), # Include the query
+                complete_opensearch_query if "complete_opensearch_query" in locals() else {}
+            ),  # Include the full query body
             "time_range": time_range,
             "timestamp_field": timestamp_field,
             "query_type": "regular",  # Regular query (not stats)
@@ -703,19 +951,57 @@ class OpenSearchOperations:
             },
         }
 
+        # Add pagination stats if available
+        if pagination_stats:
+            result["post_processing_stats"] = pagination_stats
+
         # Add pagination info for non-scan queries
         if not scan_all:
-
+            # Cap displayed total at 10000 for consistency
+            displayed_total = min(opensearch_total, 10000)
+
+            pagination_info = {
                 "size": size,
-                "
-                "
+                "from": from_,
+                "total": displayed_total,
+                "actual_total": opensearch_total,  # Real total for reference
+                "returned": len(results),
             }
 
-
-
-
-
-
+            if needs_phase2 and pagination_stats:
+                # Post-processing was applied - update pagination to reflect auto-pagination
+                actual_last_position = pagination_stats.get("actual_to", from_ + size)
+
+                # Update from to reflect where we actually searched to
+                if pagination_stats["pages_checked"] > 1:
+                    # We auto-paginated, so update the effective "from" position
+                    pagination_info["from"] = from_
+                    pagination_info["actual_from_searched"] = from_
+                    pagination_info["actual_to_searched"] = actual_last_position
+                    pagination_info["auto_paginated"] = True
+                    pagination_info["pages_auto_fetched"] = pagination_stats["pages_checked"]
+
+                # Has more if we haven't reached the 10000 limit
+                pagination_info["has_more"] = actual_last_position < 10000 and actual_last_position < opensearch_total
+                pagination_info["documents_retrieved"] = pagination_stats["documents_retrieved"]
+                pagination_info["documents_filtered"] = pagination_stats["documents_filtered"]
+                pagination_info["filter_rate"] = f"{pagination_stats['filter_rate']:.1f}%"
+
+                # Calculate the last valid page number (page that contains the 10,000th record)
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+            else:
+                # Regular pagination without post-processing
+                # Has more if we got full page and haven't reached 10000 limit
+                pagination_info["has_more"] = len(initial_hits) == size and (from_ + size < 10000)
+
+                # Calculate the last valid page number
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+
+            result["pagination"] = pagination_info
 
         return result
 
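The pagination metadata caps the advertised total at the 10,000-hit window and derives page numbers from it. A small sketch of that arithmetic (assumes `size > 0`; not code from the package):

```python
def pagination_summary(from_: int, size: int, opensearch_total: int, returned: int) -> dict:
    """Mirror the pagination bookkeeping: reported totals are capped at the 10,000-result window."""
    displayed_total = min(opensearch_total, 10000)
    last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
    return {
        "size": size,
        "from": from_,
        "total": displayed_total,
        "actual_total": opensearch_total,
        "returned": returned,
        "current_page": from_ // size,
        "last_page": last_page,
        "has_more": returned == size and (from_ + size < 10000),
    }


print(pagination_summary(from_=9500, size=500, opensearch_total=250000, returned=500))
# total is reported as 10000 and has_more is False: the result window is exhausted
```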
@@ -740,7 +1026,7 @@ class OpenSearchOperations:
     def _extract_grouped_buckets(  # noqa: C901
         self,
         aggs_response: Dict[str, Any],
-        group_by_fields: List[
+        group_by_fields: List[Any],
         aggregations: List[Dict[str, Any]],
         stats_ast: Dict[str, Any],
     ) -> List[Dict[str, Any]]:
@@ -748,7 +1034,7 @@ class OpenSearchOperations:
 
         Args:
             aggs_response: OpenSearch aggregations response
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (can be strings or dicts)
             aggregations: List of aggregation specifications
             stats_ast: The stats AST for reference
 
@@ -757,9 +1043,19 @@ class OpenSearchOperations:
         """
         buckets = []
 
+        # Normalize group_by_fields to extract field names
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                normalized_fields.append(field["field"])
+            else:
+                normalized_fields.append(str(field))
+
         # For single-level grouping
-        if len(
-            field =
+        if len(normalized_fields) == 1:
+            field = normalized_fields[0]
             # Look for the terms aggregation with the group field name
             terms_agg_name = f"group_by_{field}"
 
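Group-by entries may now be plain field names or dicts carrying a `field` key, so they are normalized to bare strings before the `group_by_<field>` aggregations are looked up. The same normalization as a standalone helper (the sample field names are illustrative):

```python
from typing import Any, List


def normalize_group_by_fields(group_by_fields: List[Any]) -> List[str]:
    """Reduce mixed string/dict group-by specs to bare field names."""
    normalized: List[str] = []
    for field in group_by_fields:
        if isinstance(field, str):
            normalized.append(field)
        elif isinstance(field, dict) and "field" in field:
            normalized.append(field["field"])
        else:
            normalized.append(str(field))
    return normalized


print(normalize_group_by_fields(["source.ip", {"field": "destination.port", "size": 10}]))
# -> ['source.ip', 'destination.port']
```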
@@ -811,6 +1107,16 @@ class OpenSearchOperations:
                         bucket_result[output_key] = agg_value["value"]
                     else:
                         bucket_result[output_key] = agg_value
+                else:
+                    # For count(*), also check doc_count
+                    if func == "count" and field_name == "*":
+                        bucket_result[output_key] = bucket.get("doc_count", 0)
+                    else:
+                        # Try to find any aggregation value in the bucket
+                        for key, value in bucket.items():
+                            if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                bucket_result[output_key] = value["value"]
+                                break
 
                 buckets.append(bucket_result)
 
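The new fallback lets `count(*)` read the bucket's own `doc_count` when no metric sub-aggregation is present, and other functions fall back to any `<func>_...` sub-aggregation found in the bucket. A sketch of that lookup (the `count_star` aggregation name in the usage line is hypothetical):

```python
from typing import Any, Dict, Optional


def extract_metric(bucket: Dict[str, Any], func: str, field_name: str, agg_name: str) -> Optional[Any]:
    """Pull a metric out of an OpenSearch terms bucket, with count(*) falling back to doc_count."""
    agg_value = bucket.get(agg_name)
    if isinstance(agg_value, dict) and "value" in agg_value:
        return agg_value["value"]
    if agg_value is not None:
        return agg_value
    if func == "count" and field_name == "*":
        return bucket.get("doc_count", 0)
    # Last resort: any sub-aggregation whose name starts with the function name
    for key, value in bucket.items():
        if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
            return value["value"]
    return None


bucket = {"key": "10.0.0.5", "doc_count": 42}
print(extract_metric(bucket, "count", "*", "count_star"))  # -> 42
```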
@@ -820,7 +1126,7 @@ class OpenSearchOperations:
             current_agg = aggs_response
 
             # Find the first group_by aggregation
-            for field in
+            for field in normalized_fields:
                 group_key = f"group_by_{field}"
                 if group_key in current_agg:
                     current_agg = current_agg[group_key]
@@ -831,7 +1137,7 @@ class OpenSearchOperations:
 
             # Process nested buckets recursively
             if "buckets" in current_agg:
-                buckets = self._process_nested_buckets(current_agg["buckets"],
+                buckets = self._process_nested_buckets(current_agg["buckets"], normalized_fields, aggregations, 0)
 
         return buckets
 
@@ -846,7 +1152,7 @@ class OpenSearchOperations:
 
         Args:
             buckets_data: List of bucket data from OpenSearch
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (already normalized to strings)
            aggregations: List of aggregation specifications
             level: Current nesting level (0-based)
 
@@ -910,6 +1216,16 @@ class OpenSearchOperations:
                         result[output_key] = agg_value["value"]
                     else:
                         result[output_key] = agg_value
+                else:
+                    # For count(*), also check doc_count
+                    if func == "count" and field_name == "*":
+                        result[output_key] = bucket.get("doc_count", 0)
+                    else:
+                        # Try to find any aggregation value in the bucket
+                        for key, value in bucket.items():
+                            if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                result[output_key] = value["value"]
+                                break
 
             results.append(result)
 