tellaro-query-language 0.1.9__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,11 +45,20 @@ class OpenSearchOperations:
  # Parse the query
  ast = self.parser.parse(query)
 
+ # Analyze the query for mutators
+ from ..mutator_analyzer import MutatorAnalyzer
+
+ analyzer = MutatorAnalyzer(self.field_mappings)
+ analysis_result = analyzer.analyze_ast(ast, context="opensearch")
+
+ # Use the optimized AST (with array operators removed)
+ optimized_ast = analysis_result.optimized_ast
+
  # Create OpenSearch backend
  backend = OpenSearchBackend(field_mappings=self.field_mappings)
 
- # Convert to OpenSearch query
- opensearch_query = backend.convert(ast)
+ # Convert to OpenSearch query using the optimized AST
+ opensearch_query = backend.convert(optimized_ast)
 
  return opensearch_query
 
@@ -96,7 +105,7 @@ class OpenSearchOperations:
  }
 
  # Create analyzer
- analyzer = MutatorAnalyzer(self.enhanced_mappings)
+ analyzer = MutatorAnalyzer(self.field_mappings)
 
  # Analyze the AST
  return analyzer.analyze_ast(ast)
@@ -175,7 +184,7 @@ class OpenSearchOperations:
  query: TQL query string
  index: OpenSearch index name (uses environment variable if not provided)
  size: Maximum number of results to return (default: 10000)
- from_: Offset for pagination (default: 0)
+ from_: Starting offset for pagination (max 10000 - size)
  sort: List of sort specifications
  source_includes: Fields to include in response
  source_excludes: Fields to exclude from response
@@ -226,6 +235,13 @@ class OpenSearchOperations:
  is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
 
  if is_stats_query:
+ # Analyze the query to check for mutators
+ analysis_result = self.analyze_opensearch_query(query)
+ has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+ needs_post_processing_for_stats = (
+ has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+ )
+
  # Handle stats queries differently
  from ..opensearch_stats import OpenSearchStatsTranslator
 
@@ -240,7 +256,11 @@ class OpenSearchOperations:
  # Convert filter to OpenSearch query
  backend = OpenSearchBackend(field_mappings=self.field_mappings)
  if filter_ast:
- filter_query = backend.convert(filter_ast)["query"]
+ # Use the optimized AST if we have mutators
+ if has_mutators and needs_post_processing_for_stats:
+ filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+ else:
+ filter_query = backend.convert(filter_ast)["query"]
  else:
  filter_query = {"match_all": {}}
  else:
@@ -248,19 +268,27 @@ class OpenSearchOperations:
  stats_ast = ast
  filter_query = {"match_all": {}}
 
- # Build aggregations
- if stats_ast:
- stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+ # For stats queries with post-processing mutators, we need to handle them differently
+ if needs_post_processing_for_stats:
+ # We'll need to fetch all documents and aggregate in memory
+ opensearch_query = {"query": filter_query}
+ needs_phase2 = True
+ # Store the stats AST for later processing
+ stats_ast_for_post_processing = stats_ast
  else:
- stats_result = {"aggs": {}}
+ # Build aggregations for direct OpenSearch execution
+ if stats_ast:
+ stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+ else:
+ stats_result = {"aggs": {}}
 
- # Extract the aggregations (translate_stats returns {"aggs": {...}})
- aggregations = stats_result.get("aggs", {})
+ # Extract the aggregations (translate_stats returns {"aggs": {...}})
+ aggregations = stats_result.get("aggs", {})
 
- # Build the complete query
- opensearch_query = {"query": filter_query, "aggs": aggregations}
- needs_phase2 = False
- has_mutators = False
+ # Build the complete query
+ opensearch_query = {"query": filter_query, "aggs": aggregations}
+ needs_phase2 = False
+ stats_ast_for_post_processing = None
  else:
  # Parse and analyze the query normally
  analysis_result = self.analyze_opensearch_query(query)
@@ -316,17 +344,49 @@ class OpenSearchOperations:
  base_query = search_body.get("query", {})
  time_filter = {"range": {timestamp_field: time_range}}
 
- # Wrap the existing query with time filter
+ # Wrap the existing query with time filter in filter context
  if base_query:
- search_body["query"] = {"bool": {"must": [base_query, time_filter]}}
+ # If the base query is already a bool query, add to its filter array
+ if isinstance(base_query, dict) and base_query.get("bool"):
+ bool_query = base_query["bool"]
+ if "filter" in bool_query:
+ # Add to existing filter array
+ if isinstance(bool_query["filter"], list):
+ bool_query["filter"].append(time_filter)
+ else:
+ # Convert single filter to array
+ bool_query["filter"] = [bool_query["filter"], time_filter]
+ else:
+ # No filter array yet, create one
+ bool_query["filter"] = [time_filter]
+ search_body["query"] = base_query
+ else:
+ # Wrap in bool query with filter
+ search_body["query"] = {"bool": {"filter": [base_query, time_filter]}}
  else:
  search_body["query"] = time_filter
 
- search_body.update({"size": size, "from": from_, "track_total_hits": track_total_hits})
+ # For stats queries, set size based on whether we need documents for post-processing
+ if is_stats_query:
+ if needs_phase2:
+ # Need all documents for post-processing
+ search_body.update({"size": 10000, "track_total_hits": track_total_hits})
+ else:
+ # Pure aggregation query - no documents needed
+ search_body.update({"size": 0, "track_total_hits": track_total_hits})
+ else:
+ search_body.update({"size": size, "track_total_hits": track_total_hits})
 
  # Add optional parameters
  if sort:
  search_body["sort"] = sort
+
+ # Add from parameter for pagination (limit to 10000 total)
+ if from_ > 0:
+ # Ensure we don't exceed the 10000 limit
+ max_allowed_from = 10000 - size
+ from_ = min(from_, max_allowed_from)
+ search_body["from"] = from_
  if source_includes or source_excludes:
  search_body["_source"] = {}
  if source_includes:
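
The rewritten time-filter logic above places the range clause in bool filter context rather than a must clause, so it no longer contributes to scoring and can be cached by OpenSearch. A minimal sketch of the merge behaviour in plain Python (the field name and timestamps are illustrative, not taken from the package):

    base_query = {"bool": {"must": [{"match": {"event.action": "login"}}]}}
    time_filter = {"range": {"@timestamp": {"gte": "now-24h", "lte": "now"}}}

    bool_query = base_query["bool"]
    filters = bool_query.get("filter", [])
    if not isinstance(filters, list):
        filters = [filters]  # promote a single filter clause to a list
    filters.append(time_filter)
    bool_query["filter"] = filters
    # base_query is now:
    # {"bool": {"must": [...], "filter": [{"range": {"@timestamp": {...}}}]}}
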
@@ -339,6 +399,9 @@ class OpenSearchOperations:
  # Add any additional parameters from kwargs
  search_body.update(kwargs)
 
+ # Store the complete search body for debugging
+ complete_opensearch_query = search_body.copy()
+
  # Build search parameters
  search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
 
@@ -406,43 +469,125 @@ class OpenSearchOperations:
 
  # Handle stats query results differently
  if is_stats_query:
- # Process stats aggregation results
- aggs_response = response.get("aggregations", {})
+ if needs_phase2 and "stats_ast_for_post_processing" in locals():
+ # Stats query with post-processing - need to aggregate in memory
+ # First, get all documents and apply mutators
+ all_documents = []
+
+ # Handle scroll for large datasets
+ if scan_all or needs_phase2:
+ # Use scroll to get all documents
+ scroll_params = search_params.copy()
+ scroll_params["scroll"] = scroll_timeout
+ scroll_params["body"]["size"] = min(10000, scroll_size)
 
- # Format the stats results based on the test expectations
- # Use the correct stats AST
- if ast.get("type") == "query_with_stats":
- stats_ast = ast.get("stats")
- else:
- stats_ast = ast
+ try:
+ # Initial search
+ scroll_response = client.search(**scroll_params)
+ scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+ while scroll_hits:
+ for hit in scroll_hits:
+ all_documents.append(hit["_source"])
+
+ scroll_id = scroll_response.get("_scroll_id")
+ if not scroll_id:
+ break
+
+ scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+ scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+ # Clean up scroll
+ if scroll_id:
+ try:
+ client.clear_scroll(scroll_id=scroll_id)
+ except Exception:
+ pass
+ except Exception as e:
+ raise TQLExecutionError(f"Failed to fetch documents for stats post-processing: {str(e)}")
+ else:
+ # Fetch documents with regular pagination
+ for hit in response.get("hits", {}).get("hits", []):
+ all_documents.append(hit["_source"])
+
+ # Apply post-processing mutators
+ if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+ processor = QueryPostProcessor()
+ processed_docs = processor.process_results(
+ all_documents, analysis_result.post_processing_requirements, track_enrichments=False
+ )
+ # Filter if needed
+ filtered_docs = processor.filter_results(
+ processed_docs, analysis_result.post_processing_requirements
+ )
+ else:
+ filtered_docs = all_documents
+
+ # Now perform in-memory aggregation
+ from ..stats_evaluator import TQLStatsEvaluator
+
+ stats_evaluator = TQLStatsEvaluator()
+
+ # Execute the stats aggregation in memory
+ stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
+
+ # Format response for stats-only (no documents)
+ result = {
+ "stats": stats_results,
+ "total": len(filtered_docs),
+ "post_processing_applied": True,
+ "health_status": "red",
+ "health_reasons": [
+ {
+ "status": "red",
+ "query_part": "stats with post-processing",
+ "reason": f"Stats query required fetching {len(all_documents)} documents for post-processing",
+ }
+ ],
+ "performance_impact": {
+ "overhead_ms": 0, # Would need timing to calculate
+ "documents_processed": len(all_documents),
+ "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+ },
+ "opensearch_query": complete_opensearch_query,
+ }
 
- # Extract aggregation info
- if stats_ast:
- aggregations = stats_ast.get("aggregations", [])
- group_by_fields = stats_ast.get("group_by", [])
+ return result
  else:
- aggregations = []
- group_by_fields = []
+ # Regular stats query using OpenSearch aggregations
+ aggs_response = response.get("aggregations", {})
 
- # Format results differently based on whether we have grouping
- if group_by_fields:
- # For grouped stats, we need to extract buckets
+ # Format the stats results based on the test expectations
+ # Use the correct stats AST
+ if ast.get("type") == "query_with_stats":
+ stats_ast = ast.get("stats")
+ else:
+ stats_ast = ast
+
+ # Extract aggregation info
  if stats_ast:
- buckets = self._extract_grouped_buckets(aggs_response, group_by_fields, aggregations, stats_ast)
+ aggregations = stats_ast.get("aggregations", [])
+ group_by_fields = stats_ast.get("group_by", [])
  else:
- buckets = []
-
- # For multiple aggregations, include all operations
- operations = [agg.get("function") for agg in aggregations]
- fields = [agg.get("field") for agg in aggregations]
-
- stats_results = {
- "type": "stats_grouped",
- "operation": operations[0] if len(operations) == 1 else operations,
- "field": fields[0] if len(fields) == 1 else fields,
- "results": buckets, # Array of buckets for grouped results
- "group_by": group_by_fields,
- }
+ aggregations = []
+ group_by_fields = []
+
+ # Format results differently based on whether we have grouping
+ if group_by_fields:
+ # Use the OpenSearchStatsTranslator to properly transform the response
+ from ..opensearch_stats import OpenSearchStatsTranslator
+
+ translator = OpenSearchStatsTranslator()
+
+ # Transform the response using the translator
+ transformed_response = translator.transform_response(response, stats_ast)
+
+ # The transformed response already has the correct structure
+ stats_results = transformed_response
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  # Simple aggregations without grouping
  if aggregations:
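
The in-memory stats path above pulls every matching document with the scroll API before aggregating. The underlying opensearch-py pattern, as a self-contained sketch (host, index name, and page size are illustrative):

    from opensearchpy import OpenSearch

    client = OpenSearch(hosts=["https://localhost:9200"])
    docs, scroll_id = [], None
    resp = client.search(index="events", body={"query": {"match_all": {}}, "size": 1000}, scroll="2m")
    while resp["hits"]["hits"]:
        docs.extend(hit["_source"] for hit in resp["hits"]["hits"])
        scroll_id = resp.get("_scroll_id")
        if not scroll_id:
            break
        resp = client.scroll(scroll_id=scroll_id, scroll="2m")  # keep the scroll context alive
    if scroll_id:
        client.clear_scroll(scroll_id=scroll_id)  # release server-side scroll resources
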
@@ -486,6 +631,10 @@ class OpenSearchOperations:
  "values": value,
  "group_by": [],
  }
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  # Multiple aggregations
  agg_results = {}
@@ -524,50 +673,143 @@ class OpenSearchOperations:
  "type": "stats",
  "results": agg_results,
  }
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
 
- # Extract hits if size > 0
- hits = response.get("hits", {}).get("hits", [])
- documents = []
- if size > 0 and hits:
- for hit in hits:
- doc = hit["_source"].copy()
- # Preserve metadata
- if "_id" in hit:
- doc["_id"] = hit["_id"]
- if "_score" in hit:
- doc["_score"] = hit["_score"]
- documents.append(doc)
-
- # Return in the expected format
+ # For stats queries, return only stats (no documents)
+ # Total from aggregation metadata or hit count
+ total_count = response.get("hits", {}).get("total", {}).get("value", 0)
+
+ # Return stats-only format
  result = {
- "results": documents,
- "total": response.get("hits", {}).get("total", {}).get("value", 0),
  "stats": stats_results,
+ "total": total_count,
  "post_processing_applied": False,
- "health_status": "HEALTHY",
+ "health_status": "green",
  "health_reasons": [],
  "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
- "scan_info": {"used_scan": False},
+ "opensearch_query": complete_opensearch_query,
+ "query_type": "stats",
  }
 
- # Add query_type if documents were requested
- if size > 0:
- result["query_type"] = "stats_with_docs"
-
  return result
 
  # Extract hits for regular queries
- hits = response.get("hits", {}).get("hits", [])
+ initial_hits = response.get("hits", {}).get("hits", [])
  total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
 
  # Process results based on whether we need Phase 2
- if needs_phase2:
- # Apply Phase 2 processing
+ if needs_phase2 and not scan_all:
+ # Pagination with post-processing - continue fetching pages until we get results
+ processor = QueryPostProcessor()
+ results: List[Dict[str, Any]] = []
+ total_documents_before_filter = 0
+ total_documents_after_filter = 0
+ current_from = from_
+ pages_checked = 0
+ max_pages_to_check = min(10, (total_hits // size) + 1) if size > 0 else 1 # Limit to prevent infinite loops
+
+ while len(results) < size and pages_checked < max_pages_to_check and current_from < total_hits:
+ # Fetch current page
+ if pages_checked > 0:
+ # Need to fetch next page
+ search_params["body"]["from"] = current_from
+ try:
+ response = client.search(**search_params)
+ except Exception as e:
+ raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+ current_hits = response.get("hits", {}).get("hits", [])
+ else:
+ # Use initial hits for first page
+ current_hits = initial_hits
+
+ if not current_hits:
+ break # No more results
+
+ # Process the hits from this page
+ documents = []
+ hit_metadata = []
+ for hit in current_hits:
+ documents.append(hit["_source"])
+ hit_metadata.append(
+ {
+ "_id": hit.get("_id"),
+ "_score": hit.get("_score"),
+ "_explanation": hit.get("_explanation") if explain else None,
+ }
+ )
+
+ total_documents_before_filter += len(documents)
+
+ # Apply post-processing
+ if isinstance(analysis_result, MutatorAnalysisResult):
+ processed_docs = processor.process_results(
+ documents,
+ analysis_result.post_processing_requirements,
+ track_enrichments=kwargs.get("save_enrichment", False),
+ )
+
+ # Filter results
+ filtered_docs = processor.filter_results(
+ processed_docs, analysis_result.post_processing_requirements
+ )
+ else:
+ processed_docs = documents
+ filtered_docs = documents
+
+ # Add filtered results with metadata
+ for doc in filtered_docs:
+ if len(results) >= size:
+ break # We have enough results
+
+ # Find the original hit metadata
+ for i, orig_doc in enumerate(documents):
+ if orig_doc == doc or self._docs_match(orig_doc, doc):
+ # Add metadata
+ if hit_metadata[i]["_id"]:
+ doc["_id"] = hit_metadata[i]["_id"]
+ if hit_metadata[i]["_score"]:
+ doc["_score"] = hit_metadata[i]["_score"]
+ if hit_metadata[i]["_explanation"]:
+ doc["_explanation"] = hit_metadata[i]["_explanation"]
+ break
+ results.append(doc)
+
+ total_documents_after_filter += len(filtered_docs)
+
+ # Move to next page
+ current_from += size
+ pages_checked += 1
+
+ # Store filtering stats
+ pagination_stats = {
+ "page_size": size,
+ "pages_checked": pages_checked,
+ "documents_retrieved": total_documents_before_filter,
+ "documents_returned": len(results),
+ "documents_filtered": total_documents_before_filter - total_documents_after_filter,
+ "filter_rate": (
+ (
+ (total_documents_before_filter - total_documents_after_filter)
+ / total_documents_before_filter
+ * 100
+ )
+ if total_documents_before_filter > 0
+ else 0
+ ),
+ "actual_from": from_, # Original from
+ "actual_to": current_from, # Where we ended up searching to
+ }
+
+ elif needs_phase2 and scan_all:
+ # scan_all mode with post-processing - process all results
  processor = QueryPostProcessor()
 
- # Extract documents from hits
+ # Extract all documents from hits
  documents = []
  hit_metadata = []
  for hit in hits:
@@ -588,10 +830,9 @@ class OpenSearchOperations:
  track_enrichments=kwargs.get("save_enrichment", False),
  )
 
- # Then filter results based on requirements (e.g., ALL operator, contains with mutators)
+ # Then filter results based on requirements
  filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
  else:
- # No post-processing needed
  processed_docs = documents
  filtered_docs = documents
 
@@ -599,7 +840,6 @@ class OpenSearchOperations:
  results = []
  for doc in filtered_docs:
  # Find the original hit metadata for this document
- # This is a simple approach - in production you might want to track IDs
  for i, orig_doc in enumerate(documents):
  if orig_doc == doc or self._docs_match(orig_doc, doc):
  # Add metadata
@@ -611,9 +851,17 @@ class OpenSearchOperations:
  doc["_explanation"] = hit_metadata[i]["_explanation"]
  break
  results.append(doc)
+
+ pagination_stats = {
+ "documents_scanned": len(documents),
+ "documents_passed": len(results),
+ "filter_rate": (len(results) / len(documents) * 100) if documents else 0,
+ }
+
  else:
  # No Phase 2 needed, just extract documents
  results = []
+ hits = initial_hits # Use the initial hits
  for hit in hits:
  doc = hit["_source"].copy()
  # Preserve metadata
@@ -625,6 +873,8 @@ class OpenSearchOperations:
  doc["_explanation"] = hit["explanation"]
  results.append(doc)
 
+ pagination_stats = None
+
  # Return raw response if requested
  if kwargs.get("raw_response", False):
  return {
@@ -686,8 +936,8 @@ class OpenSearchOperations:
  "performance_impact": performance_impact,
  "optimizations_applied": [], # TODO: Track actual optimizations # noqa: W0511
  "opensearch_query": (
- opensearch_query.get("query", {}) if opensearch_query else {}
- ), # Include the query that was sent
+ complete_opensearch_query if "complete_opensearch_query" in locals() else {}
+ ), # Include the full query body
  "time_range": time_range,
  "timestamp_field": timestamp_field,
  "query_type": "regular", # Regular query (not stats)
@@ -701,15 +951,58 @@ class OpenSearchOperations:
  },
  }
 
+ # Add pagination stats if available
+ if pagination_stats:
+ result["post_processing_stats"] = pagination_stats
+
  # Add pagination info for non-scan queries
  if not scan_all:
- result["pagination"] = {
+ # Cap displayed total at 10000 for consistency
+ displayed_total = min(opensearch_total, 10000)
+
+ pagination_info = {
  "size": size,
  "from": from_,
- "total": opensearch_total,
- "has_more": opensearch_total > (from_ + len(results)),
+ "total": displayed_total,
+ "actual_total": opensearch_total, # Real total for reference
+ "returned": len(results),
  }
 
+ if needs_phase2 and pagination_stats:
+ # Post-processing was applied - update pagination to reflect auto-pagination
+ actual_last_position = pagination_stats.get("actual_to", from_ + size)
+
+ # Update from to reflect where we actually searched to
+ if pagination_stats["pages_checked"] > 1:
+ # We auto-paginated, so update the effective "from" position
+ pagination_info["from"] = from_
+ pagination_info["actual_from_searched"] = from_
+ pagination_info["actual_to_searched"] = actual_last_position
+ pagination_info["auto_paginated"] = True
+ pagination_info["pages_auto_fetched"] = pagination_stats["pages_checked"]
+
+ # Has more if we haven't reached the 10000 limit
+ pagination_info["has_more"] = actual_last_position < 10000 and actual_last_position < opensearch_total
+ pagination_info["documents_retrieved"] = pagination_stats["documents_retrieved"]
+ pagination_info["documents_filtered"] = pagination_stats["documents_filtered"]
+ pagination_info["filter_rate"] = f"{pagination_stats['filter_rate']:.1f}%"
+
+ # Calculate the last valid page number (page that contains the 10,000th record)
+ last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+ pagination_info["last_page"] = last_page
+ pagination_info["current_page"] = from_ // size
+ else:
+ # Regular pagination without post-processing
+ # Has more if we got full page and haven't reached 10000 limit
+ pagination_info["has_more"] = len(initial_hits) == size and (from_ + size < 10000)
+
+ # Calculate the last valid page number
+ last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+ pagination_info["last_page"] = last_page
+ pagination_info["current_page"] = from_ // size
+
+ result["pagination"] = pagination_info
+
  return result
 
  def _docs_match(self, doc1: Dict[str, Any], doc2: Dict[str, Any]) -> bool:
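
The pagination fields above follow from OpenSearch's default index.max_result_window of 10,000 (configurable per index): from + size cannot exceed that window, so the offset is capped and page numbers are derived from the ceiling. The arithmetic in isolation, as a sketch that assumes the default window:

    MAX_RESULT_WINDOW = 10000  # OpenSearch default index.max_result_window

    def page_bounds(from_: int, size: int, total: int):
        max_allowed_from = MAX_RESULT_WINDOW - size  # cap so from_ + size stays inside the window
        from_ = min(from_, max_allowed_from)
        last_page = min((MAX_RESULT_WINDOW - 1) // size, (total - 1) // size)
        return from_, from_ // size, last_page

    print(page_bounds(12000, 100, 250000))  # -> (9900, 99, 99): offset capped, page 99 of 99
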
@@ -733,7 +1026,7 @@ class OpenSearchOperations:
  def _extract_grouped_buckets( # noqa: C901
  self,
  aggs_response: Dict[str, Any],
- group_by_fields: List[str],
+ group_by_fields: List[Any],
  aggregations: List[Dict[str, Any]],
  stats_ast: Dict[str, Any],
  ) -> List[Dict[str, Any]]:
@@ -741,7 +1034,7 @@ class OpenSearchOperations:
 
  Args:
  aggs_response: OpenSearch aggregations response
- group_by_fields: List of fields used for grouping
+ group_by_fields: List of fields used for grouping (can be strings or dicts)
  aggregations: List of aggregation specifications
  stats_ast: The stats AST for reference
 
@@ -750,9 +1043,19 @@ class OpenSearchOperations:
750
1043
  """
751
1044
  buckets = []
752
1045
 
1046
+ # Normalize group_by_fields to extract field names
1047
+ normalized_fields = []
1048
+ for field in group_by_fields:
1049
+ if isinstance(field, str):
1050
+ normalized_fields.append(field)
1051
+ elif isinstance(field, dict) and "field" in field:
1052
+ normalized_fields.append(field["field"])
1053
+ else:
1054
+ normalized_fields.append(str(field))
1055
+
753
1056
  # For single-level grouping
754
- if len(group_by_fields) == 1:
755
- field = group_by_fields[0]
1057
+ if len(normalized_fields) == 1:
1058
+ field = normalized_fields[0]
756
1059
  # Look for the terms aggregation with the group field name
757
1060
  terms_agg_name = f"group_by_{field}"
758
1061
 
@@ -804,6 +1107,16 @@ class OpenSearchOperations:
  bucket_result[output_key] = agg_value["value"]
  else:
  bucket_result[output_key] = agg_value
+ else:
+ # For count(*), also check doc_count
+ if func == "count" and field_name == "*":
+ bucket_result[output_key] = bucket.get("doc_count", 0)
+ else:
+ # Try to find any aggregation value in the bucket
+ for key, value in bucket.items():
+ if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+ bucket_result[output_key] = value["value"]
+ break
 
  buckets.append(bucket_result)
 
@@ -813,7 +1126,7 @@ class OpenSearchOperations:
  current_agg = aggs_response
 
  # Find the first group_by aggregation
- for field in group_by_fields:
+ for field in normalized_fields:
  group_key = f"group_by_{field}"
  if group_key in current_agg:
  current_agg = current_agg[group_key]
@@ -824,7 +1137,7 @@ class OpenSearchOperations:
 
  # Process nested buckets recursively
  if "buckets" in current_agg:
- buckets = self._process_nested_buckets(current_agg["buckets"], group_by_fields, aggregations, 0)
+ buckets = self._process_nested_buckets(current_agg["buckets"], normalized_fields, aggregations, 0)
 
  return buckets
 
@@ -839,7 +1152,7 @@ class OpenSearchOperations:
 
  Args:
  buckets_data: List of bucket data from OpenSearch
- group_by_fields: List of fields used for grouping
+ group_by_fields: List of fields used for grouping (already normalized to strings)
  aggregations: List of aggregation specifications
  level: Current nesting level (0-based)
 
@@ -903,6 +1216,16 @@ class OpenSearchOperations:
  result[output_key] = agg_value["value"]
  else:
  result[output_key] = agg_value
+ else:
+ # For count(*), also check doc_count
+ if func == "count" and field_name == "*":
+ result[output_key] = bucket.get("doc_count", 0)
+ else:
+ # Try to find any aggregation value in the bucket
+ for key, value in bucket.items():
+ if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+ result[output_key] = value["value"]
+ break
 
  results.append(result)
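
The doc_count fallback added in the last two hunks works because an OpenSearch terms aggregation reports each bucket's document count alongside any named metric sub-aggregations, which wrap their result in a "value" key. A sketch of that extraction against an illustrative response (the aggregation and field names are made up for the example):

    aggs_response = {
        "group_by_status": {
            "buckets": [
                {"key": "allowed", "doc_count": 42, "avg_duration": {"value": 1.7}},
                {"key": "blocked", "doc_count": 7},
            ]
        }
    }

    rows = []
    for bucket in aggs_response["group_by_status"]["buckets"]:
        row = {"status": bucket["key"], "count": bucket["doc_count"]}  # count(*) comes from doc_count
        if "avg_duration" in bucket:
            row["avg_duration"] = bucket["avg_duration"]["value"]  # metric sub-aggregation value
        rows.append(row)
    print(rows)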