tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,11 +45,20 @@ class OpenSearchOperations:
  # Parse the query
  ast = self.parser.parse(query)
 
+ # Analyze the query for mutators
+ from ..mutator_analyzer import MutatorAnalyzer
+
+ analyzer = MutatorAnalyzer(self.field_mappings)
+ analysis_result = analyzer.analyze_ast(ast, context="opensearch")
+
+ # Use the optimized AST (with array operators removed)
+ optimized_ast = analysis_result.optimized_ast
+
  # Create OpenSearch backend
  backend = OpenSearchBackend(field_mappings=self.field_mappings)
 
- # Convert to OpenSearch query
- opensearch_query = backend.convert(ast)
+ # Convert to OpenSearch query using the optimized AST
+ opensearch_query = backend.convert(optimized_ast)
 
  return opensearch_query
 
@@ -96,7 +105,7 @@ class OpenSearchOperations:
  }
 
  # Create analyzer
- analyzer = MutatorAnalyzer(self.enhanced_mappings)
+ analyzer = MutatorAnalyzer(self.field_mappings)
 
  # Analyze the AST
  return analyzer.analyze_ast(ast)
@@ -141,7 +150,7 @@ class OpenSearchOperations:
  query: str,
  index: Optional[str] = None,
  size: int = 10000,
- search_after: Optional[List[Any]] = None,
+ from_: int = 0,
  sort: Optional[List[Dict[str, Any]]] = None,
  source_includes: Optional[List[str]] = None,
  source_excludes: Optional[List[str]] = None,
@@ -175,8 +184,8 @@ class OpenSearchOperations:
  query: TQL query string
  index: OpenSearch index name (uses environment variable if not provided)
  size: Maximum number of results to return (default: 10000)
- search_after: Values from previous result for pagination
- sort: List of sort specifications (required for search_after)
+ from_: Starting offset for pagination (max 10000 - size)
+ sort: List of sort specifications
  source_includes: Fields to include in response
  source_excludes: Fields to exclude from response
  track_total_hits: Whether to track total hit count
@@ -226,6 +235,13 @@ class OpenSearchOperations:
  is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
 
  if is_stats_query:
+ # Analyze the query to check for mutators
+ analysis_result = self.analyze_opensearch_query(query)
+ has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+ needs_post_processing_for_stats = (
+ has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+ )
+
  # Handle stats queries differently
  from ..opensearch_stats import OpenSearchStatsTranslator
 
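Note on the `needs_post_processing_for_stats` expression in the hunk above: Python's conditional expression binds more loosely than `and`, so the `if has_mutators` guard makes the leading `has_mutators and` redundant. An equivalent, more explicit reading of the same logic:

    # Equivalent form of the expression added above (no behavior change):
    needs_post_processing_for_stats = (
        bool(analysis_result.post_processing_requirements) if has_mutators else False
    )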
@@ -240,7 +256,11 @@ class OpenSearchOperations:
  # Convert filter to OpenSearch query
  backend = OpenSearchBackend(field_mappings=self.field_mappings)
  if filter_ast:
- filter_query = backend.convert(filter_ast)["query"]
+ # Use the optimized AST if we have mutators
+ if has_mutators and needs_post_processing_for_stats:
+ filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+ else:
+ filter_query = backend.convert(filter_ast)["query"]
  else:
  filter_query = {"match_all": {}}
  else:
@@ -248,19 +268,27 @@ class OpenSearchOperations:
  stats_ast = ast
  filter_query = {"match_all": {}}
 
- # Build aggregations
- if stats_ast:
- stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+ # For stats queries with post-processing mutators, we need to handle them differently
+ if needs_post_processing_for_stats:
+ # We'll need to fetch all documents and aggregate in memory
+ opensearch_query = {"query": filter_query}
+ needs_phase2 = True
+ # Store the stats AST for later processing
+ stats_ast_for_post_processing = stats_ast
  else:
- stats_result = {"aggs": {}}
+ # Build aggregations for direct OpenSearch execution
+ if stats_ast:
+ stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+ else:
+ stats_result = {"aggs": {}}
 
- # Extract the aggregations (translate_stats returns {"aggs": {...}})
- aggregations = stats_result.get("aggs", {})
+ # Extract the aggregations (translate_stats returns {"aggs": {...}})
+ aggregations = stats_result.get("aggs", {})
 
- # Build the complete query
- opensearch_query = {"query": filter_query, "aggs": aggregations}
- needs_phase2 = False
- has_mutators = False
+ # Build the complete query
+ opensearch_query = {"query": filter_query, "aggs": aggregations}
+ needs_phase2 = False
+ stats_ast_for_post_processing = None
  else:
  # Parse and analyze the query normally
  analysis_result = self.analyze_opensearch_query(query)
@@ -316,19 +344,49 @@ class OpenSearchOperations:
  base_query = search_body.get("query", {})
  time_filter = {"range": {timestamp_field: time_range}}
 
- # Wrap the existing query with time filter
+ # Wrap the existing query with time filter in filter context
  if base_query:
- search_body["query"] = {"bool": {"must": [base_query, time_filter]}}
+ # If the base query is already a bool query, add to its filter array
+ if isinstance(base_query, dict) and base_query.get("bool"):
+ bool_query = base_query["bool"]
+ if "filter" in bool_query:
+ # Add to existing filter array
+ if isinstance(bool_query["filter"], list):
+ bool_query["filter"].append(time_filter)
+ else:
+ # Convert single filter to array
+ bool_query["filter"] = [bool_query["filter"], time_filter]
+ else:
+ # No filter array yet, create one
+ bool_query["filter"] = [time_filter]
+ search_body["query"] = base_query
+ else:
+ # Wrap in bool query with filter
+ search_body["query"] = {"bool": {"filter": [base_query, time_filter]}}
  else:
  search_body["query"] = time_filter
 
- search_body.update({"size": size, "track_total_hits": track_total_hits})
+ # For stats queries, set size based on whether we need documents for post-processing
+ if is_stats_query:
+ if needs_phase2:
+ # Need all documents for post-processing
+ search_body.update({"size": 10000, "track_total_hits": track_total_hits})
+ else:
+ # Pure aggregation query - no documents needed
+ search_body.update({"size": 0, "track_total_hits": track_total_hits})
+ else:
+ search_body.update({"size": size, "track_total_hits": track_total_hits})
 
  # Add optional parameters
  if sort:
  search_body["sort"] = sort
- if search_after:
- search_body["search_after"] = search_after
+
+ # Add from parameter for pagination (limit to 10000 total)
+ if from_ > 0:
+ # Ensure we don't exceed the 10000 limit
+ max_allowed_from = 10000 - size
+ from_ = min(from_, max_allowed_from)
+ search_body["from"] = from_
  if source_includes or source_excludes:
  search_body["_source"] = {}
  if source_includes:
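Note: the hunk above makes two behavioral changes. The time-range clause now lands in the bool query's `filter` array (filter context, so it is unscored and cacheable) instead of a `must` clause, and offset pagination via `from` replaces `search_after`, with the offset capped so that `from + size` stays inside OpenSearch's default 10,000-result window. A minimal sketch of the resulting request body; the field names and values are illustrative, not taken from this package:

    # Sketch only: how a base bool query and a time filter combine after this change.
    base_query = {"bool": {"must": [{"term": {"event.category": "network"}}]}}
    time_filter = {"range": {"@timestamp": {"gte": "now-1h"}}}
    base_query["bool"].setdefault("filter", []).append(time_filter)  # simplified; the diff also handles a non-list filter

    size, from_ = 100, 9950
    from_ = min(from_, 10000 - size)  # capped at 9900 so from_ + size stays within 10,000
    search_body = {"query": base_query, "size": size, "from": from_, "track_total_hits": True}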
@@ -341,6 +399,9 @@ class OpenSearchOperations:
  # Add any additional parameters from kwargs
  search_body.update(kwargs)
 
+ # Store the complete search body for debugging
+ complete_opensearch_query = search_body.copy()
+
  # Build search parameters
  search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
 
@@ -408,43 +469,125 @@ class OpenSearchOperations:
 
  # Handle stats query results differently
  if is_stats_query:
- # Process stats aggregation results
- aggs_response = response.get("aggregations", {})
+ if needs_phase2 and "stats_ast_for_post_processing" in locals():
+ # Stats query with post-processing - need to aggregate in memory
+ # First, get all documents and apply mutators
+ all_documents = []
+
+ # Handle scroll for large datasets
+ if scan_all or needs_phase2:
+ # Use scroll to get all documents
+ scroll_params = search_params.copy()
+ scroll_params["scroll"] = scroll_timeout
+ scroll_params["body"]["size"] = min(10000, scroll_size)
 
- # Format the stats results based on the test expectations
- # Use the correct stats AST
- if ast.get("type") == "query_with_stats":
- stats_ast = ast.get("stats")
- else:
- stats_ast = ast
+ try:
+ # Initial search
+ scroll_response = client.search(**scroll_params)
+ scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+ while scroll_hits:
+ for hit in scroll_hits:
+ all_documents.append(hit["_source"])
+
+ scroll_id = scroll_response.get("_scroll_id")
+ if not scroll_id:
+ break
+
+ scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+ scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+ # Clean up scroll
+ if scroll_id:
+ try:
+ client.clear_scroll(scroll_id=scroll_id)
+ except Exception:
+ pass
+ except Exception as e:
+ raise TQLExecutionError(f"Failed to fetch documents for stats post-processing: {str(e)}")
+ else:
+ # Fetch documents with regular pagination
+ for hit in response.get("hits", {}).get("hits", []):
+ all_documents.append(hit["_source"])
+
+ # Apply post-processing mutators
+ if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+ processor = QueryPostProcessor()
+ processed_docs = processor.process_results(
+ all_documents, analysis_result.post_processing_requirements, track_enrichments=False
+ )
+ # Filter if needed
+ filtered_docs = processor.filter_results(
+ processed_docs, analysis_result.post_processing_requirements
+ )
+ else:
+ filtered_docs = all_documents
+
+ # Now perform in-memory aggregation
+ from ..stats_evaluator import TQLStatsEvaluator
+
+ stats_evaluator = TQLStatsEvaluator()
+
+ # Execute the stats aggregation in memory
+ stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
+
+ # Format response for stats-only (no documents)
+ result = {
+ "stats": stats_results,
+ "total": len(filtered_docs),
+ "post_processing_applied": True,
+ "health_status": "red",
+ "health_reasons": [
+ {
+ "status": "red",
+ "query_part": "stats with post-processing",
+ "reason": f"Stats query required fetching {len(all_documents)} documents for post-processing",
+ }
+ ],
+ "performance_impact": {
+ "overhead_ms": 0, # Would need timing to calculate
+ "documents_processed": len(all_documents),
+ "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+ },
+ "opensearch_query": complete_opensearch_query,
+ }
 
- # Extract aggregation info
- if stats_ast:
- aggregations = stats_ast.get("aggregations", [])
- group_by_fields = stats_ast.get("group_by", [])
+ return result
  else:
- aggregations = []
- group_by_fields = []
+ # Regular stats query using OpenSearch aggregations
+ aggs_response = response.get("aggregations", {})
 
- # Format results differently based on whether we have grouping
- if group_by_fields:
- # For grouped stats, we need to extract buckets
+ # Format the stats results based on the test expectations
+ # Use the correct stats AST
+ if ast.get("type") == "query_with_stats":
+ stats_ast = ast.get("stats")
+ else:
+ stats_ast = ast
+
+ # Extract aggregation info
  if stats_ast:
- buckets = self._extract_grouped_buckets(aggs_response, group_by_fields, aggregations, stats_ast)
+ aggregations = stats_ast.get("aggregations", [])
+ group_by_fields = stats_ast.get("group_by", [])
  else:
- buckets = []
-
- # For multiple aggregations, include all operations
- operations = [agg.get("function") for agg in aggregations]
- fields = [agg.get("field") for agg in aggregations]
-
- stats_results = {
- "type": "stats_grouped",
- "operation": operations[0] if len(operations) == 1 else operations,
- "field": fields[0] if len(fields) == 1 else fields,
- "results": buckets, # Array of buckets for grouped results
- "group_by": group_by_fields,
- }
+ aggregations = []
+ group_by_fields = []
+
+ # Format results differently based on whether we have grouping
+ if group_by_fields:
+ # Use the OpenSearchStatsTranslator to properly transform the response
+ from ..opensearch_stats import OpenSearchStatsTranslator
+
+ translator = OpenSearchStatsTranslator()
+
+ # Transform the response using the translator
+ transformed_response = translator.transform_response(response, stats_ast)
+
+ # The transformed response already has the correct structure
+ stats_results = transformed_response
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  # Simple aggregations without grouping
  if aggregations:
@@ -488,6 +631,10 @@ class OpenSearchOperations:
  "values": value,
  "group_by": [],
  }
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  # Multiple aggregations
  agg_results = {}
@@ -526,50 +673,143 @@ class OpenSearchOperations:
  "type": "stats",
  "results": agg_results,
  }
+
+ # Add viz_hint if present in stats AST
+ if stats_ast and stats_ast.get("viz_hint"):
+ stats_results["viz_hint"] = stats_ast["viz_hint"]
  else:
  stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
 
- # Extract hits if size > 0
- hits = response.get("hits", {}).get("hits", [])
- documents = []
- if size > 0 and hits:
- for hit in hits:
- doc = hit["_source"].copy()
- # Preserve metadata
- if "_id" in hit:
- doc["_id"] = hit["_id"]
- if "_score" in hit:
- doc["_score"] = hit["_score"]
- documents.append(doc)
-
- # Return in the expected format
+ # For stats queries, return only stats (no documents)
+ # Total from aggregation metadata or hit count
+ total_count = response.get("hits", {}).get("total", {}).get("value", 0)
+
+ # Return stats-only format
  result = {
- "results": documents,
- "total": response.get("hits", {}).get("total", {}).get("value", 0),
  "stats": stats_results,
+ "total": total_count,
  "post_processing_applied": False,
- "health_status": "HEALTHY",
+ "health_status": "green",
  "health_reasons": [],
  "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
- "scan_info": {"used_scan": False},
+ "opensearch_query": complete_opensearch_query,
+ "query_type": "stats",
  }
 
- # Add query_type if documents were requested
- if size > 0:
- result["query_type"] = "stats_with_docs"
-
  return result
 
  # Extract hits for regular queries
- hits = response.get("hits", {}).get("hits", [])
+ initial_hits = response.get("hits", {}).get("hits", [])
  total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
 
  # Process results based on whether we need Phase 2
- if needs_phase2:
- # Apply Phase 2 processing
+ if needs_phase2 and not scan_all:
+ # Pagination with post-processing - continue fetching pages until we get results
  processor = QueryPostProcessor()
+ results: List[Dict[str, Any]] = []
+ total_documents_before_filter = 0
+ total_documents_after_filter = 0
+ current_from = from_
+ pages_checked = 0
+ max_pages_to_check = min(10, (total_hits // size) + 1) if size > 0 else 1 # Limit to prevent infinite loops
+
+ while len(results) < size and pages_checked < max_pages_to_check and current_from < total_hits:
+ # Fetch current page
+ if pages_checked > 0:
+ # Need to fetch next page
+ search_params["body"]["from"] = current_from
+ try:
+ response = client.search(**search_params)
+ except Exception as e:
+ raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+ current_hits = response.get("hits", {}).get("hits", [])
+ else:
+ # Use initial hits for first page
+ current_hits = initial_hits
+
+ if not current_hits:
+ break # No more results
+
+ # Process the hits from this page
+ documents = []
+ hit_metadata = []
+ for hit in current_hits:
+ documents.append(hit["_source"])
+ hit_metadata.append(
+ {
+ "_id": hit.get("_id"),
+ "_score": hit.get("_score"),
+ "_explanation": hit.get("_explanation") if explain else None,
+ }
+ )
+
+ total_documents_before_filter += len(documents)
+
+ # Apply post-processing
+ if isinstance(analysis_result, MutatorAnalysisResult):
+ processed_docs = processor.process_results(
+ documents,
+ analysis_result.post_processing_requirements,
+ track_enrichments=kwargs.get("save_enrichment", False),
+ )
 
- # Extract documents from hits
+ # Filter results
+ filtered_docs = processor.filter_results(
+ processed_docs, analysis_result.post_processing_requirements
+ )
+ else:
+ processed_docs = documents
+ filtered_docs = documents
+
+ # Add filtered results with metadata
+ for doc in filtered_docs:
+ if len(results) >= size:
+ break # We have enough results
+
+ # Find the original hit metadata
+ for i, orig_doc in enumerate(documents):
+ if orig_doc == doc or self._docs_match(orig_doc, doc):
+ # Add metadata
+ if hit_metadata[i]["_id"]:
+ doc["_id"] = hit_metadata[i]["_id"]
+ if hit_metadata[i]["_score"]:
+ doc["_score"] = hit_metadata[i]["_score"]
+ if hit_metadata[i]["_explanation"]:
+ doc["_explanation"] = hit_metadata[i]["_explanation"]
+ break
+ results.append(doc)
+
+ total_documents_after_filter += len(filtered_docs)
+
+ # Move to next page
+ current_from += size
+ pages_checked += 1
+
+ # Store filtering stats
+ pagination_stats = {
+ "page_size": size,
+ "pages_checked": pages_checked,
+ "documents_retrieved": total_documents_before_filter,
+ "documents_returned": len(results),
+ "documents_filtered": total_documents_before_filter - total_documents_after_filter,
+ "filter_rate": (
+ (
+ (total_documents_before_filter - total_documents_after_filter)
+ / total_documents_before_filter
+ * 100
+ )
+ if total_documents_before_filter > 0
+ else 0
+ ),
+ "actual_from": from_, # Original from
+ "actual_to": current_from, # Where we ended up searching to
+ }
+
+ elif needs_phase2 and scan_all:
+ # scan_all mode with post-processing - process all results
+ processor = QueryPostProcessor()
+
+ # Extract all documents from hits
  documents = []
  hit_metadata = []
  for hit in hits:
@@ -590,10 +830,9 @@ class OpenSearchOperations:
  track_enrichments=kwargs.get("save_enrichment", False),
  )
 
- # Then filter results based on requirements (e.g., ALL operator, contains with mutators)
+ # Then filter results based on requirements
  filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
  else:
- # No post-processing needed
  processed_docs = documents
  filtered_docs = documents
 
@@ -601,7 +840,6 @@ class OpenSearchOperations:
  results = []
  for doc in filtered_docs:
  # Find the original hit metadata for this document
- # This is a simple approach - in production you might want to track IDs
  for i, orig_doc in enumerate(documents):
  if orig_doc == doc or self._docs_match(orig_doc, doc):
  # Add metadata
@@ -613,9 +851,17 @@ class OpenSearchOperations:
  doc["_explanation"] = hit_metadata[i]["_explanation"]
  break
  results.append(doc)
+
+ pagination_stats = {
+ "documents_scanned": len(documents),
+ "documents_passed": len(results),
+ "filter_rate": (len(results) / len(documents) * 100) if documents else 0,
+ }
+
  else:
  # No Phase 2 needed, just extract documents
  results = []
+ hits = initial_hits # Use the initial hits
  for hit in hits:
  doc = hit["_source"].copy()
  # Preserve metadata
@@ -627,6 +873,8 @@ class OpenSearchOperations:
  doc["_explanation"] = hit["explanation"]
  results.append(doc)
 
+ pagination_stats = None
+
  # Return raw response if requested
  if kwargs.get("raw_response", False):
  return {
@@ -688,8 +936,8 @@ class OpenSearchOperations:
  "performance_impact": performance_impact,
  "optimizations_applied": [], # TODO: Track actual optimizations # noqa: W0511
  "opensearch_query": (
- opensearch_query.get("query", {}) if opensearch_query else {}
- ), # Include the query that was sent
+ complete_opensearch_query if "complete_opensearch_query" in locals() else {}
+ ), # Include the full query body
  "time_range": time_range,
  "timestamp_field": timestamp_field,
  "query_type": "regular", # Regular query (not stats)
@@ -703,19 +951,57 @@ class OpenSearchOperations:
  },
  }
 
+ # Add pagination stats if available
+ if pagination_stats:
+ result["post_processing_stats"] = pagination_stats
+
  # Add pagination info for non-scan queries
  if not scan_all:
- result["pagination"] = {
+ # Cap displayed total at 10000 for consistency
+ displayed_total = min(opensearch_total, 10000)
+
+ pagination_info = {
  "size": size,
- "total": opensearch_total,
- "has_more": len(hits) == size, # If we got a full page, there might be more
+ "from": from_,
+ "total": displayed_total,
+ "actual_total": opensearch_total, # Real total for reference
+ "returned": len(results),
  }
 
- # Add sort values from the last hit for search_after pagination
- if hits and sort:
- last_hit = hits[-1]
- if "sort" in last_hit:
- result["sort_values"] = last_hit["sort"]
+ if needs_phase2 and pagination_stats:
+ # Post-processing was applied - update pagination to reflect auto-pagination
+ actual_last_position = pagination_stats.get("actual_to", from_ + size)
+
+ # Update from to reflect where we actually searched to
+ if pagination_stats["pages_checked"] > 1:
+ # We auto-paginated, so update the effective "from" position
+ pagination_info["from"] = from_
+ pagination_info["actual_from_searched"] = from_
+ pagination_info["actual_to_searched"] = actual_last_position
+ pagination_info["auto_paginated"] = True
+ pagination_info["pages_auto_fetched"] = pagination_stats["pages_checked"]
+
+ # Has more if we haven't reached the 10000 limit
+ pagination_info["has_more"] = actual_last_position < 10000 and actual_last_position < opensearch_total
+ pagination_info["documents_retrieved"] = pagination_stats["documents_retrieved"]
+ pagination_info["documents_filtered"] = pagination_stats["documents_filtered"]
+ pagination_info["filter_rate"] = f"{pagination_stats['filter_rate']:.1f}%"
+
+ # Calculate the last valid page number (page that contains the 10,000th record)
+ last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+ pagination_info["last_page"] = last_page
+ pagination_info["current_page"] = from_ // size
+ else:
+ # Regular pagination without post-processing
+ # Has more if we got full page and haven't reached 10000 limit
+ pagination_info["has_more"] = len(initial_hits) == size and (from_ + size < 10000)
+
+ # Calculate the last valid page number
+ last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+ pagination_info["last_page"] = last_page
+ pagination_info["current_page"] = from_ // size
+
+ result["pagination"] = pagination_info
 
  return result
 
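Note: a worked example of the pagination arithmetic introduced above, using illustrative numbers rather than values from the diff:

    # Illustrative values: 100 results per page, offset 300, 25,000 matching documents.
    size, from_, opensearch_total = 100, 300, 25000
    returned = 100                                    # hits actually returned for this page

    displayed_total = min(opensearch_total, 10000)    # 10000 (capped for display)
    last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)   # min(99, 249) = 99
    current_page = from_ // size                      # 3
    has_more = returned == size and (from_ + size < 10000)                 # True: next page starts at 400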
@@ -740,7 +1026,7 @@ class OpenSearchOperations:
  def _extract_grouped_buckets( # noqa: C901
  self,
  aggs_response: Dict[str, Any],
- group_by_fields: List[str],
+ group_by_fields: List[Any],
  aggregations: List[Dict[str, Any]],
  stats_ast: Dict[str, Any],
  ) -> List[Dict[str, Any]]:
@@ -748,7 +1034,7 @@ class OpenSearchOperations:
 
  Args:
  aggs_response: OpenSearch aggregations response
- group_by_fields: List of fields used for grouping
+ group_by_fields: List of fields used for grouping (can be strings or dicts)
  aggregations: List of aggregation specifications
  stats_ast: The stats AST for reference
 
@@ -757,9 +1043,19 @@ class OpenSearchOperations:
  """
  buckets = []
 
+ # Normalize group_by_fields to extract field names
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ normalized_fields.append(field)
+ elif isinstance(field, dict) and "field" in field:
+ normalized_fields.append(field["field"])
+ else:
+ normalized_fields.append(str(field))
+
  # For single-level grouping
- if len(group_by_fields) == 1:
- field = group_by_fields[0]
+ if len(normalized_fields) == 1:
+ field = normalized_fields[0]
  # Look for the terms aggregation with the group field name
  terms_agg_name = f"group_by_{field}"
 
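Note: the normalization loop above lets `group_by` entries be either bare field names or parser-produced dicts carrying a "field" key; anything else falls back to `str()`. For example:

    # Illustrative input; the dict shape with a "field" key mirrors the branch above.
    group_by_fields = ["destination.port", {"field": "source.ip"}]
    # After the loop: normalized_fields == ["destination.port", "source.ip"]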
@@ -811,6 +1107,16 @@ class OpenSearchOperations:
  bucket_result[output_key] = agg_value["value"]
  else:
  bucket_result[output_key] = agg_value
+ else:
+ # For count(*), also check doc_count
+ if func == "count" and field_name == "*":
+ bucket_result[output_key] = bucket.get("doc_count", 0)
+ else:
+ # Try to find any aggregation value in the bucket
+ for key, value in bucket.items():
+ if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+ bucket_result[output_key] = value["value"]
+ break
 
  buckets.append(bucket_result)
 
@@ -820,7 +1126,7 @@ class OpenSearchOperations:
  current_agg = aggs_response
 
  # Find the first group_by aggregation
- for field in group_by_fields:
+ for field in normalized_fields:
  group_key = f"group_by_{field}"
  if group_key in current_agg:
  current_agg = current_agg[group_key]
@@ -831,7 +1137,7 @@ class OpenSearchOperations:
 
  # Process nested buckets recursively
  if "buckets" in current_agg:
- buckets = self._process_nested_buckets(current_agg["buckets"], group_by_fields, aggregations, 0)
+ buckets = self._process_nested_buckets(current_agg["buckets"], normalized_fields, aggregations, 0)
 
  return buckets
 
@@ -846,7 +1152,7 @@ class OpenSearchOperations:
 
  Args:
  buckets_data: List of bucket data from OpenSearch
- group_by_fields: List of fields used for grouping
+ group_by_fields: List of fields used for grouping (already normalized to strings)
  aggregations: List of aggregation specifications
  level: Current nesting level (0-based)
 
@@ -910,6 +1216,16 @@ class OpenSearchOperations:
  result[output_key] = agg_value["value"]
  else:
  result[output_key] = agg_value
+ else:
+ # For count(*), also check doc_count
+ if func == "count" and field_name == "*":
+ result[output_key] = bucket.get("doc_count", 0)
+ else:
+ # Try to find any aggregation value in the bucket
+ for key, value in bucket.items():
+ if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+ result[output_key] = value["value"]
+ break
 
  results.append(result)