tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/METADATA +24 -1
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/RECORD +27 -27
- tql/core.py +225 -54
- tql/core_components/opensearch_operations.py +415 -99
- tql/core_components/stats_operations.py +11 -1
- tql/evaluator.py +39 -2
- tql/evaluator_components/special_expressions.py +25 -6
- tql/evaluator_components/value_comparison.py +31 -3
- tql/mutator_analyzer.py +640 -242
- tql/mutators/__init__.py +5 -1
- tql/mutators/dns.py +76 -53
- tql/mutators/security.py +101 -100
- tql/mutators/string.py +74 -0
- tql/opensearch_components/field_mapping.py +9 -3
- tql/opensearch_components/lucene_converter.py +12 -0
- tql/opensearch_components/query_converter.py +134 -25
- tql/opensearch_mappings.py +2 -2
- tql/opensearch_stats.py +170 -39
- tql/parser.py +92 -37
- tql/parser_components/ast_builder.py +37 -1
- tql/parser_components/field_extractor.py +9 -1
- tql/parser_components/grammar.py +32 -8
- tql/post_processor.py +489 -31
- tql/stats_evaluator.py +170 -12
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/LICENSE +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/WHEEL +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/entry_points.txt +0 -0
--- a/tql/core_components/opensearch_operations.py
+++ b/tql/core_components/opensearch_operations.py
@@ -45,11 +45,20 @@ class OpenSearchOperations:
         # Parse the query
         ast = self.parser.parse(query)
 
+        # Analyze the query for mutators
+        from ..mutator_analyzer import MutatorAnalyzer
+
+        analyzer = MutatorAnalyzer(self.field_mappings)
+        analysis_result = analyzer.analyze_ast(ast, context="opensearch")
+
+        # Use the optimized AST (with array operators removed)
+        optimized_ast = analysis_result.optimized_ast
+
         # Create OpenSearch backend
         backend = OpenSearchBackend(field_mappings=self.field_mappings)
 
-        # Convert to OpenSearch query
-        opensearch_query = backend.convert(
+        # Convert to OpenSearch query using the optimized AST
+        opensearch_query = backend.convert(optimized_ast)
 
         return opensearch_query
 
@@ -96,7 +105,7 @@ class OpenSearchOperations:
         }
 
         # Create analyzer
-        analyzer = MutatorAnalyzer(self.
+        analyzer = MutatorAnalyzer(self.field_mappings)
 
         # Analyze the AST
         return analyzer.analyze_ast(ast)
@@ -141,7 +150,7 @@ class OpenSearchOperations:
         query: str,
         index: Optional[str] = None,
         size: int = 10000,
-
+        from_: int = 0,
         sort: Optional[List[Dict[str, Any]]] = None,
         source_includes: Optional[List[str]] = None,
         source_excludes: Optional[List[str]] = None,
@@ -175,8 +184,8 @@ class OpenSearchOperations:
             query: TQL query string
             index: OpenSearch index name (uses environment variable if not provided)
             size: Maximum number of results to return (default: 10000)
-
-            sort: List of sort specifications
+            from_: Starting offset for pagination (max 10000 - size)
+            sort: List of sort specifications
             source_includes: Fields to include in response
             source_excludes: Fields to exclude from response
             track_total_hits: Whether to track total hit count
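The new `from_` parameter pages through results inside OpenSearch's default 10,000-document result window, so the usable offset is `10000 - size`. A minimal sketch of that clamping rule (a standalone illustration, not code from the package):

```python
def clamp_from(from_: int, size: int, max_window: int = 10000) -> int:
    """Keep from_ + size inside the result window (index.max_result_window defaults to 10000)."""
    max_allowed_from = max_window - size
    return max(0, min(from_, max_allowed_from))


# A page of 500 requested at offset 9800 is pulled back so 9500 + 500 == 10000
assert clamp_from(9800, 500) == 9500
assert clamp_from(200, 500) == 200  # small offsets pass through unchanged
```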
@@ -226,6 +235,13 @@ class OpenSearchOperations:
         is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
 
         if is_stats_query:
+            # Analyze the query to check for mutators
+            analysis_result = self.analyze_opensearch_query(query)
+            has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+            needs_post_processing_for_stats = (
+                has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+            )
+
             # Handle stats queries differently
             from ..opensearch_stats import OpenSearchStatsTranslator
 
@@ -240,7 +256,11 @@ class OpenSearchOperations:
                 # Convert filter to OpenSearch query
                 backend = OpenSearchBackend(field_mappings=self.field_mappings)
                 if filter_ast:
-
+                    # Use the optimized AST if we have mutators
+                    if has_mutators and needs_post_processing_for_stats:
+                        filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+                    else:
+                        filter_query = backend.convert(filter_ast)["query"]
                 else:
                     filter_query = {"match_all": {}}
             else:
@@ -248,19 +268,27 @@ class OpenSearchOperations:
                 stats_ast = ast
                 filter_query = {"match_all": {}}
 
-            #
-            if
-
+            # For stats queries with post-processing mutators, we need to handle them differently
+            if needs_post_processing_for_stats:
+                # We'll need to fetch all documents and aggregate in memory
+                opensearch_query = {"query": filter_query}
+                needs_phase2 = True
+                # Store the stats AST for later processing
+                stats_ast_for_post_processing = stats_ast
             else:
-
+                # Build aggregations for direct OpenSearch execution
+                if stats_ast:
+                    stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+                else:
+                    stats_result = {"aggs": {}}
 
-
-
+                # Extract the aggregations (translate_stats returns {"aggs": {...}})
+                aggregations = stats_result.get("aggs", {})
 
-
-
-
-
+                # Build the complete query
+                opensearch_query = {"query": filter_query, "aggs": aggregations}
+                needs_phase2 = False
+                stats_ast_for_post_processing = None
         else:
             # Parse and analyze the query normally
             analysis_result = self.analyze_opensearch_query(query)
@@ -316,19 +344,49 @@ class OpenSearchOperations:
             base_query = search_body.get("query", {})
             time_filter = {"range": {timestamp_field: time_range}}
 
-            # Wrap the existing query with time filter
+            # Wrap the existing query with time filter in filter context
             if base_query:
-
+                # If the base query is already a bool query, add to its filter array
+                if isinstance(base_query, dict) and base_query.get("bool"):
+                    bool_query = base_query["bool"]
+                    if "filter" in bool_query:
+                        # Add to existing filter array
+                        if isinstance(bool_query["filter"], list):
+                            bool_query["filter"].append(time_filter)
+                        else:
+                            # Convert single filter to array
+                            bool_query["filter"] = [bool_query["filter"], time_filter]
+                    else:
+                        # No filter array yet, create one
+                        bool_query["filter"] = [time_filter]
+                    search_body["query"] = base_query
+                else:
+                    # Wrap in bool query with filter
+                    search_body["query"] = {"bool": {"filter": [base_query, time_filter]}}
             else:
                 search_body["query"] = time_filter
 
-
+        # For stats queries, set size based on whether we need documents for post-processing
+        if is_stats_query:
+            if needs_phase2:
+                # Need all documents for post-processing
+                search_body.update({"size": 10000, "track_total_hits": track_total_hits})
+            else:
+                # Pure aggregation query - no documents needed
+                search_body.update({"size": 0, "track_total_hits": track_total_hits})
+        else:
+            search_body.update({"size": size, "track_total_hits": track_total_hits})
 
         # Add optional parameters
         if sort:
             search_body["sort"] = sort
-
-
+
+        # Add from parameter for pagination (limit to 10000 total)
+        if from_ > 0:
+            # Ensure we don't exceed the 10000 limit
+            max_allowed_from = 10000 - size
+            from_ = min(from_, max_allowed_from)
+            search_body["from"] = from_
         if source_includes or source_excludes:
             search_body["_source"] = {}
             if source_includes:
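The reworked time-range handling adds the `range` clause in filter (non-scoring) context, appending to an existing `bool.filter` instead of wrapping the whole query. The same merge logic as a standalone sketch (field names in the usage example are illustrative):

```python
from typing import Any, Dict


def add_time_filter(base_query: Dict[str, Any], time_filter: Dict[str, Any]) -> Dict[str, Any]:
    """Merge a range filter into an existing query using filter (non-scoring) context."""
    if not base_query:
        return time_filter
    if base_query.get("bool"):
        bool_query = base_query["bool"]
        existing = bool_query.get("filter")
        if existing is None:
            bool_query["filter"] = [time_filter]
        elif isinstance(existing, list):
            existing.append(time_filter)
        else:
            bool_query["filter"] = [existing, time_filter]
        return base_query
    # Non-bool query: wrap both clauses in a new bool filter
    return {"bool": {"filter": [base_query, time_filter]}}


query = {"bool": {"must": [{"term": {"event.category": "network"}}]}}
time_filter = {"range": {"@timestamp": {"gte": "now-1h"}}}
print(add_time_filter(query, time_filter))
```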
@@ -341,6 +399,9 @@ class OpenSearchOperations:
         # Add any additional parameters from kwargs
         search_body.update(kwargs)
 
+        # Store the complete search body for debugging
+        complete_opensearch_query = search_body.copy()
+
         # Build search parameters
         search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
 
@@ -408,43 +469,125 @@ class OpenSearchOperations:
 
         # Handle stats query results differently
         if is_stats_query:
-
-
+            if needs_phase2 and "stats_ast_for_post_processing" in locals():
+                # Stats query with post-processing - need to aggregate in memory
+                # First, get all documents and apply mutators
+                all_documents = []
+
+                # Handle scroll for large datasets
+                if scan_all or needs_phase2:
+                    # Use scroll to get all documents
+                    scroll_params = search_params.copy()
+                    scroll_params["scroll"] = scroll_timeout
+                    scroll_params["body"]["size"] = min(10000, scroll_size)
 
-
-
-
-
-
-
+                    try:
+                        # Initial search
+                        scroll_response = client.search(**scroll_params)
+                        scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        while scroll_hits:
+                            for hit in scroll_hits:
+                                all_documents.append(hit["_source"])
+
+                            scroll_id = scroll_response.get("_scroll_id")
+                            if not scroll_id:
+                                break
+
+                            scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+                            scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        # Clean up scroll
+                        if scroll_id:
+                            try:
+                                client.clear_scroll(scroll_id=scroll_id)
+                            except Exception:
+                                pass
+                    except Exception as e:
+                        raise TQLExecutionError(f"Failed to fetch documents for stats post-processing: {str(e)}")
+                else:
+                    # Fetch documents with regular pagination
+                    for hit in response.get("hits", {}).get("hits", []):
+                        all_documents.append(hit["_source"])
+
+                # Apply post-processing mutators
+                if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+                    processor = QueryPostProcessor()
+                    processed_docs = processor.process_results(
+                        all_documents, analysis_result.post_processing_requirements, track_enrichments=False
+                    )
+                    # Filter if needed
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    filtered_docs = all_documents
+
+                # Now perform in-memory aggregation
+                from ..stats_evaluator import TQLStatsEvaluator
+
+                stats_evaluator = TQLStatsEvaluator()
+
+                # Execute the stats aggregation in memory
+                stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
+
+                # Format response for stats-only (no documents)
+                result = {
+                    "stats": stats_results,
+                    "total": len(filtered_docs),
+                    "post_processing_applied": True,
+                    "health_status": "red",
+                    "health_reasons": [
+                        {
+                            "status": "red",
+                            "query_part": "stats with post-processing",
+                            "reason": f"Stats query required fetching {len(all_documents)} documents for post-processing",
+                        }
+                    ],
+                    "performance_impact": {
+                        "overhead_ms": 0,  # Would need timing to calculate
+                        "documents_processed": len(all_documents),
+                        "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+                    },
+                    "opensearch_query": complete_opensearch_query,
+                }
 
-
-            if stats_ast:
-                aggregations = stats_ast.get("aggregations", [])
-                group_by_fields = stats_ast.get("group_by", [])
+                return result
             else:
-
-
+                # Regular stats query using OpenSearch aggregations
+                aggs_response = response.get("aggregations", {})
 
-
-
-
+                # Format the stats results based on the test expectations
+                # Use the correct stats AST
+                if ast.get("type") == "query_with_stats":
+                    stats_ast = ast.get("stats")
+                else:
+                    stats_ast = ast
+
+                # Extract aggregation info
                 if stats_ast:
-
+                    aggregations = stats_ast.get("aggregations", [])
+                    group_by_fields = stats_ast.get("group_by", [])
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    aggregations = []
+                    group_by_fields = []
+
+                # Format results differently based on whether we have grouping
+                if group_by_fields:
+                    # Use the OpenSearchStatsTranslator to properly transform the response
+                    from ..opensearch_stats import OpenSearchStatsTranslator
+
+                    translator = OpenSearchStatsTranslator()
+
+                    # Transform the response using the translator
+                    transformed_response = translator.transform_response(response, stats_ast)
+
+                    # The transformed response already has the correct structure
+                    stats_results = transformed_response
+
+                    # Add viz_hint if present in stats AST
+                    if stats_ast and stats_ast.get("viz_hint"):
+                        stats_results["viz_hint"] = stats_ast["viz_hint"]
                 else:
                     # Simple aggregations without grouping
                     if aggregations:
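For stats queries whose mutators require post-processing, matching documents are now pulled back with the scroll API and aggregated in memory. A condensed sketch of that fetch loop with `opensearch-py` (connection details and index name are assumptions):

```python
from opensearchpy import OpenSearch

client = OpenSearch(hosts=["https://localhost:9200"])  # assumed connection details


def fetch_all_documents(index: str, query: dict, scroll_timeout: str = "2m", page_size: int = 1000) -> list:
    """Collect every matching _source via scroll, then release the scroll context."""
    documents = []
    response = client.search(index=index, body={"query": query, "size": page_size}, scroll=scroll_timeout)
    scroll_id = response.get("_scroll_id")
    hits = response["hits"]["hits"]
    while hits:
        documents.extend(hit["_source"] for hit in hits)
        response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = response.get("_scroll_id")
        hits = response["hits"]["hits"]
    if scroll_id:
        client.clear_scroll(scroll_id=scroll_id)
    return documents
```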
@@ -488,6 +631,10 @@ class OpenSearchOperations:
                             "values": value,
                             "group_by": [],
                         }
+
+                        # Add viz_hint if present in stats AST
+                        if stats_ast and stats_ast.get("viz_hint"):
+                            stats_results["viz_hint"] = stats_ast["viz_hint"]
                     else:
                         # Multiple aggregations
                         agg_results = {}
@@ -526,50 +673,143 @@ class OpenSearchOperations:
                             "type": "stats",
                             "results": agg_results,
                         }
+
+                        # Add viz_hint if present in stats AST
+                        if stats_ast and stats_ast.get("viz_hint"):
+                            stats_results["viz_hint"] = stats_ast["viz_hint"]
                 else:
                     stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
 
-            #
-
-
-
-
-                doc = hit["_source"].copy()
-                # Preserve metadata
-                if "_id" in hit:
-                    doc["_id"] = hit["_id"]
-                if "_score" in hit:
-                    doc["_score"] = hit["_score"]
-                documents.append(doc)
-
-            # Return in the expected format
+                # For stats queries, return only stats (no documents)
+                # Total from aggregation metadata or hit count
+                total_count = response.get("hits", {}).get("total", {}).get("value", 0)
+
+                # Return stats-only format
                 result = {
-                    "results": documents,
-                    "total": response.get("hits", {}).get("total", {}).get("value", 0),
                     "stats": stats_results,
+                    "total": total_count,
                     "post_processing_applied": False,
-                    "health_status": "
+                    "health_status": "green",
                     "health_reasons": [],
                     "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
-                    "
+                    "opensearch_query": complete_opensearch_query,
+                    "query_type": "stats",
                 }
 
-            # Add query_type if documents were requested
-            if size > 0:
-                result["query_type"] = "stats_with_docs"
-
                 return result
 
         # Extract hits for regular queries
-
+        initial_hits = response.get("hits", {}).get("hits", [])
         total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
 
         # Process results based on whether we need Phase 2
-        if needs_phase2:
-            #
+        if needs_phase2 and not scan_all:
+            # Pagination with post-processing - continue fetching pages until we get results
             processor = QueryPostProcessor()
+            results: List[Dict[str, Any]] = []
+            total_documents_before_filter = 0
+            total_documents_after_filter = 0
+            current_from = from_
+            pages_checked = 0
+            max_pages_to_check = min(10, (total_hits // size) + 1) if size > 0 else 1  # Limit to prevent infinite loops
+
+            while len(results) < size and pages_checked < max_pages_to_check and current_from < total_hits:
+                # Fetch current page
+                if pages_checked > 0:
+                    # Need to fetch next page
+                    search_params["body"]["from"] = current_from
+                    try:
+                        response = client.search(**search_params)
+                    except Exception as e:
+                        raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+                    current_hits = response.get("hits", {}).get("hits", [])
+                else:
+                    # Use initial hits for first page
+                    current_hits = initial_hits
+
+                if not current_hits:
+                    break  # No more results
+
+                # Process the hits from this page
+                documents = []
+                hit_metadata = []
+                for hit in current_hits:
+                    documents.append(hit["_source"])
+                    hit_metadata.append(
+                        {
+                            "_id": hit.get("_id"),
+                            "_score": hit.get("_score"),
+                            "_explanation": hit.get("_explanation") if explain else None,
+                        }
+                    )
+
+                total_documents_before_filter += len(documents)
+
+                # Apply post-processing
+                if isinstance(analysis_result, MutatorAnalysisResult):
+                    processed_docs = processor.process_results(
+                        documents,
+                        analysis_result.post_processing_requirements,
+                        track_enrichments=kwargs.get("save_enrichment", False),
+                    )
 
-
+                    # Filter results
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    processed_docs = documents
+                    filtered_docs = documents
+
+                # Add filtered results with metadata
+                for doc in filtered_docs:
+                    if len(results) >= size:
+                        break  # We have enough results
+
+                    # Find the original hit metadata
+                    for i, orig_doc in enumerate(documents):
+                        if orig_doc == doc or self._docs_match(orig_doc, doc):
+                            # Add metadata
+                            if hit_metadata[i]["_id"]:
+                                doc["_id"] = hit_metadata[i]["_id"]
+                            if hit_metadata[i]["_score"]:
+                                doc["_score"] = hit_metadata[i]["_score"]
+                            if hit_metadata[i]["_explanation"]:
+                                doc["_explanation"] = hit_metadata[i]["_explanation"]
+                            break
+                    results.append(doc)
+
+                total_documents_after_filter += len(filtered_docs)
+
+                # Move to next page
+                current_from += size
+                pages_checked += 1
+
+            # Store filtering stats
+            pagination_stats = {
+                "page_size": size,
+                "pages_checked": pages_checked,
+                "documents_retrieved": total_documents_before_filter,
+                "documents_returned": len(results),
+                "documents_filtered": total_documents_before_filter - total_documents_after_filter,
+                "filter_rate": (
+                    (
+                        (total_documents_before_filter - total_documents_after_filter)
+                        / total_documents_before_filter
+                        * 100
+                    )
+                    if total_documents_before_filter > 0
+                    else 0
+                ),
+                "actual_from": from_,  # Original from
+                "actual_to": current_from,  # Where we ended up searching to
+            }
+
+        elif needs_phase2 and scan_all:
+            # scan_all mode with post-processing - process all results
+            processor = QueryPostProcessor()
+
+            # Extract all documents from hits
             documents = []
             hit_metadata = []
             for hit in hits:
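When post-processing filters documents out of a page, the new pagination path keeps fetching further pages (up to a capped count) until the requested page is full, and records how much was filtered. A simplified sketch of that loop and its filter-rate bookkeeping (the helper names here are hypothetical):

```python
from typing import Callable, Dict, List


def fill_page(
    fetch_page: Callable[[int, int], List[dict]],
    post_filter: Callable[[List[dict]], List[dict]],
    from_: int,
    size: int,
    total_hits: int,
    max_pages: int = 10,
) -> Dict[str, object]:
    """Fetch pages until `size` documents survive post-filtering or the page cap is reached."""
    results: List[dict] = []
    retrieved = kept = pages = 0
    current_from = from_
    while len(results) < size and pages < max_pages and current_from < total_hits:
        page = fetch_page(current_from, size)
        if not page:
            break
        retrieved += len(page)
        survivors = post_filter(page)
        kept += len(survivors)
        results.extend(survivors[: size - len(results)])
        current_from += size
        pages += 1
    filter_rate = ((retrieved - kept) / retrieved * 100) if retrieved else 0.0
    return {
        "results": results,
        "pages_checked": pages,
        "documents_retrieved": retrieved,
        "documents_filtered": retrieved - kept,
        "filter_rate": filter_rate,
        "actual_to": current_from,
    }


# Toy usage: every other document survives the post-filter
pages = [[{"n": i} for i in range(start, start + 5)] for start in range(0, 50, 5)]
out = fill_page(
    fetch_page=lambda frm, sz: pages[frm // 5] if frm // 5 < len(pages) else [],
    post_filter=lambda docs: [d for d in docs if d["n"] % 2 == 0],
    from_=0,
    size=5,
    total_hits=50,
)
print(out["pages_checked"], f"{out['filter_rate']:.1f}%")  # 2 pages fetched, 50.0% filtered
```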
@@ -590,10 +830,9 @@ class OpenSearchOperations:
                     track_enrichments=kwargs.get("save_enrichment", False),
                 )
 
-            # Then filter results based on requirements
+                # Then filter results based on requirements
                 filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
             else:
-                # No post-processing needed
                 processed_docs = documents
                 filtered_docs = documents
 
@@ -601,7 +840,6 @@ class OpenSearchOperations:
             results = []
             for doc in filtered_docs:
                 # Find the original hit metadata for this document
-                # This is a simple approach - in production you might want to track IDs
                 for i, orig_doc in enumerate(documents):
                     if orig_doc == doc or self._docs_match(orig_doc, doc):
                         # Add metadata
@@ -613,9 +851,17 @@ class OpenSearchOperations:
                             doc["_explanation"] = hit_metadata[i]["_explanation"]
                         break
                 results.append(doc)
+
+            pagination_stats = {
+                "documents_scanned": len(documents),
+                "documents_passed": len(results),
+                "filter_rate": (len(results) / len(documents) * 100) if documents else 0,
+            }
+
         else:
             # No Phase 2 needed, just extract documents
             results = []
+            hits = initial_hits  # Use the initial hits
             for hit in hits:
                 doc = hit["_source"].copy()
                 # Preserve metadata
@@ -627,6 +873,8 @@ class OpenSearchOperations:
                     doc["_explanation"] = hit["explanation"]
                 results.append(doc)
 
+            pagination_stats = None
+
         # Return raw response if requested
         if kwargs.get("raw_response", False):
             return {
@@ -688,8 +936,8 @@ class OpenSearchOperations:
             "performance_impact": performance_impact,
             "optimizations_applied": [],  # TODO: Track actual optimizations  # noqa: W0511
             "opensearch_query": (
-
-            ), # Include the query
+                complete_opensearch_query if "complete_opensearch_query" in locals() else {}
+            ),  # Include the full query body
             "time_range": time_range,
             "timestamp_field": timestamp_field,
             "query_type": "regular",  # Regular query (not stats)
@@ -703,19 +951,57 @@ class OpenSearchOperations:
             },
         }
 
+        # Add pagination stats if available
+        if pagination_stats:
+            result["post_processing_stats"] = pagination_stats
+
         # Add pagination info for non-scan queries
         if not scan_all:
-
+            # Cap displayed total at 10000 for consistency
+            displayed_total = min(opensearch_total, 10000)
+
+            pagination_info = {
                 "size": size,
-                "
-                "
+                "from": from_,
+                "total": displayed_total,
+                "actual_total": opensearch_total,  # Real total for reference
+                "returned": len(results),
             }
 
-
-
-
-
-
+            if needs_phase2 and pagination_stats:
+                # Post-processing was applied - update pagination to reflect auto-pagination
+                actual_last_position = pagination_stats.get("actual_to", from_ + size)
+
+                # Update from to reflect where we actually searched to
+                if pagination_stats["pages_checked"] > 1:
+                    # We auto-paginated, so update the effective "from" position
+                    pagination_info["from"] = from_
+                    pagination_info["actual_from_searched"] = from_
+                    pagination_info["actual_to_searched"] = actual_last_position
+                    pagination_info["auto_paginated"] = True
+                    pagination_info["pages_auto_fetched"] = pagination_stats["pages_checked"]
+
+                # Has more if we haven't reached the 10000 limit
+                pagination_info["has_more"] = actual_last_position < 10000 and actual_last_position < opensearch_total
+                pagination_info["documents_retrieved"] = pagination_stats["documents_retrieved"]
+                pagination_info["documents_filtered"] = pagination_stats["documents_filtered"]
+                pagination_info["filter_rate"] = f"{pagination_stats['filter_rate']:.1f}%"
+
+                # Calculate the last valid page number (page that contains the 10,000th record)
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+            else:
+                # Regular pagination without post-processing
+                # Has more if we got full page and haven't reached 10000 limit
+                pagination_info["has_more"] = len(initial_hits) == size and (from_ + size < 10000)
+
+                # Calculate the last valid page number
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+
+            result["pagination"] = pagination_info
 
         return result
 
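The pagination metadata caps the advertised total at the 10,000-hit window and derives page numbers from it. A small sketch of that arithmetic (assumes `size > 0`; not code from the package):

```python
def pagination_summary(from_: int, size: int, opensearch_total: int, returned: int) -> dict:
    """Mirror the pagination bookkeeping: reported totals are capped at the 10,000-result window."""
    displayed_total = min(opensearch_total, 10000)
    last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
    return {
        "size": size,
        "from": from_,
        "total": displayed_total,
        "actual_total": opensearch_total,
        "returned": returned,
        "current_page": from_ // size,
        "last_page": last_page,
        "has_more": returned == size and (from_ + size < 10000),
    }


print(pagination_summary(from_=9500, size=500, opensearch_total=250000, returned=500))
# total is reported as 10000 and has_more is False: the result window is exhausted
```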
@@ -740,7 +1026,7 @@ class OpenSearchOperations:
     def _extract_grouped_buckets(  # noqa: C901
         self,
         aggs_response: Dict[str, Any],
-        group_by_fields: List[
+        group_by_fields: List[Any],
         aggregations: List[Dict[str, Any]],
         stats_ast: Dict[str, Any],
     ) -> List[Dict[str, Any]]:
@@ -748,7 +1034,7 @@ class OpenSearchOperations:
 
         Args:
             aggs_response: OpenSearch aggregations response
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (can be strings or dicts)
             aggregations: List of aggregation specifications
             stats_ast: The stats AST for reference
 
@@ -757,9 +1043,19 @@ class OpenSearchOperations:
         """
         buckets = []
 
+        # Normalize group_by_fields to extract field names
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                normalized_fields.append(field["field"])
+            else:
+                normalized_fields.append(str(field))
+
         # For single-level grouping
-        if len(
-            field =
+        if len(normalized_fields) == 1:
+            field = normalized_fields[0]
             # Look for the terms aggregation with the group field name
             terms_agg_name = f"group_by_{field}"
 
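Group-by entries may now be plain field names or dicts carrying a `field` key, so they are normalized to bare strings before the `group_by_<field>` aggregations are looked up. The same normalization as a standalone helper (the sample field names are illustrative):

```python
from typing import Any, List


def normalize_group_by_fields(group_by_fields: List[Any]) -> List[str]:
    """Reduce mixed string/dict group-by specs to bare field names."""
    normalized: List[str] = []
    for field in group_by_fields:
        if isinstance(field, str):
            normalized.append(field)
        elif isinstance(field, dict) and "field" in field:
            normalized.append(field["field"])
        else:
            normalized.append(str(field))
    return normalized


print(normalize_group_by_fields(["source.ip", {"field": "destination.port", "size": 10}]))
# -> ['source.ip', 'destination.port']
```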
@@ -811,6 +1107,16 @@ class OpenSearchOperations:
                         bucket_result[output_key] = agg_value["value"]
                     else:
                         bucket_result[output_key] = agg_value
+                else:
+                    # For count(*), also check doc_count
+                    if func == "count" and field_name == "*":
+                        bucket_result[output_key] = bucket.get("doc_count", 0)
+                    else:
+                        # Try to find any aggregation value in the bucket
+                        for key, value in bucket.items():
+                            if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                bucket_result[output_key] = value["value"]
+                                break
 
                 buckets.append(bucket_result)
 
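The new fallback lets `count(*)` read the bucket's own `doc_count` when no metric sub-aggregation is present, and other functions fall back to any `<func>_...` sub-aggregation found in the bucket. A sketch of that lookup (the `count_star` aggregation name in the usage line is hypothetical):

```python
from typing import Any, Dict, Optional


def extract_metric(bucket: Dict[str, Any], func: str, field_name: str, agg_name: str) -> Optional[Any]:
    """Pull a metric out of an OpenSearch terms bucket, with count(*) falling back to doc_count."""
    agg_value = bucket.get(agg_name)
    if isinstance(agg_value, dict) and "value" in agg_value:
        return agg_value["value"]
    if agg_value is not None:
        return agg_value
    if func == "count" and field_name == "*":
        return bucket.get("doc_count", 0)
    # Last resort: any sub-aggregation whose name starts with the function name
    for key, value in bucket.items():
        if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
            return value["value"]
    return None


bucket = {"key": "10.0.0.5", "doc_count": 42}
print(extract_metric(bucket, "count", "*", "count_star"))  # -> 42
```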
@@ -820,7 +1126,7 @@ class OpenSearchOperations:
             current_agg = aggs_response
 
             # Find the first group_by aggregation
-            for field in
+            for field in normalized_fields:
                 group_key = f"group_by_{field}"
                 if group_key in current_agg:
                     current_agg = current_agg[group_key]
@@ -831,7 +1137,7 @@ class OpenSearchOperations:
 
             # Process nested buckets recursively
             if "buckets" in current_agg:
-                buckets = self._process_nested_buckets(current_agg["buckets"],
+                buckets = self._process_nested_buckets(current_agg["buckets"], normalized_fields, aggregations, 0)
 
         return buckets
 
@@ -846,7 +1152,7 @@ class OpenSearchOperations:
 
         Args:
             buckets_data: List of bucket data from OpenSearch
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (already normalized to strings)
            aggregations: List of aggregation specifications
             level: Current nesting level (0-based)
 
@@ -910,6 +1216,16 @@ class OpenSearchOperations:
                         result[output_key] = agg_value["value"]
                     else:
                         result[output_key] = agg_value
+                else:
+                    # For count(*), also check doc_count
+                    if func == "count" and field_name == "*":
+                        result[output_key] = bucket.get("doc_count", 0)
+                    else:
+                        # Try to find any aggregation value in the bucket
+                        for key, value in bucket.items():
+                            if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                result[output_key] = value["value"]
+                                break
 
             results.append(result)
 