tellaro-query-language 0.1.9__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellaro_query_language-0.1.9.dist-info → tellaro_query_language-0.2.1.dist-info}/METADATA +23 -1
- {tellaro_query_language-0.1.9.dist-info → tellaro_query_language-0.2.1.dist-info}/RECORD +26 -26
- tql/core.py +230 -36
- tql/core_components/opensearch_operations.py +413 -90
- tql/core_components/stats_operations.py +11 -1
- tql/evaluator.py +39 -2
- tql/evaluator_components/special_expressions.py +25 -6
- tql/evaluator_components/value_comparison.py +31 -3
- tql/mutator_analyzer.py +640 -242
- tql/mutators/__init__.py +5 -1
- tql/mutators/dns.py +76 -53
- tql/mutators/security.py +101 -100
- tql/mutators/string.py +74 -0
- tql/opensearch_components/field_mapping.py +9 -3
- tql/opensearch_components/lucene_converter.py +12 -0
- tql/opensearch_components/query_converter.py +134 -25
- tql/opensearch_stats.py +170 -39
- tql/parser.py +92 -37
- tql/parser_components/ast_builder.py +37 -1
- tql/parser_components/field_extractor.py +9 -1
- tql/parser_components/grammar.py +32 -8
- tql/post_processor.py +489 -31
- tql/stats_evaluator.py +170 -12
- {tellaro_query_language-0.1.9.dist-info → tellaro_query_language-0.2.1.dist-info}/LICENSE +0 -0
- {tellaro_query_language-0.1.9.dist-info → tellaro_query_language-0.2.1.dist-info}/WHEEL +0 -0
- {tellaro_query_language-0.1.9.dist-info → tellaro_query_language-0.2.1.dist-info}/entry_points.txt +0 -0
@@ -45,11 +45,20 @@ class OpenSearchOperations:
         # Parse the query
         ast = self.parser.parse(query)

+        # Analyze the query for mutators
+        from ..mutator_analyzer import MutatorAnalyzer
+
+        analyzer = MutatorAnalyzer(self.field_mappings)
+        analysis_result = analyzer.analyze_ast(ast, context="opensearch")
+
+        # Use the optimized AST (with array operators removed)
+        optimized_ast = analysis_result.optimized_ast
+
         # Create OpenSearch backend
         backend = OpenSearchBackend(field_mappings=self.field_mappings)

-        # Convert to OpenSearch query
-        opensearch_query = backend.convert(
+        # Convert to OpenSearch query using the optimized AST
+        opensearch_query = backend.convert(optimized_ast)

         return opensearch_query

@@ -96,7 +105,7 @@ class OpenSearchOperations:
         }

         # Create analyzer
-        analyzer = MutatorAnalyzer(self.
+        analyzer = MutatorAnalyzer(self.field_mappings)

         # Analyze the AST
         return analyzer.analyze_ast(ast)
@@ -175,7 +184,7 @@ class OpenSearchOperations:
             query: TQL query string
             index: OpenSearch index name (uses environment variable if not provided)
             size: Maximum number of results to return (default: 10000)
-            from_:
+            from_: Starting offset for pagination (max 10000 - size)
             sort: List of sort specifications
             source_includes: Fields to include in response
             source_excludes: Fields to exclude from response
@@ -226,6 +235,13 @@ class OpenSearchOperations:
         is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]

         if is_stats_query:
+            # Analyze the query to check for mutators
+            analysis_result = self.analyze_opensearch_query(query)
+            has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+            needs_post_processing_for_stats = (
+                has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+            )
+
             # Handle stats queries differently
             from ..opensearch_stats import OpenSearchStatsTranslator

@@ -240,7 +256,11 @@ class OpenSearchOperations:
                 # Convert filter to OpenSearch query
                 backend = OpenSearchBackend(field_mappings=self.field_mappings)
                 if filter_ast:
-
+                    # Use the optimized AST if we have mutators
+                    if has_mutators and needs_post_processing_for_stats:
+                        filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+                    else:
+                        filter_query = backend.convert(filter_ast)["query"]
                 else:
                     filter_query = {"match_all": {}}
             else:
@@ -248,19 +268,27 @@ class OpenSearchOperations:
                 stats_ast = ast
                 filter_query = {"match_all": {}}

-            #
-            if
-
+            # For stats queries with post-processing mutators, we need to handle them differently
+            if needs_post_processing_for_stats:
+                # We'll need to fetch all documents and aggregate in memory
+                opensearch_query = {"query": filter_query}
+                needs_phase2 = True
+                # Store the stats AST for later processing
+                stats_ast_for_post_processing = stats_ast
             else:
-
+                # Build aggregations for direct OpenSearch execution
+                if stats_ast:
+                    stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+                else:
+                    stats_result = {"aggs": {}}

-
-
+                # Extract the aggregations (translate_stats returns {"aggs": {...}})
+                aggregations = stats_result.get("aggs", {})

-
-
-
-
+                # Build the complete query
+                opensearch_query = {"query": filter_query, "aggs": aggregations}
+                needs_phase2 = False
+                stats_ast_for_post_processing = None
         else:
             # Parse and analyze the query normally
             analysis_result = self.analyze_opensearch_query(query)
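The hunk above splits stats execution into two paths: if any mutator must run client-side, only the filter is sent to OpenSearch and aggregation happens later in memory; otherwise the aggregations are pushed down. A minimal sketch of that decision, with a hypothetical helper name and illustrative inputs:

```python
# Hypothetical helper (not part of the package) showing the branch introduced above.
from typing import Any, Dict, Tuple


def plan_stats_query(
    filter_query: Dict[str, Any],
    aggregations: Dict[str, Any],
    needs_post_processing: bool,
) -> Tuple[Dict[str, Any], bool]:
    """Return (opensearch_query, needs_phase2)."""
    if needs_post_processing:
        # Mutators must run client-side, so only fetch matching documents;
        # aggregation happens later in memory.
        return {"query": filter_query}, True
    # No mutators: push the aggregations down to OpenSearch.
    return {"query": filter_query, "aggs": aggregations}, False


query, phase2 = plan_stats_query({"match_all": {}}, {"count_all": {"value_count": {"field": "_id"}}}, False)
print(phase2, list(query.get("aggs", {})))  # False ['count_all']
```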
@@ -316,17 +344,49 @@ class OpenSearchOperations:
             base_query = search_body.get("query", {})
             time_filter = {"range": {timestamp_field: time_range}}

-            # Wrap the existing query with time filter
+            # Wrap the existing query with time filter in filter context
             if base_query:
-
+                # If the base query is already a bool query, add to its filter array
+                if isinstance(base_query, dict) and base_query.get("bool"):
+                    bool_query = base_query["bool"]
+                    if "filter" in bool_query:
+                        # Add to existing filter array
+                        if isinstance(bool_query["filter"], list):
+                            bool_query["filter"].append(time_filter)
+                        else:
+                            # Convert single filter to array
+                            bool_query["filter"] = [bool_query["filter"], time_filter]
+                    else:
+                        # No filter array yet, create one
+                        bool_query["filter"] = [time_filter]
+                    search_body["query"] = base_query
+                else:
+                    # Wrap in bool query with filter
+                    search_body["query"] = {"bool": {"filter": [base_query, time_filter]}}
             else:
                 search_body["query"] = time_filter

-
+        # For stats queries, set size based on whether we need documents for post-processing
+        if is_stats_query:
+            if needs_phase2:
+                # Need all documents for post-processing
+                search_body.update({"size": 10000, "track_total_hits": track_total_hits})
+            else:
+                # Pure aggregation query - no documents needed
+                search_body.update({"size": 0, "track_total_hits": track_total_hits})
+        else:
+            search_body.update({"size": size, "track_total_hits": track_total_hits})

         # Add optional parameters
         if sort:
             search_body["sort"] = sort
+
+        # Add from parameter for pagination (limit to 10000 total)
+        if from_ > 0:
+            # Ensure we don't exceed the 10000 limit
+            max_allowed_from = 10000 - size
+            from_ = min(from_, max_allowed_from)
+            search_body["from"] = from_
         if source_includes or source_excludes:
             search_body["_source"] = {}
             if source_includes:
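The merging logic above adds the time range in filter context so it does not affect scoring, and the new `from_` handling caps offsets at OpenSearch's default 10,000-result window. A standalone, illustrative re-implementation of the same merging rules (helper name is not from the package):

```python
# Illustrative sketch of the time-filter merging shown in the hunk above.
from typing import Any, Dict


def add_time_filter(base_query: Dict[str, Any], time_filter: Dict[str, Any]) -> Dict[str, Any]:
    """Attach a range filter in filter context without disturbing scoring clauses."""
    if not base_query:
        return time_filter
    if isinstance(base_query, dict) and base_query.get("bool"):
        bool_query = base_query["bool"]
        existing = bool_query.get("filter")
        if existing is None:
            bool_query["filter"] = [time_filter]
        elif isinstance(existing, list):
            existing.append(time_filter)
        else:
            bool_query["filter"] = [existing, time_filter]
        return base_query
    return {"bool": {"filter": [base_query, time_filter]}}


q = {"bool": {"must": [{"term": {"status": "active"}}]}}
t = {"range": {"@timestamp": {"gte": "now-1h"}}}
print(add_time_filter(q, t))
# -> {'bool': {'must': [...], 'filter': [{'range': {'@timestamp': {'gte': 'now-1h'}}}]}}

# Pagination offsets are capped so from_ + size never exceeds the 10,000-result window:
size, from_ = 100, 9950
from_ = min(from_, 10000 - size)  # -> 9900
```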
@@ -339,6 +399,9 @@ class OpenSearchOperations:
         # Add any additional parameters from kwargs
         search_body.update(kwargs)

+        # Store the complete search body for debugging
+        complete_opensearch_query = search_body.copy()
+
         # Build search parameters
         search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}

@@ -406,43 +469,125 @@ class OpenSearchOperations:

         # Handle stats query results differently
         if is_stats_query:
-
-
+            if needs_phase2 and "stats_ast_for_post_processing" in locals():
+                # Stats query with post-processing - need to aggregate in memory
+                # First, get all documents and apply mutators
+                all_documents = []
+
+                # Handle scroll for large datasets
+                if scan_all or needs_phase2:
+                    # Use scroll to get all documents
+                    scroll_params = search_params.copy()
+                    scroll_params["scroll"] = scroll_timeout
+                    scroll_params["body"]["size"] = min(10000, scroll_size)

-
-
-
-
-
-
+                    try:
+                        # Initial search
+                        scroll_response = client.search(**scroll_params)
+                        scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        while scroll_hits:
+                            for hit in scroll_hits:
+                                all_documents.append(hit["_source"])
+
+                            scroll_id = scroll_response.get("_scroll_id")
+                            if not scroll_id:
+                                break
+
+                            scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+                            scroll_hits = scroll_response.get("hits", {}).get("hits", [])
+
+                        # Clean up scroll
+                        if scroll_id:
+                            try:
+                                client.clear_scroll(scroll_id=scroll_id)
+                            except Exception:
+                                pass
+                    except Exception as e:
+                        raise TQLExecutionError(f"Failed to fetch documents for stats post-processing: {str(e)}")
+                else:
+                    # Fetch documents with regular pagination
+                    for hit in response.get("hits", {}).get("hits", []):
+                        all_documents.append(hit["_source"])
+
+                # Apply post-processing mutators
+                if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+                    processor = QueryPostProcessor()
+                    processed_docs = processor.process_results(
+                        all_documents, analysis_result.post_processing_requirements, track_enrichments=False
+                    )
+                    # Filter if needed
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    filtered_docs = all_documents
+
+                # Now perform in-memory aggregation
+                from ..stats_evaluator import TQLStatsEvaluator
+
+                stats_evaluator = TQLStatsEvaluator()
+
+                # Execute the stats aggregation in memory
+                stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
+
+                # Format response for stats-only (no documents)
+                result = {
+                    "stats": stats_results,
+                    "total": len(filtered_docs),
+                    "post_processing_applied": True,
+                    "health_status": "red",
+                    "health_reasons": [
+                        {
+                            "status": "red",
+                            "query_part": "stats with post-processing",
+                            "reason": f"Stats query required fetching {len(all_documents)} documents for post-processing",
+                        }
+                    ],
+                    "performance_impact": {
+                        "overhead_ms": 0,  # Would need timing to calculate
+                        "documents_processed": len(all_documents),
+                        "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+                    },
+                    "opensearch_query": complete_opensearch_query,
+                }

-
-            if stats_ast:
-                aggregations = stats_ast.get("aggregations", [])
-                group_by_fields = stats_ast.get("group_by", [])
+                return result
             else:
-
-
+                # Regular stats query using OpenSearch aggregations
+                aggs_response = response.get("aggregations", {})

-
-
-
+                # Format the stats results based on the test expectations
+                # Use the correct stats AST
+                if ast.get("type") == "query_with_stats":
+                    stats_ast = ast.get("stats")
+                else:
+                    stats_ast = ast
+
+                # Extract aggregation info
                 if stats_ast:
-
+                    aggregations = stats_ast.get("aggregations", [])
+                    group_by_fields = stats_ast.get("group_by", [])
                 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    aggregations = []
+                    group_by_fields = []
+
+                # Format results differently based on whether we have grouping
+                if group_by_fields:
+                    # Use the OpenSearchStatsTranslator to properly transform the response
+                    from ..opensearch_stats import OpenSearchStatsTranslator
+
+                    translator = OpenSearchStatsTranslator()
+
+                    # Transform the response using the translator
+                    transformed_response = translator.transform_response(response, stats_ast)
+
+                    # The transformed response already has the correct structure
+                    stats_results = transformed_response
+
+                    # Add viz_hint if present in stats AST
+                    if stats_ast and stats_ast.get("viz_hint"):
+                        stats_results["viz_hint"] = stats_ast["viz_hint"]
                 else:
                     # Simple aggregations without grouping
                     if aggregations:
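The post-processing path above collects every matching document with the scroll API before aggregating in memory. A condensed sketch of that scroll loop, assuming an opensearch-py style client (the helper name is illustrative, not part of the package):

```python
# Illustrative scroll loop, assuming an opensearch-py style client.
from typing import Any, Dict, List


def fetch_all_documents(client, index: str, query: Dict[str, Any],
                        scroll_timeout: str = "2m", page_size: int = 1000) -> List[Dict[str, Any]]:
    docs: List[Dict[str, Any]] = []
    response = client.search(index=index, body={"query": query, "size": page_size}, scroll=scroll_timeout)
    scroll_id = response.get("_scroll_id")
    hits = response.get("hits", {}).get("hits", [])
    try:
        while hits:
            docs.extend(hit["_source"] for hit in hits)
            response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
            scroll_id = response.get("_scroll_id")
            hits = response.get("hits", {}).get("hits", [])
    finally:
        # Always release the scroll context, even if a page fails mid-way.
        if scroll_id:
            try:
                client.clear_scroll(scroll_id=scroll_id)
            except Exception:
                pass
    return docs
```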
@@ -486,6 +631,10 @@ class OpenSearchOperations:
                                 "values": value,
                                 "group_by": [],
                             }
+
+                            # Add viz_hint if present in stats AST
+                            if stats_ast and stats_ast.get("viz_hint"):
+                                stats_results["viz_hint"] = stats_ast["viz_hint"]
                         else:
                             # Multiple aggregations
                             agg_results = {}
@@ -524,50 +673,143 @@ class OpenSearchOperations:
                                 "type": "stats",
                                 "results": agg_results,
                             }
+
+                            # Add viz_hint if present in stats AST
+                            if stats_ast and stats_ast.get("viz_hint"):
+                                stats_results["viz_hint"] = stats_ast["viz_hint"]
                     else:
                         stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}

-            #
-
-
-
-
-                doc = hit["_source"].copy()
-                # Preserve metadata
-                if "_id" in hit:
-                    doc["_id"] = hit["_id"]
-                if "_score" in hit:
-                    doc["_score"] = hit["_score"]
-                documents.append(doc)
-
-            # Return in the expected format
+            # For stats queries, return only stats (no documents)
+            # Total from aggregation metadata or hit count
+            total_count = response.get("hits", {}).get("total", {}).get("value", 0)
+
+            # Return stats-only format
             result = {
-                "results": documents,
-                "total": response.get("hits", {}).get("total", {}).get("value", 0),
                 "stats": stats_results,
+                "total": total_count,
                 "post_processing_applied": False,
-                "health_status": "
+                "health_status": "green",
                 "health_reasons": [],
                 "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
-                "
+                "opensearch_query": complete_opensearch_query,
+                "query_type": "stats",
             }

-            # Add query_type if documents were requested
-            if size > 0:
-                result["query_type"] = "stats_with_docs"
-
             return result

         # Extract hits for regular queries
-
+        initial_hits = response.get("hits", {}).get("hits", [])
         total_hits = response.get("hits", {}).get("total", {}).get("value", 0)

         # Process results based on whether we need Phase 2
-        if needs_phase2:
-            #
+        if needs_phase2 and not scan_all:
+            # Pagination with post-processing - continue fetching pages until we get results
+            processor = QueryPostProcessor()
+            results: List[Dict[str, Any]] = []
+            total_documents_before_filter = 0
+            total_documents_after_filter = 0
+            current_from = from_
+            pages_checked = 0
+            max_pages_to_check = min(10, (total_hits // size) + 1) if size > 0 else 1  # Limit to prevent infinite loops
+
+            while len(results) < size and pages_checked < max_pages_to_check and current_from < total_hits:
+                # Fetch current page
+                if pages_checked > 0:
+                    # Need to fetch next page
+                    search_params["body"]["from"] = current_from
+                    try:
+                        response = client.search(**search_params)
+                    except Exception as e:
+                        raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+                    current_hits = response.get("hits", {}).get("hits", [])
+                else:
+                    # Use initial hits for first page
+                    current_hits = initial_hits
+
+                if not current_hits:
+                    break  # No more results
+
+                # Process the hits from this page
+                documents = []
+                hit_metadata = []
+                for hit in current_hits:
+                    documents.append(hit["_source"])
+                    hit_metadata.append(
+                        {
+                            "_id": hit.get("_id"),
+                            "_score": hit.get("_score"),
+                            "_explanation": hit.get("_explanation") if explain else None,
+                        }
+                    )
+
+                total_documents_before_filter += len(documents)
+
+                # Apply post-processing
+                if isinstance(analysis_result, MutatorAnalysisResult):
+                    processed_docs = processor.process_results(
+                        documents,
+                        analysis_result.post_processing_requirements,
+                        track_enrichments=kwargs.get("save_enrichment", False),
+                    )
+
+                    # Filter results
+                    filtered_docs = processor.filter_results(
+                        processed_docs, analysis_result.post_processing_requirements
+                    )
+                else:
+                    processed_docs = documents
+                    filtered_docs = documents
+
+                # Add filtered results with metadata
+                for doc in filtered_docs:
+                    if len(results) >= size:
+                        break  # We have enough results
+
+                    # Find the original hit metadata
+                    for i, orig_doc in enumerate(documents):
+                        if orig_doc == doc or self._docs_match(orig_doc, doc):
+                            # Add metadata
+                            if hit_metadata[i]["_id"]:
+                                doc["_id"] = hit_metadata[i]["_id"]
+                            if hit_metadata[i]["_score"]:
+                                doc["_score"] = hit_metadata[i]["_score"]
+                            if hit_metadata[i]["_explanation"]:
+                                doc["_explanation"] = hit_metadata[i]["_explanation"]
+                            break
+                    results.append(doc)
+
+                total_documents_after_filter += len(filtered_docs)
+
+                # Move to next page
+                current_from += size
+                pages_checked += 1
+
+            # Store filtering stats
+            pagination_stats = {
+                "page_size": size,
+                "pages_checked": pages_checked,
+                "documents_retrieved": total_documents_before_filter,
+                "documents_returned": len(results),
+                "documents_filtered": total_documents_before_filter - total_documents_after_filter,
+                "filter_rate": (
+                    (
+                        (total_documents_before_filter - total_documents_after_filter)
+                        / total_documents_before_filter
+                        * 100
+                    )
+                    if total_documents_before_filter > 0
+                    else 0
+                ),
+                "actual_from": from_,  # Original from
+                "actual_to": current_from,  # Where we ended up searching to
+            }
+
+        elif needs_phase2 and scan_all:
+            # scan_all mode with post-processing - process all results
             processor = QueryPostProcessor()

-            # Extract documents from hits
+            # Extract all documents from hits
             documents = []
             hit_metadata = []
             for hit in hits:
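The new `needs_phase2 and not scan_all` branch auto-paginates: it keeps fetching pages until enough documents survive the client-side mutator filter, then reports what it fetched and discarded. A generic sketch of that pattern, with hypothetical callables standing in for the OpenSearch fetch and the post-processing filter:

```python
# Generic auto-pagination sketch (names and callables are illustrative).
from typing import Any, Callable, Dict, List


def paginate_with_post_filter(
    fetch_page: Callable[[int, int], List[Dict[str, Any]]],  # (from_, size) -> page of documents
    keep: Callable[[Dict[str, Any]], bool],                  # client-side post-processing filter
    from_: int,
    size: int,
    max_pages: int = 10,
) -> Dict[str, Any]:
    results: List[Dict[str, Any]] = []
    retrieved = 0
    passed = 0
    current_from = from_
    pages_checked = 0
    while len(results) < size and pages_checked < max_pages:
        page = fetch_page(current_from, size)
        if not page:
            break  # no more documents
        retrieved += len(page)
        for doc in page:
            if keep(doc):
                passed += 1
                if len(results) < size:
                    results.append(doc)
        current_from += size
        pages_checked += 1
    return {
        "results": results,
        "post_processing_stats": {
            "page_size": size,
            "pages_checked": pages_checked,
            "documents_retrieved": retrieved,
            "documents_returned": len(results),
            "documents_filtered": retrieved - passed,
            "filter_rate": ((retrieved - passed) / retrieved * 100) if retrieved else 0,
            "actual_from": from_,
            "actual_to": current_from,
        },
    }


# Example: four pages of five documents each, keeping only even-numbered documents.
pages = [[{"n": i} for i in range(p * 5, p * 5 + 5)] for p in range(4)]
out = paginate_with_post_filter(
    lambda f, s: pages[f // 5] if 0 <= f // 5 < len(pages) else [],
    lambda d: d["n"] % 2 == 0,
    from_=0,
    size=5,
)
print(len(out["results"]), out["post_processing_stats"]["pages_checked"])  # 5 2
```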
@@ -588,10 +830,9 @@ class OpenSearchOperations:
                     track_enrichments=kwargs.get("save_enrichment", False),
                 )

-                # Then filter results based on requirements
+                # Then filter results based on requirements
                 filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
             else:
-                # No post-processing needed
                 processed_docs = documents
                 filtered_docs = documents

@@ -599,7 +840,6 @@ class OpenSearchOperations:
             results = []
             for doc in filtered_docs:
                 # Find the original hit metadata for this document
-                # This is a simple approach - in production you might want to track IDs
                 for i, orig_doc in enumerate(documents):
                     if orig_doc == doc or self._docs_match(orig_doc, doc):
                         # Add metadata
@@ -611,9 +851,17 @@ class OpenSearchOperations:
                             doc["_explanation"] = hit_metadata[i]["_explanation"]
                         break
                 results.append(doc)
+
+            pagination_stats = {
+                "documents_scanned": len(documents),
+                "documents_passed": len(results),
+                "filter_rate": (len(results) / len(documents) * 100) if documents else 0,
+            }
+
         else:
             # No Phase 2 needed, just extract documents
             results = []
+            hits = initial_hits  # Use the initial hits
             for hit in hits:
                 doc = hit["_source"].copy()
                 # Preserve metadata
@@ -625,6 +873,8 @@ class OpenSearchOperations:
                     doc["_explanation"] = hit["explanation"]
                 results.append(doc)

+            pagination_stats = None
+
         # Return raw response if requested
         if kwargs.get("raw_response", False):
             return {
@@ -686,8 +936,8 @@ class OpenSearchOperations:
             "performance_impact": performance_impact,
             "optimizations_applied": [],  # TODO: Track actual optimizations  # noqa: W0511
             "opensearch_query": (
-
-            ),  # Include the query
+                complete_opensearch_query if "complete_opensearch_query" in locals() else {}
+            ),  # Include the full query body
             "time_range": time_range,
             "timestamp_field": timestamp_field,
             "query_type": "regular",  # Regular query (not stats)
@@ -701,15 +951,58 @@ class OpenSearchOperations:
             },
         }

+        # Add pagination stats if available
+        if pagination_stats:
+            result["post_processing_stats"] = pagination_stats
+
         # Add pagination info for non-scan queries
         if not scan_all:
-
+            # Cap displayed total at 10000 for consistency
+            displayed_total = min(opensearch_total, 10000)
+
+            pagination_info = {
                 "size": size,
                 "from": from_,
-                "total":
-                "
+                "total": displayed_total,
+                "actual_total": opensearch_total,  # Real total for reference
+                "returned": len(results),
             }

+            if needs_phase2 and pagination_stats:
+                # Post-processing was applied - update pagination to reflect auto-pagination
+                actual_last_position = pagination_stats.get("actual_to", from_ + size)
+
+                # Update from to reflect where we actually searched to
+                if pagination_stats["pages_checked"] > 1:
+                    # We auto-paginated, so update the effective "from" position
+                    pagination_info["from"] = from_
+                    pagination_info["actual_from_searched"] = from_
+                    pagination_info["actual_to_searched"] = actual_last_position
+                    pagination_info["auto_paginated"] = True
+                    pagination_info["pages_auto_fetched"] = pagination_stats["pages_checked"]
+
+                # Has more if we haven't reached the 10000 limit
+                pagination_info["has_more"] = actual_last_position < 10000 and actual_last_position < opensearch_total
+                pagination_info["documents_retrieved"] = pagination_stats["documents_retrieved"]
+                pagination_info["documents_filtered"] = pagination_stats["documents_filtered"]
+                pagination_info["filter_rate"] = f"{pagination_stats['filter_rate']:.1f}%"
+
+                # Calculate the last valid page number (page that contains the 10,000th record)
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+            else:
+                # Regular pagination without post-processing
+                # Has more if we got full page and haven't reached 10000 limit
+                pagination_info["has_more"] = len(initial_hits) == size and (from_ + size < 10000)
+
+                # Calculate the last valid page number
+                last_page = min((10000 - 1) // size, (opensearch_total - 1) // size)
+                pagination_info["last_page"] = last_page
+                pagination_info["current_page"] = from_ // size
+
+            result["pagination"] = pagination_info
+
         return result

     def _docs_match(self, doc1: Dict[str, Any], doc2: Dict[str, Any]) -> bool:
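The pagination block above caps the reported total and the last reachable page at OpenSearch's 10,000-document window. A quick worked check of that arithmetic with illustrative values:

```python
# Worked check of the pagination arithmetic above (values are illustrative).
size, from_, opensearch_total = 100, 300, 25_000

displayed_total = min(opensearch_total, 10_000)   # 10000: capped for consistency
current_page = from_ // size                      # 3
last_page = min((10_000 - 1) // size,             # 99: last page inside the window
                (opensearch_total - 1) // size)   # vs. 249 if uncapped

full_page_returned = True                          # stands in for len(initial_hits) == size
has_more = full_page_returned and (from_ + size < 10_000)  # True until the window is exhausted

print(displayed_total, current_page, last_page, has_more)  # 10000 3 99 True
```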
@@ -733,7 +1026,7 @@ class OpenSearchOperations:
     def _extract_grouped_buckets(  # noqa: C901
         self,
         aggs_response: Dict[str, Any],
-        group_by_fields: List[
+        group_by_fields: List[Any],
         aggregations: List[Dict[str, Any]],
         stats_ast: Dict[str, Any],
     ) -> List[Dict[str, Any]]:
@@ -741,7 +1034,7 @@ class OpenSearchOperations:

         Args:
             aggs_response: OpenSearch aggregations response
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (can be strings or dicts)
             aggregations: List of aggregation specifications
             stats_ast: The stats AST for reference

@@ -750,9 +1043,19 @@ class OpenSearchOperations:
         """
         buckets = []

+        # Normalize group_by_fields to extract field names
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                normalized_fields.append(field["field"])
+            else:
+                normalized_fields.append(str(field))
+
         # For single-level grouping
-        if len(
-            field =
+        if len(normalized_fields) == 1:
+            field = normalized_fields[0]
             # Look for the terms aggregation with the group field name
             terms_agg_name = f"group_by_{field}"

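The normalization above lets `group_by` entries be plain field names or dict specifications carrying a `field` key. A standalone version with a sample input (the dict shape beyond the `field` key is illustrative):

```python
# Standalone version of the group_by normalization shown above.
from typing import Any, List


def normalize_group_by_fields(group_by_fields: List[Any]) -> List[str]:
    normalized: List[str] = []
    for field in group_by_fields:
        if isinstance(field, str):
            normalized.append(field)
        elif isinstance(field, dict) and "field" in field:
            normalized.append(field["field"])
        else:
            normalized.append(str(field))
    return normalized


print(normalize_group_by_fields(["source.ip", {"field": "destination.port"}]))
# -> ['source.ip', 'destination.port']
```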
@@ -804,6 +1107,16 @@ class OpenSearchOperations:
                             bucket_result[output_key] = agg_value["value"]
                         else:
                             bucket_result[output_key] = agg_value
+                    else:
+                        # For count(*), also check doc_count
+                        if func == "count" and field_name == "*":
+                            bucket_result[output_key] = bucket.get("doc_count", 0)
+                        else:
+                            # Try to find any aggregation value in the bucket
+                            for key, value in bucket.items():
+                                if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                    bucket_result[output_key] = value["value"]
+                                    break

                 buckets.append(bucket_result)

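The new `else` branch falls back to `doc_count` for `count(*)` and otherwise scans the bucket for a `<func>_*` sub-aggregation with a scalar value. A standalone sketch of that lookup against an illustrative bucket (the same logic is added to `_process_nested_buckets` further down):

```python
# Standalone sketch of the bucket-value fallback shown above (bucket shape is illustrative).
from typing import Any, Dict, Optional


def extract_agg_value(bucket: Dict[str, Any], func: str, field_name: str, agg_name: str) -> Optional[Any]:
    if agg_name in bucket:
        agg_value = bucket[agg_name]
        if isinstance(agg_value, dict) and "value" in agg_value:
            return agg_value["value"]
        return agg_value
    if func == "count" and field_name == "*":
        # count(*) has no sub-aggregation; the terms bucket's doc_count is the answer.
        return bucket.get("doc_count", 0)
    # Last resort: any "<func>_*" sub-aggregation exposing a scalar "value".
    for key, value in bucket.items():
        if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
            return value["value"]
    return None


bucket = {"key": "10.0.0.5", "doc_count": 42, "unique_ports": {"value": 7}}
print(extract_agg_value(bucket, "count", "*", "count_star"))         # 42 (doc_count fallback)
print(extract_agg_value(bucket, "unique", "ports", "unique_ports"))  # 7
```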
@@ -813,7 +1126,7 @@ class OpenSearchOperations:
             current_agg = aggs_response

             # Find the first group_by aggregation
-            for field in
+            for field in normalized_fields:
                 group_key = f"group_by_{field}"
                 if group_key in current_agg:
                     current_agg = current_agg[group_key]
@@ -824,7 +1137,7 @@ class OpenSearchOperations:

             # Process nested buckets recursively
             if "buckets" in current_agg:
-                buckets = self._process_nested_buckets(current_agg["buckets"],
+                buckets = self._process_nested_buckets(current_agg["buckets"], normalized_fields, aggregations, 0)

         return buckets

@@ -839,7 +1152,7 @@ class OpenSearchOperations:

         Args:
             buckets_data: List of bucket data from OpenSearch
-            group_by_fields: List of fields used for grouping
+            group_by_fields: List of fields used for grouping (already normalized to strings)
             aggregations: List of aggregation specifications
             level: Current nesting level (0-based)

@@ -903,6 +1216,16 @@ class OpenSearchOperations:
                         result[output_key] = agg_value["value"]
                     else:
                         result[output_key] = agg_value
+                else:
+                    # For count(*), also check doc_count
+                    if func == "count" and field_name == "*":
+                        result[output_key] = bucket.get("doc_count", 0)
+                    else:
+                        # Try to find any aggregation value in the bucket
+                        for key, value in bucket.items():
+                            if key.startswith(f"{func}_") and isinstance(value, dict) and "value" in value:
+                                result[output_key] = value["value"]
+                                break

             results.append(result)
