tellaro-query-language 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. tellaro_query_language-0.1.0.dist-info/LICENSE +21 -0
  2. tellaro_query_language-0.1.0.dist-info/METADATA +401 -0
  3. tellaro_query_language-0.1.0.dist-info/RECORD +56 -0
  4. tellaro_query_language-0.1.0.dist-info/WHEEL +4 -0
  5. tellaro_query_language-0.1.0.dist-info/entry_points.txt +7 -0
  6. tql/__init__.py +47 -0
  7. tql/analyzer.py +385 -0
  8. tql/cache/__init__.py +7 -0
  9. tql/cache/base.py +25 -0
  10. tql/cache/memory.py +63 -0
  11. tql/cache/redis.py +68 -0
  12. tql/core.py +929 -0
  13. tql/core_components/README.md +92 -0
  14. tql/core_components/__init__.py +20 -0
  15. tql/core_components/file_operations.py +113 -0
  16. tql/core_components/opensearch_operations.py +869 -0
  17. tql/core_components/stats_operations.py +200 -0
  18. tql/core_components/validation_operations.py +599 -0
  19. tql/evaluator.py +379 -0
  20. tql/evaluator_components/README.md +131 -0
  21. tql/evaluator_components/__init__.py +17 -0
  22. tql/evaluator_components/field_access.py +176 -0
  23. tql/evaluator_components/special_expressions.py +296 -0
  24. tql/evaluator_components/value_comparison.py +315 -0
  25. tql/exceptions.py +160 -0
  26. tql/geoip_normalizer.py +233 -0
  27. tql/mutator_analyzer.py +830 -0
  28. tql/mutators/__init__.py +222 -0
  29. tql/mutators/base.py +78 -0
  30. tql/mutators/dns.py +316 -0
  31. tql/mutators/encoding.py +218 -0
  32. tql/mutators/geo.py +363 -0
  33. tql/mutators/list.py +212 -0
  34. tql/mutators/network.py +163 -0
  35. tql/mutators/security.py +225 -0
  36. tql/mutators/string.py +165 -0
  37. tql/opensearch.py +78 -0
  38. tql/opensearch_components/README.md +130 -0
  39. tql/opensearch_components/__init__.py +17 -0
  40. tql/opensearch_components/field_mapping.py +399 -0
  41. tql/opensearch_components/lucene_converter.py +305 -0
  42. tql/opensearch_components/query_converter.py +775 -0
  43. tql/opensearch_mappings.py +309 -0
  44. tql/opensearch_stats.py +451 -0
  45. tql/parser.py +1363 -0
  46. tql/parser_components/README.md +72 -0
  47. tql/parser_components/__init__.py +20 -0
  48. tql/parser_components/ast_builder.py +162 -0
  49. tql/parser_components/error_analyzer.py +101 -0
  50. tql/parser_components/field_extractor.py +112 -0
  51. tql/parser_components/grammar.py +473 -0
  52. tql/post_processor.py +737 -0
  53. tql/scripts.py +124 -0
  54. tql/stats_evaluator.py +444 -0
  55. tql/stats_transformer.py +184 -0
  56. tql/validators.py +110 -0
tql/core_components/opensearch_operations.py
@@ -0,0 +1,869 @@
1
+ """OpenSearch operations for TQL.
2
+
3
+ This module handles all OpenSearch-specific operations including query conversion,
4
+ execution, and result processing.
5
+ """
6
+
7
+ import os
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ from ..exceptions import TQLExecutionError
11
+ from ..mutator_analyzer import MutatorAnalysisResult, MutatorAnalyzer
12
+ from ..opensearch import OpenSearchBackend
13
+ from ..parser import TQLParser
14
+ from ..post_processor import QueryPostProcessor
15
+
16
+
17
+ class OpenSearchOperations:
18
+ """Handles OpenSearch-specific operations for TQL."""
19
+
20
+ def __init__(self, parser: TQLParser, field_mappings: Dict[str, Any], enhanced_mappings: Dict[str, Any]):
21
+ """Initialize OpenSearch operations.
22
+
23
+ Args:
24
+ parser: TQL parser instance
25
+ field_mappings: Field mapping configuration
26
+ enhanced_mappings: Enhanced field mappings with analyzer info
27
+ """
28
+ self.parser = parser
29
+ self.field_mappings = field_mappings
30
+ self.enhanced_mappings = enhanced_mappings
31
+ self.has_analyzer_info = any(mapping.is_enhanced_mapping() for mapping in self.enhanced_mappings.values())
32
+
33
+ def to_opensearch(self, query: str) -> Dict[str, Any]:
34
+ """Convert TQL query to OpenSearch query format.
35
+
36
+ Args:
37
+ query: TQL query string
38
+
39
+ Returns:
40
+ OpenSearch query dictionary
41
+
42
+ Raises:
43
+ TQLParseError: If query parsing fails
44
+ """
45
+ # Parse the query
46
+ ast = self.parser.parse(query)
47
+
48
+ # Create OpenSearch backend
49
+ backend = OpenSearchBackend(field_mappings=self.field_mappings)
50
+
51
+ # Convert to OpenSearch query
52
+ opensearch_query = backend.convert(ast)
53
+
54
+ return opensearch_query
55
+
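A minimal usage sketch of the conversion path above. The no-argument TQLParser() construction, the empty mappings, and the query text are assumptions for illustration; real deployments pass their own field mappings:

    from tql.parser import TQLParser
    from tql.core_components.opensearch_operations import OpenSearchOperations

    # Hypothetical setup: empty mappings are enough for a plain conversion.
    ops = OpenSearchOperations(TQLParser(), field_mappings={}, enhanced_mappings={})

    # Convert a TQL expression into an OpenSearch request body.
    body = ops.to_opensearch('hostname = "web-01"')   # illustrative TQL syntax
    print(body["query"])                              # backend.convert() wraps the clause under "query"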
56
+ def to_opensearch_dsl(self, query: str) -> Dict[str, Any]:
57
+ """Convert TQL query to OpenSearch DSL format.
58
+
59
+ This is an alias for to_opensearch() for backward compatibility.
60
+
61
+ Args:
62
+ query: TQL query string
63
+
64
+ Returns:
65
+ OpenSearch DSL query dictionary
66
+ """
67
+ return self.to_opensearch(query)
68
+
69
+ def analyze_opensearch_query(self, query: str) -> Union[MutatorAnalysisResult, Dict[str, Any]]:
70
+ """Analyze a TQL query for OpenSearch optimization opportunities.
71
+
72
+ This method examines mutator usage and field mappings to determine:
73
+ 1. Which mutators can be pushed to OpenSearch (Phase 1)
74
+ 2. Which mutators must be applied post-query (Phase 2)
75
+ 3. How field mappings affect operator choices
76
+
77
+ Args:
78
+ query: TQL query string
79
+
80
+ Returns:
81
+ MutatorAnalysisResult if mutators present, otherwise analysis dict
82
+ """
83
+ # Parse the query
84
+ ast = self.parser.parse(query)
85
+
86
+ # If there are no mutators, just analyze for field mapping optimizations
87
+ if not self._has_mutators(ast):
88
+ backend = OpenSearchBackend(field_mappings=self.field_mappings)
89
+ os_query = backend.convert(ast)
90
+
91
+ return {
92
+ "has_mutators": False,
93
+ "original_query": query,
94
+ "opensearch_query": os_query,
95
+ "optimizations": self._analyze_field_optimizations(ast),
96
+ }
97
+
98
+ # Create analyzer
99
+ analyzer = MutatorAnalyzer(self.enhanced_mappings)
100
+
101
+ # Analyze the AST
102
+ return analyzer.analyze_ast(ast)
103
+
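Because the return type differs depending on whether mutators are present, callers usually branch on it, much as execute_opensearch() does further below. A hedged sketch reusing the ops instance from the earlier example (the mutator syntax shown is illustrative):

    from tql.mutator_analyzer import MutatorAnalysisResult

    analysis = ops.analyze_opensearch_query('dns.question.name | lowercase = "example.com"')
    if isinstance(analysis, MutatorAnalysisResult):
        # Mutators present: the Phase 1 AST goes to OpenSearch, the rest is applied post-query.
        print(bool(analysis.post_processing_requirements))
    else:
        # No mutators: a ready-to-run query plus any mapping-level optimizations.
        print(analysis["opensearch_query"])
        print(analysis["optimizations"])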
104
+ def _has_mutators(self, ast: Dict[str, Any]) -> bool:
105
+ """Check if AST contains any mutators."""
106
+ if isinstance(ast, dict):
107
+ # Check for mutators in current node
108
+ if ast.get("field_mutators") or ast.get("value_mutators"):
109
+ return True
110
+
111
+ # Check for special expressions (geo, nslookup)
112
+ if ast.get("type") in ["geo_expr", "nslookup_expr"]:
113
+ return True
114
+
115
+ # Recursively check child nodes
116
+ for key, value in ast.items():
117
+ if key in ["left", "right", "operand", "filter", "conditions"]:
118
+ if self._has_mutators(value):
119
+ return True
120
+
121
+ return False
122
+
123
+ def _analyze_field_optimizations(self, ast: Dict[str, Any]) -> List[Dict[str, str]]:
124
+ """Analyze field-specific optimizations based on mappings."""
125
+ optimizations = []
126
+
127
+ # Check if we have analyzer information
128
+ if self.has_analyzer_info:
129
+ optimizations.append(
130
+ {
131
+ "type": "field_mapping",
132
+ "description": "Enhanced field mappings with analyzer information available",
133
+ "benefit": "Queries optimized based on field types and analyzers",
134
+ }
135
+ )
136
+
137
+ return optimizations
138
+
139
+ def execute_opensearch( # noqa: C901
140
+ self,
141
+ query: str,
142
+ index: Optional[str] = None,
143
+ size: int = 10000,
144
+ from_: int = 0,
145
+ sort: Optional[List[Dict[str, Any]]] = None,
146
+ source_includes: Optional[List[str]] = None,
147
+ source_excludes: Optional[List[str]] = None,
148
+ track_total_hits: Union[bool, int] = True,
149
+ explain: bool = False,
150
+ timeout: int = 30,
151
+ preference: Optional[str] = None,
152
+ routing: Optional[str] = None,
153
+ request_cache: Optional[bool] = None,
154
+ terminate_after: Optional[int] = None,
155
+ search_type: Optional[str] = None,
156
+ scroll: Optional[str] = None,
157
+ client: Optional[Any] = None,
158
+ timestamp_field: str = "@timestamp",
159
+ time_range: Optional[Dict[str, str]] = None,
160
+ scan_all: bool = False,
161
+ scroll_size: int = 1000,
162
+ scroll_timeout: str = "5m",
163
+ **kwargs,
164
+ ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
165
+ """Execute TQL query against OpenSearch and return results.
166
+
167
+ This method handles the complete query execution pipeline:
168
+ 1. Parse TQL query and analyze mutators
169
+ 2. Generate optimized OpenSearch query (Phase 1)
170
+ 3. Execute query against OpenSearch
171
+ 4. Apply post-processing mutators (Phase 2)
172
+ 5. Apply any result filtering
173
+
174
+ Args:
175
+ query: TQL query string
176
+ index: OpenSearch index name (uses environment variable if not provided)
177
+ size: Maximum number of results to return (default: 10000)
178
+ from_: Offset for pagination (default: 0)
179
+ sort: List of sort specifications
180
+ source_includes: Fields to include in response
181
+ source_excludes: Fields to exclude from response
182
+ track_total_hits: Whether to track total hit count
183
+ explain: Include score explanation
184
+ timeout: Query timeout
185
+ preference: Query routing preference
186
+ routing: Custom routing value
187
+ request_cache: Whether to use request cache
188
+ terminate_after: Maximum documents to collect per shard
189
+ search_type: Search execution type
190
+ scroll: Scroll timeout for scroll API
191
+ client: Optional OpenSearch client instance (for testing)
192
+ timestamp_field: Field name for timestamp filtering
193
+ time_range: Optional time range dict with 'gte' and/or 'lte' keys
194
+ scan_all: If True, use scroll API to retrieve all matching documents
195
+ scroll_size: Size per scroll when scan_all=True
196
+ scroll_timeout: Scroll timeout when scan_all=True
197
+ **kwargs: Additional OpenSearch parameters
198
+
199
+ Returns:
200
+ Result dict with matching documents (mutators applied) under "results" plus total/health/performance metadata, or a raw OpenSearch-style response when raw_response=True
201
+
202
+ Raises:
203
+ TQLParseError: If query parsing fails
204
+ TQLExecutionError: If OpenSearch execution fails
205
+ ImportError: If opensearch-py is not installed
206
+ """
207
+ try:
208
+ from opensearchpy import OpenSearch
209
+ except ImportError:
210
+ raise ImportError("opensearch-py package is required for OpenSearch queries")
211
+
212
+ # Get index from environment if not provided
213
+ if index is None:
214
+ index = os.getenv("OPENSEARCH_INDEX")
215
+ if not index:
216
+ raise ValueError("OpenSearch index must be provided or set in OPENSEARCH_INDEX environment variable")
217
+
218
+ # Parse the query first to check if it's a stats query
219
+ ast = self.parser.parse(query)
220
+
221
+ # Initialize variables that might be used later
222
+ opensearch_query = None
223
+ needs_phase2 = False
224
+
225
+ # Check if this is a stats query
226
+ is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
227
+
228
+ if is_stats_query:
229
+ # Handle stats queries differently
230
+ from ..opensearch_stats import OpenSearchStatsTranslator
231
+
232
+ translator = OpenSearchStatsTranslator()
233
+
234
+ # Determine the filter and stats parts
235
+ if ast.get("type") == "query_with_stats":
236
+ # Has a filter before stats
237
+ filter_ast = ast.get("filter")
238
+ stats_ast = ast.get("stats")
239
+
240
+ # Convert filter to OpenSearch query
241
+ backend = OpenSearchBackend(field_mappings=self.field_mappings)
242
+ if filter_ast:
243
+ filter_query = backend.convert(filter_ast)["query"]
244
+ else:
245
+ filter_query = {"match_all": {}}
246
+ else:
247
+ # Pure stats query
248
+ stats_ast = ast
249
+ filter_query = {"match_all": {}}
250
+
251
+ # Build aggregations
252
+ if stats_ast:
253
+ stats_result = translator.translate_stats(stats_ast, self.field_mappings)
254
+ else:
255
+ stats_result = {"aggs": {}}
256
+
257
+ # Extract the aggregations (translate_stats returns {"aggs": {...}})
258
+ aggregations = stats_result.get("aggs", {})
259
+
260
+ # Build the complete query
261
+ opensearch_query = {"query": filter_query, "aggs": aggregations}
262
+ needs_phase2 = False
263
+ has_mutators = False
264
+ else:
265
+ # Parse and analyze the query normally
266
+ analysis_result = self.analyze_opensearch_query(query)
267
+
268
+ # Determine if we have mutators
269
+ has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
270
+
271
+ if not is_stats_query:
272
+ if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
273
+ # Use optimized AST (Phase 1) for OpenSearch
274
+ phase1_ast = analysis_result.optimized_ast
275
+ backend = OpenSearchBackend(field_mappings=self.field_mappings)
276
+ opensearch_query = backend.convert(phase1_ast)
277
+
278
+ # Check if we need Phase 2 (post-processing)
279
+ needs_phase2 = bool(analysis_result.post_processing_requirements)
280
+ # Phase 2 will be handled by post_processing_requirements
281
+ else:
282
+ # No mutators, use original query
283
+ assert isinstance(analysis_result, dict)
284
+ opensearch_query = analysis_result["opensearch_query"]
285
+ needs_phase2 = False
286
+ # No phase 2 needed for non-mutator queries
287
+
288
+ # Use provided client or create OpenSearch client
289
+ if client is None:
290
+ client = OpenSearch(
291
+ hosts=[
292
+ {
293
+ "host": os.getenv("OPENSEARCH_HOST", "localhost"),
294
+ "port": int(os.getenv("OPENSEARCH_PORT", "9200")),
295
+ }
296
+ ],
297
+ http_auth=(
298
+ (os.getenv("OPENSEARCH_USERNAME", "admin"), os.getenv("OPENSEARCH_PASSWORD", "admin"))
299
+ if os.getenv("OPENSEARCH_USERNAME")
300
+ else None
301
+ ),
302
+ use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
303
+ verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "false").lower() == "true",
304
+ ssl_show_warn=False,
305
+ )
306
+
307
+ # Build search body
308
+ # opensearch_query already contains {"query": {...}} from backend.convert()
309
+ if opensearch_query is None:
310
+ raise ValueError("Failed to generate OpenSearch query")
311
+ search_body = opensearch_query.copy()
312
+
313
+ # Handle time range filtering
314
+ if time_range is None:
315
+ # Default time range: last 15 minutes
316
+ time_range = {"gte": "now-15m", "lte": "now"}
317
+
318
+ # Add time range filter to the query
319
+ if time_range:
320
+ base_query = search_body.get("query", {})
321
+ time_filter = {"range": {timestamp_field: time_range}}
322
+
323
+ # Wrap the existing query with time filter
324
+ if base_query:
325
+ search_body["query"] = {"bool": {"must": [base_query, time_filter]}}
326
+ else:
327
+ search_body["query"] = time_filter
328
+
329
+ search_body.update({"size": size, "from": from_, "track_total_hits": track_total_hits})
330
+
331
+ # Add optional parameters
332
+ if sort:
333
+ search_body["sort"] = sort
334
+ if source_includes or source_excludes:
335
+ search_body["_source"] = {}
336
+ if source_includes:
337
+ search_body["_source"]["includes"] = source_includes
338
+ if source_excludes:
339
+ search_body["_source"]["excludes"] = source_excludes
340
+ if explain:
341
+ search_body["explain"] = explain
342
+
343
+ # Add any additional parameters from kwargs
344
+ search_body.update(kwargs)
345
+
346
+ # Build search parameters
347
+ search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
348
+
349
+ # Add optional search parameters
350
+ if preference:
351
+ search_params["preference"] = preference
352
+ if routing:
353
+ search_params["routing"] = routing
354
+ if request_cache is not None:
355
+ search_params["request_cache"] = request_cache
356
+ if terminate_after:
357
+ search_params["terminate_after"] = terminate_after
358
+ if search_type:
359
+ search_params["search_type"] = search_type
360
+ if scroll:
361
+ search_params["scroll"] = scroll
362
+
363
+ # Initialize scroll tracking
364
+ scroll_count = 0
365
+
366
+ # Handle scan_all functionality with scroll API
367
+ if scan_all:
368
+ all_hits = []
369
+ search_params["scroll"] = scroll_timeout
370
+ search_params["body"]["size"] = scroll_size
371
+ # Remove from parameter for scroll API
372
+ search_params["body"].pop("from", None)
373
+
374
+ try:
375
+ # Initial search
376
+ response = client.search(**search_params)
377
+ hits = response.get("hits", {}).get("hits", [])
378
+ all_hits.extend(hits)
379
+ scroll_count += 1
380
+
381
+ scroll_id = response.get("_scroll_id")
382
+
383
+ # Continue scrolling until no more results
384
+ while scroll_id and hits:
385
+ scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
386
+
387
+ hits = scroll_response.get("hits", {}).get("hits", [])
388
+ all_hits.extend(hits)
389
+ scroll_id = scroll_response.get("_scroll_id")
390
+ scroll_count += 1
391
+
392
+ # Clean up scroll
393
+ if scroll_id:
394
+ try:
395
+ client.clear_scroll(scroll_id=scroll_id)
396
+ except Exception:
397
+ pass # Ignore cleanup errors
398
+
399
+ # Create a response structure that mimics regular search
400
+ response = {"hits": {"total": {"value": len(all_hits)}, "hits": all_hits}}
401
+
402
+ except Exception as e:
403
+ raise TQLExecutionError(f"OpenSearch scroll query failed: {str(e)}")
404
+ else:
405
+ # Regular search
406
+ try:
407
+ response = client.search(**search_params)
408
+ except Exception as e:
409
+ raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
410
+
411
+ # Handle stats query results differently
412
+ if is_stats_query:
413
+ # Process stats aggregation results
414
+ aggs_response = response.get("aggregations", {})
415
+
416
+ # Format the stats results based on the test expectations
417
+ # Use the correct stats AST
418
+ if ast.get("type") == "query_with_stats":
419
+ stats_ast = ast.get("stats")
420
+ else:
421
+ stats_ast = ast
422
+
423
+ # Extract aggregation info
424
+ if stats_ast:
425
+ aggregations = stats_ast.get("aggregations", [])
426
+ group_by_fields = stats_ast.get("group_by", [])
427
+ else:
428
+ aggregations = []
429
+ group_by_fields = []
430
+
431
+ # Format results differently based on whether we have grouping
432
+ if group_by_fields:
433
+ # For grouped stats, we need to extract buckets
434
+ if stats_ast:
435
+ buckets = self._extract_grouped_buckets(aggs_response, group_by_fields, aggregations, stats_ast)
436
+ else:
437
+ buckets = []
438
+
439
+ # For multiple aggregations, include all operations
440
+ operations = [agg.get("function") for agg in aggregations]
441
+ fields = [agg.get("field") for agg in aggregations]
442
+
443
+ stats_results = {
444
+ "type": "stats",
445
+ "operation": operations[0] if len(operations) == 1 else operations,
446
+ "field": fields[0] if len(fields) == 1 else fields,
447
+ "values": buckets, # Array of buckets for grouped results
448
+ "group_by": group_by_fields,
449
+ }
450
+ else:
451
+ # Simple aggregations without grouping
452
+ if aggregations:
453
+ first_agg = aggregations[0]
454
+ func = first_agg.get("function", "")
455
+ field = first_agg.get("field", "*")
456
+
457
+ # Get the aggregation result
458
+ # The alias is typically func_field_0 for the first aggregation
459
+ alias = first_agg.get("alias") or f"{func}_{field}_0"
460
+ agg_result = aggs_response.get(alias, {})
461
+
462
+ # Extract the value based on aggregation type
463
+ if func == "count":
464
+ value = agg_result.get("value", 0)
465
+ elif func in ["sum", "min", "max", "avg", "average"]:
466
+ value = agg_result.get("value", 0)
467
+ elif func == "unique_count":
468
+ value = agg_result.get("value", 0)
469
+ elif func in ["percentile", "percentiles", "p", "pct"]:
470
+ # Percentiles return a values dict
471
+ values_dict = agg_result.get("values", {})
472
+ # For a single percentile, extract the value
473
+ if len(values_dict) == 1:
474
+ value = list(values_dict.values())[0]
475
+ else:
476
+ value = values_dict
477
+ else:
478
+ value = agg_result
479
+
480
+ stats_results = {
481
+ "type": "stats",
482
+ "operation": func,
483
+ "field": field,
484
+ "values": value,
485
+ "group_by": [],
486
+ }
487
+ else:
488
+ stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
489
+
490
+ # Extract hits if size > 0
491
+ hits = response.get("hits", {}).get("hits", [])
492
+ documents = []
493
+ if size > 0 and hits:
494
+ for hit in hits:
495
+ doc = hit["_source"].copy()
496
+ # Preserve metadata
497
+ if "_id" in hit:
498
+ doc["_id"] = hit["_id"]
499
+ if "_score" in hit:
500
+ doc["_score"] = hit["_score"]
501
+ documents.append(doc)
502
+
503
+ # Return in the expected format
504
+ result = {
505
+ "results": documents,
506
+ "total": response.get("hits", {}).get("total", {}).get("value", 0),
507
+ "stats": stats_results,
508
+ "post_processing_applied": False,
509
+ "health_status": "HEALTHY",
510
+ "health_reasons": [],
511
+ "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
512
+ "scan_info": {"used_scan": False},
513
+ }
514
+
515
+ # Add query_type if documents were requested
516
+ if size > 0:
517
+ result["query_type"] = "stats_with_docs"
518
+
519
+ return result
520
+
521
+ # Extract hits for regular queries
522
+ hits = response.get("hits", {}).get("hits", [])
523
+ total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
524
+
525
+ # Process results based on whether we need Phase 2
526
+ if needs_phase2:
527
+ # Apply Phase 2 processing
528
+ processor = QueryPostProcessor()
529
+
530
+ # Extract documents from hits
531
+ documents = []
532
+ hit_metadata = []
533
+ for hit in hits:
534
+ documents.append(hit["_source"])
535
+ hit_metadata.append(
536
+ {
537
+ "_id": hit.get("_id"),
538
+ "_score": hit.get("_score"),
539
+ "_explanation": hit.get("_explanation") if explain else None,
540
+ }
541
+ )
542
+
543
+ # First apply mutators to all documents
544
+ if isinstance(analysis_result, MutatorAnalysisResult):
545
+ processed_docs = processor.process_results(
546
+ documents,
547
+ analysis_result.post_processing_requirements,
548
+ track_enrichments=kwargs.get("save_enrichment", False),
549
+ )
550
+
551
+ # Then filter results based on requirements (e.g., ALL operator, contains with mutators)
552
+ filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
553
+ else:
554
+ # No post-processing needed
555
+ processed_docs = documents
556
+ filtered_docs = documents
557
+
558
+ # Build final results with preserved metadata
559
+ results = []
560
+ for doc in filtered_docs:
561
+ # Find the original hit metadata for this document
562
+ # This is a simple approach - in production you might want to track IDs
563
+ for i, orig_doc in enumerate(documents):
564
+ if orig_doc == doc or self._docs_match(orig_doc, doc):
565
+ # Add metadata
566
+ if hit_metadata[i]["_id"]:
567
+ doc["_id"] = hit_metadata[i]["_id"]
568
+ if hit_metadata[i]["_score"]:
569
+ doc["_score"] = hit_metadata[i]["_score"]
570
+ if hit_metadata[i]["_explanation"]:
571
+ doc["_explanation"] = hit_metadata[i]["_explanation"]
572
+ break
573
+ results.append(doc)
574
+ else:
575
+ # No Phase 2 needed, just extract documents
576
+ results = []
577
+ for hit in hits:
578
+ doc = hit["_source"].copy()
579
+ # Preserve metadata
580
+ if "_id" in hit:
581
+ doc["_id"] = hit["_id"]
582
+ if "_score" in hit:
583
+ doc["_score"] = hit["_score"]
584
+ if explain and "explanation" in hit:
585
+ doc["_explanation"] = hit["explanation"]
586
+ results.append(doc)
587
+
588
+ # Return raw response if requested
589
+ if kwargs.get("raw_response", False):
590
+ return {
591
+ "took": response.get("took"),
592
+ "timed_out": response.get("timed_out"),
593
+ "hits": {
594
+ "total": response.get("hits", {}).get("total"),
595
+ "max_score": response.get("hits", {}).get("max_score"),
596
+ "hits": results,
597
+ },
598
+ }
599
+
600
+ # Build performance impact info
601
+ performance_impact = {
602
+ "has_post_processing": needs_phase2,
603
+ "impacted_fields": [],
604
+ "mutator_types": [],
605
+ "estimated_overhead": "low",
606
+ }
607
+
608
+ if needs_phase2 and isinstance(analysis_result, MutatorAnalysisResult):
609
+ impacted_fields = set()
610
+ mutator_types = set()
611
+
612
+ for req in analysis_result.post_processing_requirements:
613
+ impacted_fields.add(req.field_name)
614
+ for mutator in req.mutators:
615
+ mutator_types.add(mutator.get("name", "unknown"))
616
+
617
+ performance_impact["impacted_fields"] = list(impacted_fields)
618
+ performance_impact["mutator_types"] = list(mutator_types)
619
+
620
+ # Estimate overhead based on mutator types
621
+ expensive_mutators = {"nslookup", "geoip_lookup", "geo"}
622
+ if any(m in mutator_types for m in expensive_mutators):
623
+ performance_impact["estimated_overhead"] = "high"
624
+ elif len(mutator_types) > 2:
625
+ performance_impact["estimated_overhead"] = "medium"
626
+
627
+ # Determine health status
628
+ if needs_phase2:
629
+ health_status = "yellow"
630
+ health_reasons = ["Post-processing required - results may be incomplete with pagination"]
631
+ else:
632
+ health_status = "green"
633
+ health_reasons = []
634
+
635
+ # Get opensearch total before filtering
636
+ opensearch_total = total_hits
637
+
638
+ result = {
639
+ "results": results,
640
+ "total": len(results),
641
+ "returned": len(results), # Alias for total
642
+ "opensearch_total": opensearch_total,
643
+ "post_processing_applied": needs_phase2,
644
+ "health_status": health_status,
645
+ "health_reasons": health_reasons,
646
+ "performance_impact": performance_impact,
647
+ "optimizations_applied": [], # TODO: Track actual optimizations # noqa: W0511
648
+ "opensearch_query": (
649
+ opensearch_query.get("query", {}) if opensearch_query else {}
650
+ ), # Include the query that was sent
651
+ "time_range": time_range,
652
+ "timestamp_field": timestamp_field,
653
+ "query_type": "regular", # Regular query (not stats)
654
+ "scan_info": {
655
+ "used_scan": scan_all,
656
+ "scroll_size": scroll_size if scan_all else None,
657
+ "scroll_timeout": scroll_timeout if scan_all else None,
658
+ "scroll_count": scroll_count if scan_all else None,
659
+ "documents_retrieved": len(results) if scan_all else None,
660
+ "estimated_total": total_hits if scan_all else None,
661
+ },
662
+ }
663
+
664
+ # Add pagination info for non-scan queries
665
+ if not scan_all:
666
+ result["pagination"] = {
667
+ "size": size,
668
+ "from": from_,
669
+ "total": opensearch_total,
670
+ "has_more": opensearch_total > (from_ + len(results)),
671
+ }
672
+
673
+ return result
674
+
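A hedged end-to-end sketch of the pipeline documented in execute_opensearch(), reusing the ops instance from the first sketch. Index names, query text, and field names are placeholders; the OPENSEARCH_* environment variables read above can stand in for the explicit index argument:

    result = ops.execute_opensearch(
        'destination.port = 443',                     # illustrative TQL
        index="logs-network",                         # or export OPENSEARCH_INDEX instead
        size=100,
        time_range={"gte": "now-1h", "lte": "now"},   # defaults to the last 15 minutes if omitted
    )
    print(result["total"], result["health_status"])
    for doc in result["results"]:
        print(doc.get("_id"), doc.get("destination"))

    # scan_all=True switches to the scroll API and pages through every match.
    everything = ops.execute_opensearch('destination.port = 443', index="logs-network", scan_all=True)
    print(everything["scan_info"]["used_scan"], everything["scan_info"]["scroll_count"])

Stats-style TQL (the stats_expr branch above) returns the same envelope with an additional "stats" entry describing the aggregation results.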
675
+ def _docs_match(self, doc1: Dict[str, Any], doc2: Dict[str, Any]) -> bool:
676
+ """Check if two documents are the same (accounting for mutations).
677
+
678
+ This is a simple implementation - in production you'd want something more robust.
679
+ """
680
+ # If they have the same _id, they match
681
+ if "_id" in doc1 and "_id" in doc2 and doc1["_id"] == doc2["_id"]:
682
+ return True
683
+
684
+ # Otherwise do a simple comparison of a few key fields
685
+ # This is imperfect but works for most cases
686
+ key_fields = ["id", "name", "hostname", "@timestamp"]
687
+ for field in key_fields:
688
+ if field in doc1 and field in doc2 and doc1[field] == doc2[field]:
689
+ return True
690
+
691
+ return False
692
+
693
+ def _extract_grouped_buckets( # noqa: C901
694
+ self,
695
+ aggs_response: Dict[str, Any],
696
+ group_by_fields: List[str],
697
+ aggregations: List[Dict[str, Any]],
698
+ stats_ast: Dict[str, Any],
699
+ ) -> List[Dict[str, Any]]:
700
+ """Extract buckets from grouped aggregation response.
701
+
702
+ Args:
703
+ aggs_response: OpenSearch aggregations response
704
+ group_by_fields: List of fields used for grouping
705
+ aggregations: List of aggregation specifications
706
+ stats_ast: The stats AST for reference
707
+
708
+ Returns:
709
+ List of bucket dictionaries with group keys and aggregation values
710
+ """
711
+ buckets = []
712
+
713
+ # For single-level grouping
714
+ if len(group_by_fields) == 1:
715
+ field = group_by_fields[0]
716
+ # Look for the terms aggregation with the group field name
717
+ terms_agg_name = f"group_by_{field}"
718
+
719
+ # The aggregation might be named differently, check for it
720
+ # OpenSearch stats translator uses the field name directly
721
+ if field in aggs_response:
722
+ buckets_data = aggs_response[field].get("buckets", [])
723
+ elif terms_agg_name in aggs_response:
724
+ buckets_data = aggs_response[terms_agg_name].get("buckets", [])
725
+ else:
726
+ # Try to find any terms aggregation
727
+ for _key, value in aggs_response.items():
728
+ if isinstance(value, dict) and "buckets" in value:
729
+ buckets_data = value["buckets"]
730
+ break
731
+ else:
732
+ buckets_data = []
733
+
734
+ # Process each bucket
735
+ for bucket in buckets_data:
736
+ bucket_result = {field: bucket.get("key")}
737
+
738
+ # Extract aggregation values
739
+ for i, agg in enumerate(aggregations):
740
+ func = agg.get("function", "")
741
+ field_name = agg.get("field", "*")
742
+ alias = agg.get("alias") or f"{func}_{field_name}_{i}"
743
+
744
+ # Map function names to expected output names
745
+ output_key = func
746
+ if func == "avg":
747
+ output_key = "average"
748
+ elif func == "unique_count":
749
+ output_key = "distinct_count"
750
+
751
+ if alias in bucket:
752
+ agg_value = bucket[alias]
753
+ # Extract the actual value
754
+ if isinstance(agg_value, dict) and "value" in agg_value:
755
+ bucket_result[output_key] = agg_value["value"]
756
+ else:
757
+ bucket_result[output_key] = agg_value
758
+ else:
759
+ # Try without index suffix for first aggregation
760
+ simple_alias = f"{func}_{field_name}"
761
+ if simple_alias in bucket:
762
+ agg_value = bucket[simple_alias]
763
+ if isinstance(agg_value, dict) and "value" in agg_value:
764
+ bucket_result[output_key] = agg_value["value"]
765
+ else:
766
+ bucket_result[output_key] = agg_value
767
+
768
+ buckets.append(bucket_result)
769
+
770
+ else:
771
+ # Multi-level grouping - need to traverse nested structure
772
+ # Start with the outermost grouping
773
+ current_agg = aggs_response
774
+
775
+ # Find the first group_by aggregation
776
+ for field in group_by_fields:
777
+ group_key = f"group_by_{field}"
778
+ if group_key in current_agg:
779
+ current_agg = current_agg[group_key]
780
+ break
781
+ elif field in current_agg:
782
+ current_agg = current_agg[field]
783
+ break
784
+
785
+ # Process nested buckets recursively
786
+ if "buckets" in current_agg:
787
+ buckets = self._process_nested_buckets(current_agg["buckets"], group_by_fields, aggregations, 0)
788
+
789
+ return buckets
790
+
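An illustrative, made-up aggregation response and the flattened rows this internal helper yields for a single group_by field; the func_field_index alias convention matches the lookup logic above (shown only to clarify the shapes involved):

    aggs_response = {
        "source.ip": {                                  # terms aggregation keyed by the group field
            "buckets": [
                {"key": "10.0.0.1", "count_*_0": {"value": 42}},
                {"key": "10.0.0.2", "count_*_0": {"value": 7}},
            ]
        }
    }
    aggregations = [{"function": "count", "field": "*"}]

    rows = ops._extract_grouped_buckets(aggs_response, ["source.ip"], aggregations, stats_ast={})
    # -> [{"source.ip": "10.0.0.1", "count": 42}, {"source.ip": "10.0.0.2", "count": 7}]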
791
+ def _process_nested_buckets( # noqa: C901
792
+ self,
793
+ buckets_data: List[Dict[str, Any]],
794
+ group_by_fields: List[str],
795
+ aggregations: List[Dict[str, Any]],
796
+ level: int,
797
+ ) -> List[Dict[str, Any]]:
798
+ """Process nested buckets for multi-level grouping.
799
+
800
+ Args:
801
+ buckets_data: List of bucket data from OpenSearch
802
+ group_by_fields: List of fields used for grouping
803
+ aggregations: List of aggregation specifications
804
+ level: Current nesting level (0-based)
805
+
806
+ Returns:
807
+ Flattened list of bucket results
808
+ """
809
+ results = []
810
+
811
+ for bucket in buckets_data:
812
+ # Get the key for this level
813
+ field_name = group_by_fields[level]
814
+ bucket_key = {field_name: bucket.get("key")}
815
+
816
+ # Check if there are more levels
817
+ if level + 1 < len(group_by_fields):
818
+ # Look for the next level's aggregation
819
+ next_field = group_by_fields[level + 1]
820
+ next_group_key = f"group_by_{next_field}"
821
+
822
+ if next_group_key in bucket and "buckets" in bucket[next_group_key]:
823
+ # Recursively process nested buckets
824
+ nested_results = self._process_nested_buckets(
825
+ bucket[next_group_key]["buckets"], group_by_fields, aggregations, level + 1
826
+ )
827
+
828
+ # Merge current key with nested results
829
+ for nested in nested_results:
830
+ merged = bucket_key.copy()
831
+ merged.update(nested)
832
+ results.append(merged)
833
+ else:
834
+ # This is the innermost level - extract aggregation values
835
+ result = bucket_key.copy()
836
+
837
+ # Extract aggregation values
838
+ for i, agg in enumerate(aggregations):
839
+ func = agg.get("function", "")
840
+ field_name = agg.get("field", "*")
841
+ alias = agg.get("alias") or f"{func}_{field_name}_{i}"
842
+
843
+ # Map function names to expected output names
844
+ output_key = func
845
+ if func == "avg":
846
+ output_key = "average"
847
+ elif func == "unique_count":
848
+ output_key = "distinct_count"
849
+
850
+ if alias in bucket:
851
+ agg_value = bucket[alias]
852
+ # Extract the actual value
853
+ if isinstance(agg_value, dict) and "value" in agg_value:
854
+ result[output_key] = agg_value["value"]
855
+ else:
856
+ result[output_key] = agg_value
857
+ else:
858
+ # Try without index suffix for first aggregation
859
+ simple_alias = f"{func}_{field_name}"
860
+ if simple_alias in bucket:
861
+ agg_value = bucket[simple_alias]
862
+ if isinstance(agg_value, dict) and "value" in agg_value:
863
+ result[output_key] = agg_value["value"]
864
+ else:
865
+ result[output_key] = agg_value
866
+
867
+ results.append(result)
868
+
869
+ return results
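Finally, a hedged illustration of the two-level flattening performed by _process_nested_buckets, again reusing the ops instance from the first sketch and made-up buckets that follow the group_by_<field> nesting convention referenced above:

    nested_buckets = [
        {
            "key": "web-01",
            "group_by_destination.port": {
                "buckets": [
                    {"key": 443, "sum_bytes_0": {"value": 1000}},
                    {"key": 80, "sum_bytes_0": {"value": 250}},
                ]
            },
        }
    ]
    aggs = [{"function": "sum", "field": "bytes"}]

    rows = ops._process_nested_buckets(nested_buckets, ["host.name", "destination.port"], aggs, level=0)
    # -> [{"host.name": "web-01", "destination.port": 443, "sum": 1000},
    #     {"host.name": "web-01", "destination.port": 80, "sum": 250}]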