tellaro_query_language-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tellaro_query_language-0.1.0.dist-info/LICENSE +21 -0
- tellaro_query_language-0.1.0.dist-info/METADATA +401 -0
- tellaro_query_language-0.1.0.dist-info/RECORD +56 -0
- tellaro_query_language-0.1.0.dist-info/WHEEL +4 -0
- tellaro_query_language-0.1.0.dist-info/entry_points.txt +7 -0
- tql/__init__.py +47 -0
- tql/analyzer.py +385 -0
- tql/cache/__init__.py +7 -0
- tql/cache/base.py +25 -0
- tql/cache/memory.py +63 -0
- tql/cache/redis.py +68 -0
- tql/core.py +929 -0
- tql/core_components/README.md +92 -0
- tql/core_components/__init__.py +20 -0
- tql/core_components/file_operations.py +113 -0
- tql/core_components/opensearch_operations.py +869 -0
- tql/core_components/stats_operations.py +200 -0
- tql/core_components/validation_operations.py +599 -0
- tql/evaluator.py +379 -0
- tql/evaluator_components/README.md +131 -0
- tql/evaluator_components/__init__.py +17 -0
- tql/evaluator_components/field_access.py +176 -0
- tql/evaluator_components/special_expressions.py +296 -0
- tql/evaluator_components/value_comparison.py +315 -0
- tql/exceptions.py +160 -0
- tql/geoip_normalizer.py +233 -0
- tql/mutator_analyzer.py +830 -0
- tql/mutators/__init__.py +222 -0
- tql/mutators/base.py +78 -0
- tql/mutators/dns.py +316 -0
- tql/mutators/encoding.py +218 -0
- tql/mutators/geo.py +363 -0
- tql/mutators/list.py +212 -0
- tql/mutators/network.py +163 -0
- tql/mutators/security.py +225 -0
- tql/mutators/string.py +165 -0
- tql/opensearch.py +78 -0
- tql/opensearch_components/README.md +130 -0
- tql/opensearch_components/__init__.py +17 -0
- tql/opensearch_components/field_mapping.py +399 -0
- tql/opensearch_components/lucene_converter.py +305 -0
- tql/opensearch_components/query_converter.py +775 -0
- tql/opensearch_mappings.py +309 -0
- tql/opensearch_stats.py +451 -0
- tql/parser.py +1363 -0
- tql/parser_components/README.md +72 -0
- tql/parser_components/__init__.py +20 -0
- tql/parser_components/ast_builder.py +162 -0
- tql/parser_components/error_analyzer.py +101 -0
- tql/parser_components/field_extractor.py +112 -0
- tql/parser_components/grammar.py +473 -0
- tql/post_processor.py +737 -0
- tql/scripts.py +124 -0
- tql/stats_evaluator.py +444 -0
- tql/stats_transformer.py +184 -0
- tql/validators.py +110 -0
tql/core_components/opensearch_operations.py
@@ -0,0 +1,869 @@
+"""OpenSearch operations for TQL.
+
+This module handles all OpenSearch-specific operations including query conversion,
+execution, and result processing.
+"""
+
+import os
+from typing import Any, Dict, List, Optional, Union
+
+from ..exceptions import TQLExecutionError
+from ..mutator_analyzer import MutatorAnalysisResult, MutatorAnalyzer
+from ..opensearch import OpenSearchBackend
+from ..parser import TQLParser
+from ..post_processor import QueryPostProcessor
+
+
+class OpenSearchOperations:
+    """Handles OpenSearch-specific operations for TQL."""
+
+    def __init__(self, parser: TQLParser, field_mappings: Dict[str, Any], enhanced_mappings: Dict[str, Any]):
+        """Initialize OpenSearch operations.
+
+        Args:
+            parser: TQL parser instance
+            field_mappings: Field mapping configuration
+            enhanced_mappings: Enhanced field mappings with analyzer info
+        """
+        self.parser = parser
+        self.field_mappings = field_mappings
+        self.enhanced_mappings = enhanced_mappings
+        self.has_analyzer_info = any(mapping.is_enhanced_mapping() for mapping in self.enhanced_mappings.values())
+
+    def to_opensearch(self, query: str) -> Dict[str, Any]:
+        """Convert TQL query to OpenSearch query format.
+
+        Args:
+            query: TQL query string
+
+        Returns:
+            OpenSearch query dictionary
+
+        Raises:
+            TQLParseError: If query parsing fails
+        """
+        # Parse the query
+        ast = self.parser.parse(query)
+
+        # Create OpenSearch backend
+        backend = OpenSearchBackend(field_mappings=self.field_mappings)
+
+        # Convert to OpenSearch query
+        opensearch_query = backend.convert(ast)
+
+        return opensearch_query
+
+    def to_opensearch_dsl(self, query: str) -> Dict[str, Any]:
+        """Convert TQL query to OpenSearch DSL format.
+
+        This is an alias for to_opensearch() for backward compatibility.
+
+        Args:
+            query: TQL query string
+
+        Returns:
+            OpenSearch DSL query dictionary
+        """
+        return self.to_opensearch(query)
+
+    def analyze_opensearch_query(self, query: str) -> Union[MutatorAnalysisResult, Dict[str, Any]]:
+        """Analyze a TQL query for OpenSearch optimization opportunities.
+
+        This method examines mutator usage and field mappings to determine:
+        1. Which mutators can be pushed to OpenSearch (Phase 1)
+        2. Which mutators must be applied post-query (Phase 2)
+        3. How field mappings affect operator choices
+
+        Args:
+            query: TQL query string
+
+        Returns:
+            MutatorAnalysisResult if mutators present, otherwise analysis dict
+        """
+        # Parse the query
+        ast = self.parser.parse(query)
+
+        # If there are no mutators, just analyze for field mapping optimizations
+        if not self._has_mutators(ast):
+            backend = OpenSearchBackend(field_mappings=self.field_mappings)
+            os_query = backend.convert(ast)
+
+            return {
+                "has_mutators": False,
+                "original_query": query,
+                "opensearch_query": os_query,
+                "optimizations": self._analyze_field_optimizations(ast),
+            }
+
+        # Create analyzer
+        analyzer = MutatorAnalyzer(self.enhanced_mappings)
+
+        # Analyze the AST
+        return analyzer.analyze_ast(ast)
+
+    def _has_mutators(self, ast: Dict[str, Any]) -> bool:
+        """Check if AST contains any mutators."""
+        if isinstance(ast, dict):
+            # Check for mutators in current node
+            if ast.get("field_mutators") or ast.get("value_mutators"):
+                return True
+
+            # Check for special expressions (geo, nslookup)
+            if ast.get("type") in ["geo_expr", "nslookup_expr"]:
+                return True
+
+            # Recursively check child nodes
+            for key, value in ast.items():
+                if key in ["left", "right", "operand", "filter", "conditions"]:
+                    if self._has_mutators(value):
+                        return True
+
+        return False
+
+    def _analyze_field_optimizations(self, ast: Dict[str, Any]) -> List[Dict[str, str]]:
+        """Analyze field-specific optimizations based on mappings."""
+        optimizations = []
+
+        # Check if we have analyzer information
+        if self.has_analyzer_info:
+            optimizations.append(
+                {
+                    "type": "field_mapping",
+                    "description": "Enhanced field mappings with analyzer information available",
+                    "benefit": "Queries optimized based on field types and analyzers",
+                }
+            )
+
+        return optimizations
+
+    def execute_opensearch(  # noqa: C901
+        self,
+        query: str,
+        index: Optional[str] = None,
+        size: int = 10000,
+        from_: int = 0,
+        sort: Optional[List[Dict[str, Any]]] = None,
+        source_includes: Optional[List[str]] = None,
+        source_excludes: Optional[List[str]] = None,
+        track_total_hits: Union[bool, int] = True,
+        explain: bool = False,
+        timeout: int = 30,
+        preference: Optional[str] = None,
+        routing: Optional[str] = None,
+        request_cache: Optional[bool] = None,
+        terminate_after: Optional[int] = None,
+        search_type: Optional[str] = None,
+        scroll: Optional[str] = None,
+        client: Optional[Any] = None,
+        timestamp_field: str = "@timestamp",
+        time_range: Optional[Dict[str, str]] = None,
+        scan_all: bool = False,
+        scroll_size: int = 1000,
+        scroll_timeout: str = "5m",
+        **kwargs,
+    ) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
+        """Execute TQL query against OpenSearch and return results.
+
+        This method handles the complete query execution pipeline:
+        1. Parse TQL query and analyze mutators
+        2. Generate optimized OpenSearch query (Phase 1)
+        3. Execute query against OpenSearch
+        4. Apply post-processing mutators (Phase 2)
+        5. Apply any result filtering
+
+        Args:
+            query: TQL query string
+            index: OpenSearch index name (uses environment variable if not provided)
+            size: Maximum number of results to return (default: 10000)
+            from_: Offset for pagination (default: 0)
+            sort: List of sort specifications
+            source_includes: Fields to include in response
+            source_excludes: Fields to exclude from response
+            track_total_hits: Whether to track total hit count
+            explain: Include score explanation
+            timeout: Query timeout
+            preference: Query routing preference
+            routing: Custom routing value
+            request_cache: Whether to use request cache
+            terminate_after: Maximum documents to collect per shard
+            search_type: Search execution type
+            scroll: Scroll timeout for scroll API
+            client: Optional OpenSearch client instance (for testing)
+            timestamp_field: Field name for timestamp filtering
+            time_range: Optional time range dict with 'gte' and/or 'lte' keys
+            scan_all: If True, use scroll API to retrieve all matching documents
+            scroll_size: Size per scroll when scan_all=True
+            scroll_timeout: Scroll timeout when scan_all=True
+            **kwargs: Additional OpenSearch parameters
+
+        Returns:
+            List of matching documents with mutators applied, or full response dict if raw=True
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If OpenSearch execution fails
+            ImportError: If opensearch-py is not installed
+        """
+        try:
+            from opensearchpy import OpenSearch
+        except ImportError:
+            raise ImportError("opensearch-py package is required for OpenSearch queries")
+
+        # Get index from environment if not provided
+        if index is None:
+            index = os.getenv("OPENSEARCH_INDEX")
+            if not index:
+                raise ValueError("OpenSearch index must be provided or set in OPENSEARCH_INDEX environment variable")
+
+        # Parse the query first to check if it's a stats query
+        ast = self.parser.parse(query)
+
+        # Initialize variables that might be used later
+        opensearch_query = None
+        needs_phase2 = False
+
+        # Check if this is a stats query
+        is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
+
+        if is_stats_query:
+            # Handle stats queries differently
+            from ..opensearch_stats import OpenSearchStatsTranslator
+
+            translator = OpenSearchStatsTranslator()
+
+            # Determine the filter and stats parts
+            if ast.get("type") == "query_with_stats":
+                # Has a filter before stats
+                filter_ast = ast.get("filter")
+                stats_ast = ast.get("stats")
+
+                # Convert filter to OpenSearch query
+                backend = OpenSearchBackend(field_mappings=self.field_mappings)
+                if filter_ast:
+                    filter_query = backend.convert(filter_ast)["query"]
+                else:
+                    filter_query = {"match_all": {}}
+            else:
+                # Pure stats query
+                stats_ast = ast
+                filter_query = {"match_all": {}}
+
+            # Build aggregations
+            if stats_ast:
+                stats_result = translator.translate_stats(stats_ast, self.field_mappings)
+            else:
+                stats_result = {"aggs": {}}
+
+            # Extract the aggregations (translate_stats returns {"aggs": {...}})
+            aggregations = stats_result.get("aggs", {})
+
+            # Build the complete query
+            opensearch_query = {"query": filter_query, "aggs": aggregations}
+            needs_phase2 = False
+            has_mutators = False
+        else:
+            # Parse and analyze the query normally
+            analysis_result = self.analyze_opensearch_query(query)
+
+            # Determine if we have mutators
+            has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
+
+        if not is_stats_query:
+            if has_mutators and isinstance(analysis_result, MutatorAnalysisResult):
+                # Use optimized AST (Phase 1) for OpenSearch
+                phase1_ast = analysis_result.optimized_ast
+                backend = OpenSearchBackend(field_mappings=self.field_mappings)
+                opensearch_query = backend.convert(phase1_ast)
+
+                # Check if we need Phase 2 (post-processing)
+                needs_phase2 = bool(analysis_result.post_processing_requirements)
+                # Phase 2 will be handled by post_processing_requirements
+            else:
+                # No mutators, use original query
+                assert isinstance(analysis_result, dict)
+                opensearch_query = analysis_result["opensearch_query"]
+                needs_phase2 = False
+                # No phase 2 needed for non-mutator queries
+
+        # Use provided client or create OpenSearch client
+        if client is None:
+            client = OpenSearch(
+                hosts=[
+                    {
+                        "host": os.getenv("OPENSEARCH_HOST", "localhost"),
+                        "port": int(os.getenv("OPENSEARCH_PORT", "9200")),
+                    }
+                ],
+                http_auth=(
+                    (os.getenv("OPENSEARCH_USERNAME", "admin"), os.getenv("OPENSEARCH_PASSWORD", "admin"))
+                    if os.getenv("OPENSEARCH_USERNAME")
+                    else None
+                ),
+                use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
+                verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "false").lower() == "true",
+                ssl_show_warn=False,
+            )
+
+        # Build search body
+        # opensearch_query already contains {"query": {...}} from backend.convert()
+        if opensearch_query is None:
+            raise ValueError("Failed to generate OpenSearch query")
+        search_body = opensearch_query.copy()
+
+        # Handle time range filtering
+        if time_range is None:
+            # Default time range: last 15 minutes
+            time_range = {"gte": "now-15m", "lte": "now"}
+
+        # Add time range filter to the query
+        if time_range:
+            base_query = search_body.get("query", {})
+            time_filter = {"range": {timestamp_field: time_range}}
+
+            # Wrap the existing query with time filter
+            if base_query:
+                search_body["query"] = {"bool": {"must": [base_query, time_filter]}}
+            else:
+                search_body["query"] = time_filter
+
+        search_body.update({"size": size, "from": from_, "track_total_hits": track_total_hits})
+
+        # Add optional parameters
+        if sort:
+            search_body["sort"] = sort
+        if source_includes or source_excludes:
+            search_body["_source"] = {}
+            if source_includes:
+                search_body["_source"]["includes"] = source_includes
+            if source_excludes:
+                search_body["_source"]["excludes"] = source_excludes
+        if explain:
+            search_body["explain"] = explain
+
+        # Add any additional parameters from kwargs
+        search_body.update(kwargs)
+
+        # Build search parameters
+        search_params: Dict[str, Any] = {"index": index, "body": search_body, "timeout": timeout}
+
+        # Add optional search parameters
+        if preference:
+            search_params["preference"] = preference
+        if routing:
+            search_params["routing"] = routing
+        if request_cache is not None:
+            search_params["request_cache"] = request_cache
+        if terminate_after:
+            search_params["terminate_after"] = terminate_after
+        if search_type:
+            search_params["search_type"] = search_type
+        if scroll:
+            search_params["scroll"] = scroll
+
+        # Initialize scroll tracking
+        scroll_count = 0
+
+        # Handle scan_all functionality with scroll API
+        if scan_all:
+            all_hits = []
+            search_params["scroll"] = scroll_timeout
+            search_params["body"]["size"] = scroll_size
+            # Remove from parameter for scroll API
+            search_params["body"].pop("from", None)
+
+            try:
+                # Initial search
+                response = client.search(**search_params)
+                hits = response.get("hits", {}).get("hits", [])
+                all_hits.extend(hits)
+                scroll_count += 1
+
+                scroll_id = response.get("_scroll_id")
+
+                # Continue scrolling until no more results
+                while scroll_id and hits:
+                    scroll_response = client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
+
+                    hits = scroll_response.get("hits", {}).get("hits", [])
+                    all_hits.extend(hits)
+                    scroll_id = scroll_response.get("_scroll_id")
+                    scroll_count += 1
+
+                # Clean up scroll
+                if scroll_id:
+                    try:
+                        client.clear_scroll(scroll_id=scroll_id)
+                    except Exception:
+                        pass  # Ignore cleanup errors
+
+                # Create a response structure that mimics regular search
+                response = {"hits": {"total": {"value": len(all_hits)}, "hits": all_hits}}
+
+            except Exception as e:
+                raise TQLExecutionError(f"OpenSearch scroll query failed: {str(e)}")
+        else:
+            # Regular search
+            try:
+                response = client.search(**search_params)
+            except Exception as e:
+                raise TQLExecutionError(f"OpenSearch query failed: {str(e)}")
+
+        # Handle stats query results differently
+        if is_stats_query:
+            # Process stats aggregation results
+            aggs_response = response.get("aggregations", {})
+
+            # Format the stats results based on the test expectations
+            # Use the correct stats AST
+            if ast.get("type") == "query_with_stats":
+                stats_ast = ast.get("stats")
+            else:
+                stats_ast = ast
+
+            # Extract aggregation info
+            if stats_ast:
+                aggregations = stats_ast.get("aggregations", [])
+                group_by_fields = stats_ast.get("group_by", [])
+            else:
+                aggregations = []
+                group_by_fields = []
+
+            # Format results differently based on whether we have grouping
+            if group_by_fields:
+                # For grouped stats, we need to extract buckets
+                if stats_ast:
+                    buckets = self._extract_grouped_buckets(aggs_response, group_by_fields, aggregations, stats_ast)
+                else:
+                    buckets = []
+
+                # For multiple aggregations, include all operations
+                operations = [agg.get("function") for agg in aggregations]
+                fields = [agg.get("field") for agg in aggregations]
+
+                stats_results = {
+                    "type": "stats",
+                    "operation": operations[0] if len(operations) == 1 else operations,
+                    "field": fields[0] if len(fields) == 1 else fields,
+                    "values": buckets,  # Array of buckets for grouped results
+                    "group_by": group_by_fields,
+                }
+            else:
+                # Simple aggregations without grouping
+                if aggregations:
+                    first_agg = aggregations[0]
+                    func = first_agg.get("function", "")
+                    field = first_agg.get("field", "*")
+
+                    # Get the aggregation result
+                    # The alias is typically func_field_0 for the first aggregation
+                    alias = first_agg.get("alias") or f"{func}_{field}_0"
+                    agg_result = aggs_response.get(alias, {})
+
+                    # Extract the value based on aggregation type
+                    if func == "count":
+                        value = agg_result.get("value", 0)
+                    elif func in ["sum", "min", "max", "avg", "average"]:
+                        value = agg_result.get("value", 0)
+                    elif func == "unique_count":
+                        value = agg_result.get("value", 0)
+                    elif func in ["percentile", "percentiles", "p", "pct"]:
+                        # Percentiles return a values dict
+                        values_dict = agg_result.get("values", {})
+                        # For a single percentile, extract the value
+                        if len(values_dict) == 1:
+                            value = list(values_dict.values())[0]
+                        else:
+                            value = values_dict
+                    else:
+                        value = agg_result
+
+                    stats_results = {
+                        "type": "stats",
+                        "operation": func,
+                        "field": field,
+                        "values": value,
+                        "group_by": [],
+                    }
+                else:
+                    stats_results = {"type": "stats", "operation": "unknown", "field": "*", "values": 0, "group_by": []}
+
+            # Extract hits if size > 0
+            hits = response.get("hits", {}).get("hits", [])
+            documents = []
+            if size > 0 and hits:
+                for hit in hits:
+                    doc = hit["_source"].copy()
+                    # Preserve metadata
+                    if "_id" in hit:
+                        doc["_id"] = hit["_id"]
+                    if "_score" in hit:
+                        doc["_score"] = hit["_score"]
+                    documents.append(doc)
+
+            # Return in the expected format
+            result = {
+                "results": documents,
+                "total": response.get("hits", {}).get("total", {}).get("value", 0),
+                "stats": stats_results,
+                "post_processing_applied": False,
+                "health_status": "HEALTHY",
+                "health_reasons": [],
+                "performance_impact": {"overhead_ms": 0, "mutators_applied": 0},
+                "scan_info": {"used_scan": False},
+            }
+
+            # Add query_type if documents were requested
+            if size > 0:
+                result["query_type"] = "stats_with_docs"
+
+            return result
+
+        # Extract hits for regular queries
+        hits = response.get("hits", {}).get("hits", [])
+        total_hits = response.get("hits", {}).get("total", {}).get("value", 0)
+
+        # Process results based on whether we need Phase 2
+        if needs_phase2:
+            # Apply Phase 2 processing
+            processor = QueryPostProcessor()
+
+            # Extract documents from hits
+            documents = []
+            hit_metadata = []
+            for hit in hits:
+                documents.append(hit["_source"])
+                hit_metadata.append(
+                    {
+                        "_id": hit.get("_id"),
+                        "_score": hit.get("_score"),
+                        "_explanation": hit.get("_explanation") if explain else None,
+                    }
+                )
+
+            # First apply mutators to all documents
+            if isinstance(analysis_result, MutatorAnalysisResult):
+                processed_docs = processor.process_results(
+                    documents,
+                    analysis_result.post_processing_requirements,
+                    track_enrichments=kwargs.get("save_enrichment", False),
+                )
+
+                # Then filter results based on requirements (e.g., ALL operator, contains with mutators)
+                filtered_docs = processor.filter_results(processed_docs, analysis_result.post_processing_requirements)
+            else:
+                # No post-processing needed
+                processed_docs = documents
+                filtered_docs = documents
+
+            # Build final results with preserved metadata
+            results = []
+            for doc in filtered_docs:
+                # Find the original hit metadata for this document
+                # This is a simple approach - in production you might want to track IDs
+                for i, orig_doc in enumerate(documents):
+                    if orig_doc == doc or self._docs_match(orig_doc, doc):
+                        # Add metadata
+                        if hit_metadata[i]["_id"]:
+                            doc["_id"] = hit_metadata[i]["_id"]
+                        if hit_metadata[i]["_score"]:
+                            doc["_score"] = hit_metadata[i]["_score"]
+                        if hit_metadata[i]["_explanation"]:
+                            doc["_explanation"] = hit_metadata[i]["_explanation"]
+                        break
+                results.append(doc)
+        else:
+            # No Phase 2 needed, just extract documents
+            results = []
+            for hit in hits:
+                doc = hit["_source"].copy()
+                # Preserve metadata
+                if "_id" in hit:
+                    doc["_id"] = hit["_id"]
+                if "_score" in hit:
+                    doc["_score"] = hit["_score"]
+                if explain and "explanation" in hit:
+                    doc["_explanation"] = hit["explanation"]
+                results.append(doc)
+
+        # Return raw response if requested
+        if kwargs.get("raw_response", False):
+            return {
+                "took": response.get("took"),
+                "timed_out": response.get("timed_out"),
+                "hits": {
+                    "total": response.get("hits", {}).get("total"),
+                    "max_score": response.get("hits", {}).get("max_score"),
+                    "hits": results,
+                },
+            }
+
+        # Build performance impact info
+        performance_impact = {
+            "has_post_processing": needs_phase2,
+            "impacted_fields": [],
+            "mutator_types": [],
+            "estimated_overhead": "low",
+        }
+
+        if needs_phase2 and isinstance(analysis_result, MutatorAnalysisResult):
+            impacted_fields = set()
+            mutator_types = set()
+
+            for req in analysis_result.post_processing_requirements:
+                impacted_fields.add(req.field_name)
+                for mutator in req.mutators:
+                    mutator_types.add(mutator.get("name", "unknown"))
+
+            performance_impact["impacted_fields"] = list(impacted_fields)
+            performance_impact["mutator_types"] = list(mutator_types)
+
+            # Estimate overhead based on mutator types
+            expensive_mutators = {"nslookup", "geoip_lookup", "geo"}
+            if any(m in mutator_types for m in expensive_mutators):
+                performance_impact["estimated_overhead"] = "high"
+            elif len(mutator_types) > 2:
+                performance_impact["estimated_overhead"] = "medium"
+
+        # Determine health status
+        if needs_phase2:
+            health_status = "yellow"
+            health_reasons = ["Post-processing required - results may be incomplete with pagination"]
+        else:
+            health_status = "green"
+            health_reasons = []
+
+        # Get opensearch total before filtering
+        opensearch_total = total_hits
+
+        result = {
+            "results": results,
+            "total": len(results),
+            "returned": len(results),  # Alias for total
+            "opensearch_total": opensearch_total,
+            "post_processing_applied": needs_phase2,
+            "health_status": health_status,
+            "health_reasons": health_reasons,
+            "performance_impact": performance_impact,
+            "optimizations_applied": [],  # TODO: Track actual optimizations  # noqa: W0511
+            "opensearch_query": (
+                opensearch_query.get("query", {}) if opensearch_query else {}
+            ),  # Include the query that was sent
+            "time_range": time_range,
+            "timestamp_field": timestamp_field,
+            "query_type": "regular",  # Regular query (not stats)
+            "scan_info": {
+                "used_scan": scan_all,
+                "scroll_size": scroll_size if scan_all else None,
+                "scroll_timeout": scroll_timeout if scan_all else None,
+                "scroll_count": scroll_count if scan_all else None,
+                "documents_retrieved": len(results) if scan_all else None,
+                "estimated_total": total_hits if scan_all else None,
+            },
+        }
+
+        # Add pagination info for non-scan queries
+        if not scan_all:
+            result["pagination"] = {
+                "size": size,
+                "from": from_,
+                "total": opensearch_total,
+                "has_more": opensearch_total > (from_ + len(results)),
+            }
+
+        return result
+
+    def _docs_match(self, doc1: Dict[str, Any], doc2: Dict[str, Any]) -> bool:
+        """Check if two documents are the same (accounting for mutations).
+
+        This is a simple implementation - in production you'd want something more robust.
+        """
+        # If they have the same _id, they match
+        if "_id" in doc1 and "_id" in doc2 and doc1["_id"] == doc2["_id"]:
+            return True
+
+        # Otherwise do a simple comparison of a few key fields
+        # This is imperfect but works for most cases
+        key_fields = ["id", "name", "hostname", "@timestamp"]
+        for field in key_fields:
+            if field in doc1 and field in doc2 and doc1[field] == doc2[field]:
+                return True
+
+        return False
+
+    def _extract_grouped_buckets(  # noqa: C901
+        self,
+        aggs_response: Dict[str, Any],
+        group_by_fields: List[str],
+        aggregations: List[Dict[str, Any]],
+        stats_ast: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        """Extract buckets from grouped aggregation response.
+
+        Args:
+            aggs_response: OpenSearch aggregations response
+            group_by_fields: List of fields used for grouping
+            aggregations: List of aggregation specifications
+            stats_ast: The stats AST for reference
+
+        Returns:
+            List of bucket dictionaries with group keys and aggregation values
+        """
+        buckets = []
+
+        # For single-level grouping
+        if len(group_by_fields) == 1:
+            field = group_by_fields[0]
+            # Look for the terms aggregation with the group field name
+            terms_agg_name = f"group_by_{field}"
+
+            # The aggregation might be named differently, check for it
+            # OpenSearch stats translator uses the field name directly
+            if field in aggs_response:
+                buckets_data = aggs_response[field].get("buckets", [])
+            elif terms_agg_name in aggs_response:
+                buckets_data = aggs_response[terms_agg_name].get("buckets", [])
+            else:
+                # Try to find any terms aggregation
+                for _key, value in aggs_response.items():
+                    if isinstance(value, dict) and "buckets" in value:
+                        buckets_data = value["buckets"]
+                        break
+                else:
+                    buckets_data = []
+
+            # Process each bucket
+            for bucket in buckets_data:
+                bucket_result = {field: bucket.get("key")}
+
+                # Extract aggregation values
+                for i, agg in enumerate(aggregations):
+                    func = agg.get("function", "")
+                    field_name = agg.get("field", "*")
+                    alias = agg.get("alias") or f"{func}_{field_name}_{i}"
+
+                    # Map function names to expected output names
+                    output_key = func
+                    if func == "avg":
+                        output_key = "average"
+                    elif func == "unique_count":
+                        output_key = "distinct_count"
+
+                    if alias in bucket:
+                        agg_value = bucket[alias]
+                        # Extract the actual value
+                        if isinstance(agg_value, dict) and "value" in agg_value:
+                            bucket_result[output_key] = agg_value["value"]
+                        else:
+                            bucket_result[output_key] = agg_value
+                    else:
+                        # Try without index suffix for first aggregation
+                        simple_alias = f"{func}_{field_name}"
+                        if simple_alias in bucket:
+                            agg_value = bucket[simple_alias]
+                            if isinstance(agg_value, dict) and "value" in agg_value:
+                                bucket_result[output_key] = agg_value["value"]
+                            else:
+                                bucket_result[output_key] = agg_value
+
+                buckets.append(bucket_result)
+
+        else:
+            # Multi-level grouping - need to traverse nested structure
+            # Start with the outermost grouping
+            current_agg = aggs_response
+
+            # Find the first group_by aggregation
+            for field in group_by_fields:
+                group_key = f"group_by_{field}"
+                if group_key in current_agg:
+                    current_agg = current_agg[group_key]
+                    break
+                elif field in current_agg:
+                    current_agg = current_agg[field]
+                    break
+
+            # Process nested buckets recursively
+            if "buckets" in current_agg:
+                buckets = self._process_nested_buckets(current_agg["buckets"], group_by_fields, aggregations, 0)
+
+        return buckets
+
+    def _process_nested_buckets(  # noqa: C901
+        self,
+        buckets_data: List[Dict[str, Any]],
+        group_by_fields: List[str],
+        aggregations: List[Dict[str, Any]],
+        level: int,
+    ) -> List[Dict[str, Any]]:
+        """Process nested buckets for multi-level grouping.
+
+        Args:
+            buckets_data: List of bucket data from OpenSearch
+            group_by_fields: List of fields used for grouping
+            aggregations: List of aggregation specifications
+            level: Current nesting level (0-based)
+
+        Returns:
+            Flattened list of bucket results
+        """
+        results = []
+
+        for bucket in buckets_data:
+            # Get the key for this level
+            field_name = group_by_fields[level]
+            bucket_key = {field_name: bucket.get("key")}
+
+            # Check if there are more levels
+            if level + 1 < len(group_by_fields):
+                # Look for the next level's aggregation
+                next_field = group_by_fields[level + 1]
+                next_group_key = f"group_by_{next_field}"
+
+                if next_group_key in bucket and "buckets" in bucket[next_group_key]:
+                    # Recursively process nested buckets
+                    nested_results = self._process_nested_buckets(
+                        bucket[next_group_key]["buckets"], group_by_fields, aggregations, level + 1
+                    )
+
+                    # Merge current key with nested results
+                    for nested in nested_results:
+                        merged = bucket_key.copy()
+                        merged.update(nested)
+                        results.append(merged)
+            else:
+                # This is the innermost level - extract aggregation values
+                result = bucket_key.copy()
+
+                # Extract aggregation values
+                for i, agg in enumerate(aggregations):
+                    func = agg.get("function", "")
+                    field_name = agg.get("field", "*")
+                    alias = agg.get("alias") or f"{func}_{field_name}_{i}"
+
+                    # Map function names to expected output names
+                    output_key = func
+                    if func == "avg":
+                        output_key = "average"
+                    elif func == "unique_count":
+                        output_key = "distinct_count"
+
+                    if alias in bucket:
+                        agg_value = bucket[alias]
+                        # Extract the actual value
+                        if isinstance(agg_value, dict) and "value" in agg_value:
+                            result[output_key] = agg_value["value"]
+                        else:
+                            result[output_key] = agg_value
+                    else:
+                        # Try without index suffix for first aggregation
+                        simple_alias = f"{func}_{field_name}"
+                        if simple_alias in bucket:
+                            agg_value = bucket[simple_alias]
+                            if isinstance(agg_value, dict) and "value" in agg_value:
+                                result[output_key] = agg_value["value"]
+                            else:
+                                result[output_key] = agg_value
+
+                results.append(result)
+
+        return results
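
For context, a minimal usage sketch of the module above (not part of the published wheel): it assumes TQLParser can be constructed without arguments and that empty mapping dicts are acceptable; the TQL query string, index name, and time range are illustrative only.

# Hypothetical usage sketch -- assumptions noted in the comments below.
from tql.parser import TQLParser
from tql.core_components.opensearch_operations import OpenSearchOperations

# Assumes TQLParser() needs no arguments and empty mappings are valid.
ops = OpenSearchOperations(parser=TQLParser(), field_mappings={}, enhanced_mappings={})

# Translate only: produces the OpenSearch DSL body without contacting a cluster.
print(ops.to_opensearch('hostname = "web-01"'))  # query syntax is illustrative

# Translate and execute: connection details come from the OPENSEARCH_* environment
# variables unless a pre-built client is passed via the `client` argument.
result = ops.execute_opensearch(
    'hostname = "web-01"',
    index="logs-*",
    size=100,
    time_range={"gte": "now-1h", "lte": "now"},
)
print(result["total"], result["health_status"])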