tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/METADATA +24 -1
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/RECORD +27 -27
- tql/core.py +225 -54
- tql/core_components/opensearch_operations.py +415 -99
- tql/core_components/stats_operations.py +11 -1
- tql/evaluator.py +39 -2
- tql/evaluator_components/special_expressions.py +25 -6
- tql/evaluator_components/value_comparison.py +31 -3
- tql/mutator_analyzer.py +640 -242
- tql/mutators/__init__.py +5 -1
- tql/mutators/dns.py +76 -53
- tql/mutators/security.py +101 -100
- tql/mutators/string.py +74 -0
- tql/opensearch_components/field_mapping.py +9 -3
- tql/opensearch_components/lucene_converter.py +12 -0
- tql/opensearch_components/query_converter.py +134 -25
- tql/opensearch_mappings.py +2 -2
- tql/opensearch_stats.py +170 -39
- tql/parser.py +92 -37
- tql/parser_components/ast_builder.py +37 -1
- tql/parser_components/field_extractor.py +9 -1
- tql/parser_components/grammar.py +32 -8
- tql/post_processor.py +489 -31
- tql/stats_evaluator.py +170 -12
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/LICENSE +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/WHEEL +0 -0
- {tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/entry_points.txt +0 -0
@@ -42,6 +42,18 @@ class QueryConverter:
             return self._convert_geo_expr(node)
         elif node_type == "nslookup_expr":
             return self._convert_nslookup_expr(node)
+        elif node_type == "query_with_stats":
+            # For query_with_stats, only convert the filter part
+            # The stats part is handled by the stats engine
+            filter_node = node.get("filter")
+            if filter_node:
+                return self.convert_node(filter_node)
+            else:
+                return {"match_all": {}}
+        elif node_type == "stats_expr":
+            # Pure stats queries match all documents
+            # The aggregations are handled by the stats engine
+            return {"match_all": {}}

         raise TQLValidationError(f"Unknown node type: {node}")

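For context, a minimal standalone sketch of the dispatch added above; the node dictionaries and the `convert_filter` callback are illustrative stand-ins, not the package's actual API:

```python
# Illustrative only: mirrors the new branch logic, not the real QueryConverter.
def convert_stats_node(node, convert_filter):
    node_type = node["type"]
    if node_type == "query_with_stats":
        # Only the filter portion becomes an OpenSearch query; stats are handled separately.
        filter_node = node.get("filter")
        return convert_filter(filter_node) if filter_node else {"match_all": {}}
    if node_type == "stats_expr":
        # A pure stats query matches all documents.
        return {"match_all": {}}
    raise ValueError(f"Unknown node type: {node_type}")

# A stats-only query yields a match_all filter:
print(convert_stats_node({"type": "stats_expr"}, lambda n: {"term": {"f": 1}}))
# {'match_all': {}}
```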
@@ -87,7 +99,8 @@ class QueryConverter:
         """Convert a comparison operation to OpenSearch query."""
         field_name = node["field"]
         operator = node["operator"]
-        value
+        # For exists/not_exists operators, value is None
+        value = node.get("value") if operator not in ["exists", "not_exists"] else None
         field_mutators = node.get("field_mutators", [])

         # Check if mutators change the field type
@@ -127,10 +140,16 @@ class QueryConverter:
         # Note: ALL and NOT_ALL operators are handled with script queries and don't need post-processing
         requires_post_processing = node.get("post_process_value", False) or has_type_changing_mutators

-        if
-
-
-
+        # Also check if we have transform mutators with filtering operators
+        # Transform mutators change the field value, so we need to use exists query
+        has_transform_mutators_with_filter = node.get("has_transform_mutators_with_filter", False)
+
+        # Also check field_mutators directly in case the flag wasn't set
+        if (
+            not has_transform_mutators_with_filter
+            and field_mutators
+            and operator
+            in [
                 "eq",
                 "=",
                 "ne",
@@ -151,10 +170,55 @@ class QueryConverter:
                 "lte",
                 "between",
                 "not_between",
-            ]
-
-
-
+            ]
+        ):
+            # Check if any of the mutators are transform mutators
+            TRANSFORM_MUTATORS = {
+                "lowercase",
+                "uppercase",
+                "trim",
+                "replace",
+                "refang",
+                "defang",
+                "b64encode",
+                "b64decode",
+                "urldecode",
+            }
+            for mutator in field_mutators:
+                if mutator.get("name", "").lower() in TRANSFORM_MUTATORS:
+                    has_transform_mutators_with_filter = True
+                    break
+
+        if requires_post_processing or has_transform_mutators_with_filter:
+            # For value mutators, type-changing field mutators, or transform mutators that require post-processing, use exists query
+            # But NOT for field mutators like any/all/none - those should not affect the query
+            if node.get("value_mutators") or has_type_changing_mutators or has_transform_mutators_with_filter:
+                # Only for these mutators do we need to broaden the search
+                if operator in [
+                    "eq",
+                    "=",
+                    "ne",
+                    "!=",
+                    "contains",
+                    "not_contains",
+                    "startswith",
+                    "endswith",
+                    "not_startswith",
+                    "not_endswith",
+                    ">",
+                    ">=",
+                    "<",
+                    "<=",
+                    "gt",
+                    "gte",
+                    "lt",
+                    "lte",
+                    "between",
+                    "not_between",
+                ]:
+                    # For these operators with mutators, use exists query to get all docs with the field
+                    # The actual filtering will happen in post-processing
+                    return {"exists": {"field": opensearch_field}}

         # Handle special wildcard conversion for keyword fields
         if use_wildcard and operator == "contains":
@@ -162,23 +226,38 @@ class QueryConverter:

         # Convert operator to OpenSearch query
         if operator in ["eq", "="]:
-            #
-
-
-
-
-
+            # For fields with mappings, use the optimized query type
+            if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+                # Check if we're using a text field
+                is_text_field = self._is_text_field(field_name, opensearch_field)
+                if is_text_field:
+                    return {"match": {opensearch_field: value}}
+                else:
+                    return {"term": {opensearch_field: value}}
             else:
-
+                # For unmapped fields, use match_phrase for strings (safer default)
+                # This ensures compatibility with both text and keyword fields
+                if isinstance(value, str):
+                    return {"match_phrase": {opensearch_field: value}}
+                else:
+                    # For non-string values (numbers, booleans), use term query
+                    return {"term": {opensearch_field: value}}
         elif operator in ["ne", "!="]:
-            #
-
-
-
-
-
+            # For fields with mappings, use the optimized query type
+            if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+                # Check if we're using a text field
+                is_text_field = self._is_text_field(field_name, opensearch_field)
+                if is_text_field:
+                    return {"bool": {"must_not": {"match": {opensearch_field: value}}}}
+                else:
+                    return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
             else:
-
+                # For unmapped fields, use match_phrase for strings (safer default)
+                if isinstance(value, str):
+                    return {"bool": {"must_not": {"match_phrase": {opensearch_field: value}}}}
+                else:
+                    # For non-string values (numbers, booleans), use term query
+                    return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
         elif operator in ["gt", ">"]:
             return {"range": {opensearch_field: {"gt": value}}}
         elif operator in ["gte", ">="]:
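To make the branching above concrete, here is a rough sketch of the query shapes the `eq` branch now produces; the field names and mapping flags are hypothetical, and the real converter resolves mappings and text-field detection itself:

```python
# Illustrative sketch of the eq branch, assuming the caller already knows
# whether the field is mapped and whether it resolves to a text field.
def eq_query(field, value, mapped, is_text):
    if mapped:
        # Mapped fields get the optimized query type: match for text, term otherwise.
        return {"match" if is_text else "term": {field: value}}
    if isinstance(value, str):
        # Unmapped string values fall back to match_phrase (works for text and keyword).
        return {"match_phrase": {field: value}}
    # Non-string values (numbers, booleans) use a term query.
    return {"term": {field: value}}

print(eq_query("message", "error", mapped=True, is_text=True))   # {'match': {'message': 'error'}}
print(eq_query("http.status", 404, mapped=False, is_text=False)) # {'term': {'http.status': 404}}
```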
@@ -434,8 +513,25 @@ class QueryConverter:
         right_query = self.convert_node(node["right"])

         if operator == "and":
-
+            # Collect all must clauses, flattening where appropriate
+            must_clauses = []
+
+            # Helper function to extract clauses
+            def extract_must_clauses(query):
+                if isinstance(query, dict) and "bool" in query:
+                    bool_query = query["bool"]
+                    # If it only has must clauses, extract them
+                    if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
+                        return bool_query["must"]
+                return [query]
+
+            # Extract and flatten must clauses
+            must_clauses.extend(extract_must_clauses(left_query))
+            must_clauses.extend(extract_must_clauses(right_query))
+
+            return {"bool": {"must": must_clauses}}
         elif operator == "or":
+            # OR still needs should clause
             return {"bool": {"should": [left_query, right_query], "minimum_should_match": 1}}
         else:
             raise TQLUnsupportedOperationError(f"Logical operator '{operator}' not supported for OpenSearch")
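A small self-contained sketch of the flattening behaviour introduced above: chained AND expressions now collapse into a single bool/must list instead of nesting one bool query per operator (the query contents below are made up):

```python
# Mirrors the new AND handling: must-only bool queries are unwrapped and merged.
def and_combine(left, right):
    def extract(query):
        if isinstance(query, dict) and "bool" in query:
            bool_query = query["bool"]
            if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
                return bool_query["must"]
        return [query]
    return {"bool": {"must": extract(left) + extract(right)}}

left = {"bool": {"must": [{"term": {"a": 1}}, {"term": {"b": 2}}]}}
right = {"term": {"c": 3}}
print(and_combine(left, right))
# {'bool': {'must': [{'term': {'a': 1}}, {'term': {'b': 2}}, {'term': {'c': 3}}]}}
```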
@@ -720,15 +816,28 @@ class QueryConverter:
             value: Value to convert

         Returns:
-            Converted value (bool, None, or original)
+            Converted value (bool, None, numeric, or original)
         """
         if isinstance(value, str):
+            # Check for boolean values
             if value.lower() == "true":
                 return True
             elif value.lower() == "false":
                 return False
             elif value.lower() == "null":
                 return None
+            # Check if it's a numeric string
+            elif value.isdigit() or (value.startswith("-") and value[1:].isdigit()):
+                # Convert to integer
+                return int(value)
+            else:
+                # Try to parse as float
+                try:
+                    # Check if it has a decimal point
+                    if "." in value:
+                        return float(value)
+                except ValueError:
+                    pass
         return value

     def _is_text_field(self, field_name: str, opensearch_field: str) -> bool:
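The value coercion above can be exercised in isolation; a sketch of the same logic as a standalone function (not the class method) shows how string inputs are now normalized:

```python
# Standalone copy of the coercion rules: booleans/null first, then ints, then floats.
def convert_value(value):
    if isinstance(value, str):
        if value.lower() == "true":
            return True
        elif value.lower() == "false":
            return False
        elif value.lower() == "null":
            return None
        elif value.isdigit() or (value.startswith("-") and value[1:].isdigit()):
            return int(value)
        else:
            try:
                if "." in value:
                    return float(value)
            except ValueError:
                pass
    return value

print([convert_value(v) for v in ["true", "null", "42", "-7", "3.14", "abc"]])
# [True, None, 42, -7, 3.14, 'abc']
```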
tql/opensearch_mappings.py CHANGED
@@ -52,7 +52,7 @@ def extract_field_mappings_from_opensearch(
     try:
         # Extract field names from the TQL query
         field_names = tql_instance.extract_fields(tql_query)
-        logger.
+        logger.debug(f"Extracted {len(field_names)} fields from TQL query: {field_names}")

         if not field_names:
             logger.warning("No fields found in TQL query")
@@ -68,7 +68,7 @@ def extract_field_mappings_from_opensearch(
         # Extract and convert mappings to TQL format
         tql_mappings = _convert_opensearch_mappings_to_tql_format(mapping_response, field_names)

-        logger.
+        logger.debug(f"Successfully converted mappings for {len(tql_mappings)} fields")
         return tql_mappings

     except Exception as e:
tql/opensearch_stats.py CHANGED
@@ -4,6 +4,7 @@ This module translates TQL stats queries to OpenSearch aggregation DSL.
 """

 from typing import Any, Dict, List, Optional, Union
+import json

 from .exceptions import TQLError

@@ -84,6 +85,9 @@ class OpenSearchStatsTranslator:
         if group_by_fields:
             # Build nested terms aggregations for grouping
             aggs_dsl = self._build_grouped_aggregations(aggregations, group_by_fields, field_mappings)
+            print(
+                f"\n=== OpenSearch Aggregation Query ===\nGroup by: {group_by_fields}\nAggregation DSL: {json.dumps(aggs_dsl, indent=2)}\n"
+            )
         else:
             # Simple aggregations without grouping
             aggs_dsl = self._build_simple_aggregations(aggregations, field_mappings)
@@ -163,14 +167,14 @@ class OpenSearchStatsTranslator:
     def _build_grouped_aggregations(
         self,
         aggregations: List[Dict[str, Any]],
-        group_by_fields: List[
+        group_by_fields: List[Any],
         field_mappings: Optional[Dict[str, str]] = None,
     ) -> Dict[str, Any]:
         """Build aggregations with grouping.

         Args:
             aggregations: List of aggregation specifications
-            group_by_fields: Fields to group by
+            group_by_fields: Fields to group by (can be strings or dicts with bucket_size)
             field_mappings: Optional field mappings

         Returns:
@@ -193,18 +197,38 @@ class OpenSearchStatsTranslator:
                 size = agg.get("limit", 10)
                 break

+        # Normalize group_by_fields to handle both old (string) and new (dict) formats
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                # Old format: just field name, use default bucket size
+                normalized_fields.append({"field": field, "bucket_size": 10})
+            elif isinstance(field, dict):
+                # New format: {"field": "name", "bucket_size": N}
+                bucket_size = field.get("bucket_size", 10) if field.get("bucket_size") is not None else 10
+                normalized_fields.append({"field": field["field"], "bucket_size": bucket_size})
+            else:
+                # Shouldn't happen but handle gracefully
+                normalized_fields.append({"field": str(field), "bucket_size": 10})
+
         # Build nested terms aggregations for each group_by field
         current_aggs = inner_aggs

         # Process group_by fields in reverse order to build proper nesting
-        for
-
+        for i, field_spec in enumerate(reversed(normalized_fields)):
+            field_name = field_spec["field"]
+            bucket_size = field_spec["bucket_size"]
+
+            # Always respect user-specified bucket sizes
+            # The user has explicitly set limits with "top N" syntax
+
+            terms_agg = {"terms": {"field": field_name, "size": bucket_size}}

             # Add ordering if this is the outermost aggregation and we have order field
-            if
+            if field_name == normalized_fields[0]["field"] and order_field:
                 # For nested aggregations, we need the full path
                 order_path = order_field
-                if len(
+                if len(normalized_fields) > 1:
                     # Multi-level grouping requires special handling
                     # OpenSearch doesn't support ordering by sub-aggregations in nested terms
                     # We'll need to handle this in post-processing
@@ -217,7 +241,7 @@ class OpenSearchStatsTranslator:
                 terms_agg["aggs"] = current_aggs

             # Wrap for next level
-            current_aggs = {f"group_by_{
+            current_aggs = {f"group_by_{field_name}": terms_agg}

         return current_aggs

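A hedged illustration of what the normalization plus reverse-order nesting above produces; the field names, bucket sizes, and the inner metric aggregation are hypothetical:

```python
# Simplified re-implementation of the nesting loop: the first group_by field
# becomes the outermost terms aggregation, each level sized by its bucket_size.
def build_grouped(normalized_fields, inner_aggs):
    current = inner_aggs
    for spec in reversed(normalized_fields):
        terms_agg = {"terms": {"field": spec["field"], "size": spec["bucket_size"]}}
        if current:
            terms_agg["aggs"] = current
        current = {f"group_by_{spec['field']}": terms_agg}
    return current

fields = [{"field": "src_ip", "bucket_size": 5}, {"field": "dest_port", "bucket_size": 3}]
inner = {"event_count": {"value_count": {"field": "_id"}}}
print(build_grouped(fields, inner))
# {'group_by_src_ip': {'terms': {'field': 'src_ip', 'size': 5},
#   'aggs': {'group_by_dest_port': {'terms': {'field': 'dest_port', 'size': 3},
#     'aggs': {'event_count': {'value_count': {'field': '_id'}}}}}}
```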
@@ -314,14 +338,14 @@ class OpenSearchStatsTranslator:
         return {"type": "multiple_aggregations", "results": results}

     def _transform_grouped_response(
-        self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[
+        self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
     ) -> Dict[str, Any]:
         """Transform grouped aggregation response.

         Args:
             response: OpenSearch response
             aggregations: Aggregation specifications
-            group_by_fields: Grouping fields
+            group_by_fields: Grouping fields (can be strings or dicts with bucket_size)

         Returns:
             Transformed response
@@ -329,61 +353,78 @@ class OpenSearchStatsTranslator:
         # Navigate to the grouped results
         aggs_data = response.get("aggregations", {})

-        #
-
+        # Normalize group_by_fields to handle both old (string) and new (dict) formats
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                # Old format: just field name
+                normalized_fields.append({"field": field, "bucket_size": None})
+            elif isinstance(field, dict):
+                # New format: {"field": "name", "bucket_size": N}
+                normalized_fields.append(field)
+            else:
+                # Shouldn't happen but handle gracefully
+                normalized_fields.append({"field": str(field), "bucket_size": None})
+
+        # Get the outermost grouping - use the field name from the normalized structure
+        first_field_name = normalized_fields[0]["field"]
+        first_group_key = f"group_by_{first_field_name}"
         grouped_data = aggs_data.get(first_group_key, {})

         # Extract buckets
         buckets = grouped_data.get("buckets", [])

-        # Transform buckets
+        # Transform buckets - handle multi-level grouping recursively
         results = []
+        print(
+            f"\n=== OpenSearch Response Debug ===\nTotal buckets at top level: {len(buckets)}\nGroup by fields: {group_by_fields}\n"
+        )
         for bucket in buckets:
-            result = self.
+            result = self._transform_bucket_recursive(bucket, aggregations, normalized_fields, 0)
             if result:
-
+                # Handle the case where recursive transformation returns a list (multi-level)
+                if isinstance(result, list):
+                    results.extend(result)
+                else:
+                    results.append(result)

-        return {"type": "
+        return {"type": "stats_grouped", "group_by": group_by_fields, "results": results}

     def _transform_bucket(
-        self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[
+        self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any], level: int
     ) -> Optional[Dict[str, Any]]:
         """Transform a single bucket from grouped aggregation.

         Args:
             bucket: OpenSearch bucket
             aggregations: Aggregation specifications
-            group_by_fields: Grouping fields
+            group_by_fields: Grouping fields (can be strings or dicts with bucket_size)
             level: Current nesting level

         Returns:
             Transformed bucket or None
         """
+        # Normalize group_by_fields to handle both old (string) and new (dict) formats
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                # Old format: just field name
+                normalized_fields.append({"field": field, "bucket_size": None})
+            elif isinstance(field, dict):
+                # New format: {"field": "name", "bucket_size": N}
+                normalized_fields.append(field)
+            else:
+                # Shouldn't happen but handle gracefully
+                normalized_fields.append({"field": str(field), "bucket_size": None})
+
         result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}

         # Add current level key
-        if level < len(
-
-            result["key"][
-
-        #
-        if level + 1 < len(group_by_fields):
-            # Navigate to next level
-            next_field = group_by_fields[level + 1]
-            next_group_key = f"group_by_{next_field}"
-
-            if next_group_key in bucket:
-                # This is a nested grouping, we need to aggregate the sub-buckets
-                # For now, we'll just take the first sub-bucket
-                # TODO: Handle proper multi-level grouping
-                sub_buckets = bucket[next_group_key].get("buckets", [])
-                if sub_buckets:
-                    sub_result = self._transform_bucket(sub_buckets[0], aggregations, group_by_fields, level + 1)
-                    if sub_result:
-                        # Merge keys
-                        result["key"].update(sub_result["key"])
-
-        # Extract aggregation values
+        if level < len(normalized_fields):
+            field_name = normalized_fields[level]["field"]
+            result["key"][field_name] = bucket.get("key")
+
+        # Extract aggregation values at the innermost level
         if len(aggregations) == 1:
             # Single aggregation
             agg = aggregations[0]
@@ -402,6 +443,96 @@ class OpenSearchStatsTranslator:

         return result

+    def _transform_bucket_recursive(
+        self,
+        bucket: Dict[str, Any],
+        aggregations: List[Dict[str, Any]],
+        normalized_fields: List[Dict[str, Any]],
+        level: int,
+    ) -> Optional[Dict[str, Any]]:
+        """Transform a bucket recursively for multi-level grouping.
+
+        Args:
+            bucket: OpenSearch bucket
+            aggregations: Aggregation specifications
+            normalized_fields: Normalized group by fields with field names and bucket sizes
+            level: Current nesting level
+
+        Returns:
+            Transformed bucket or None
+        """
+        result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}
+
+        # Add current level key
+        if level < len(normalized_fields):
+            field_name = normalized_fields[level]["field"]
+            result["key"][field_name] = bucket.get("key")
+
+        # Check if we're at the deepest level (have aggregation values)
+        is_leaf_level = True
+        for agg in aggregations:
+            alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+            if alias in bucket:
+                is_leaf_level = True
+                break
+
+        # Check if there are sub-buckets (nested grouping)
+        next_level_field = None
+        if level + 1 < len(normalized_fields):
+            next_level_field = f"group_by_{normalized_fields[level + 1]['field']}"
+            if next_level_field in bucket and "buckets" in bucket[next_level_field]:
+                is_leaf_level = False
+
+        if is_leaf_level:
+            # Extract aggregation values at the leaf level
+            if len(aggregations) == 1:
+                # Single aggregation
+                agg = aggregations[0]
+                alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+                value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+                agg_key = agg.get("alias") or agg["function"]
+                result[agg_key] = value
+            else:
+                # Multiple aggregations
+                result["aggregations"] = {}
+                for i, agg in enumerate(aggregations):
+                    alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_{i}"
+                    value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+                    agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                    result["aggregations"][agg_key] = value
+        else:
+            # Handle nested buckets
+            sub_buckets = bucket[next_level_field].get("buckets", [])
+            sub_results = []
+            print(
+                f" Level {level}: Processing {len(sub_buckets)} sub-buckets for field {normalized_fields[level + 1]['field']}"
+            )
+            for sub_bucket in sub_buckets:
+                sub_result_data = self._transform_bucket_recursive(
+                    sub_bucket, aggregations, normalized_fields, level + 1
+                )
+                if sub_result_data:
+                    # Handle the case where sub_result_data might be a list (deeper nesting)
+                    if isinstance(sub_result_data, list):
+                        for sub_item in sub_result_data:
+                            # Merge the keys from current level with sub-level keys
+                            merged_key = dict(result["key"])
+                            merged_key.update(sub_item["key"])
+                            sub_item["key"] = merged_key
+                            sub_results.append(sub_item)
+                    else:
+                        # Merge the keys from different levels
+                        merged_key = dict(result["key"])
+                        merged_key.update(sub_result_data["key"])
+                        sub_result_data["key"] = merged_key
+                        sub_results.append(sub_result_data)
+
+            # For multi-level grouping, we return the sub-results as separate entries
+            # This flattens the nested structure into a list of results
+            return sub_results if sub_results else None
+
+        return result
+
     def _extract_aggregation_value(  # noqa: C901
         self, agg_result: Dict[str, Any], function: str
     ) -> Union[int, float, Dict[str, Any], List[Any], None]:
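Finally, a rough sketch (ignoring metric aggregations and the debug output) of how the recursive transform flattens a nested group_by response into one row per leaf bucket with merged keys; the bucket data below is invented:

```python
# Simplified flattener: recurse into group_by_<field> sub-buckets, merging keys per level.
def flatten(bucket, field_names, level=0):
    key = {field_names[level]: bucket.get("key")}
    next_group = f"group_by_{field_names[level + 1]}" if level + 1 < len(field_names) else None
    if next_group and next_group in bucket:
        rows = []
        for sub in bucket[next_group].get("buckets", []):
            for row in flatten(sub, field_names, level + 1):
                row["key"] = {**key, **row["key"]}
                rows.append(row)
        return rows
    return [{"key": key, "doc_count": bucket.get("doc_count", 0)}]

bucket = {
    "key": "10.0.0.1",
    "doc_count": 7,
    "group_by_dest_port": {
        "buckets": [
            {"key": 443, "doc_count": 5},
            {"key": 80, "doc_count": 2},
        ]
    },
}
print(flatten(bucket, ["src_ip", "dest_port"]))
# [{'key': {'src_ip': '10.0.0.1', 'dest_port': 443}, 'doc_count': 5},
#  {'key': {'src_ip': '10.0.0.1', 'dest_port': 80}, 'doc_count': 2}]
```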