tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,18 @@ class QueryConverter:
  return self._convert_geo_expr(node)
  elif node_type == "nslookup_expr":
  return self._convert_nslookup_expr(node)
+ elif node_type == "query_with_stats":
+ # For query_with_stats, only convert the filter part
+ # The stats part is handled by the stats engine
+ filter_node = node.get("filter")
+ if filter_node:
+ return self.convert_node(filter_node)
+ else:
+ return {"match_all": {}}
+ elif node_type == "stats_expr":
+ # Pure stats queries match all documents
+ # The aggregations are handled by the stats engine
+ return {"match_all": {}}

  raise TQLValidationError(f"Unknown node type: {node}")
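Illustrative sketch of the two new branches above (not output captured from the package; the node keys "type", "filter", and "aggregations" are assumptions for the example): a pure stats node converts to match_all, while a query_with_stats node only has its filter part converted and the aggregations are built separately by the stats engine.

    # Hypothetical parsed nodes, shown only to illustrate the new branches.
    stats_only = {"type": "stats_expr", "aggregations": [{"function": "count", "field": "*"}]}
    with_filter = {
        "type": "query_with_stats",
        "filter": {"field": "status", "operator": "eq", "value": "200"},
    }
    # converter.convert_node(stats_only)  -> {"match_all": {}}
    # converter.convert_node(with_filter) -> whatever the filter alone converts to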
@@ -87,7 +99,8 @@ class QueryConverter:
  """Convert a comparison operation to OpenSearch query."""
  field_name = node["field"]
  operator = node["operator"]
- value = node["value"]
+ # For exists/not_exists operators, value is None
+ value = node.get("value") if operator not in ["exists", "not_exists"] else None
  field_mutators = node.get("field_mutators", [])

  # Check if mutators change the field type
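A minimal runnable sketch of why this guard matters (the node dicts are invented for the example, but the "field"/"operator"/"value" keys match the code above): an exists comparison carries no value, so node["value"] would raise KeyError.

    node_eq = {"field": "src_ip", "operator": "eq", "value": "10.0.0.1"}
    node_exists = {"field": "src_ip", "operator": "exists"}  # no "value" key at all

    for node in (node_eq, node_exists):
        operator = node["operator"]
        value = node.get("value") if operator not in ["exists", "not_exists"] else None
        print(operator, value)  # -> "eq 10.0.0.1", then "exists None"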
@@ -127,10 +140,16 @@ class QueryConverter:
  # Note: ALL and NOT_ALL operators are handled with script queries and don't need post-processing
  requires_post_processing = node.get("post_process_value", False) or has_type_changing_mutators

- if requires_post_processing:
- # For operations that require post-processing, we need to query more broadly
- # to ensure we get all potentially matching documents
- if operator in [
+ # Also check if we have transform mutators with filtering operators
+ # Transform mutators change the field value, so we need to use exists query
+ has_transform_mutators_with_filter = node.get("has_transform_mutators_with_filter", False)
+
+ # Also check field_mutators directly in case the flag wasn't set
+ if (
+ not has_transform_mutators_with_filter
+ and field_mutators
+ and operator
+ in [
  "eq",
  "=",
  "ne",
@@ -151,10 +170,55 @@ class QueryConverter:
  "lte",
  "between",
  "not_between",
- ]:
- # For these operators, use exists query to get all docs with the field
- # The actual filtering will happen in post-processing
- return {"exists": {"field": opensearch_field}}
+ ]
+ ):
+ # Check if any of the mutators are transform mutators
+ TRANSFORM_MUTATORS = {
+ "lowercase",
+ "uppercase",
+ "trim",
+ "replace",
+ "refang",
+ "defang",
+ "b64encode",
+ "b64decode",
+ "urldecode",
+ }
+ for mutator in field_mutators:
+ if mutator.get("name", "").lower() in TRANSFORM_MUTATORS:
+ has_transform_mutators_with_filter = True
+ break
+
+ if requires_post_processing or has_transform_mutators_with_filter:
+ # For value mutators, type-changing field mutators, or transform mutators that require post-processing, use exists query
+ # But NOT for field mutators like any/all/none - those should not affect the query
+ if node.get("value_mutators") or has_type_changing_mutators or has_transform_mutators_with_filter:
+ # Only for these mutators do we need to broaden the search
+ if operator in [
+ "eq",
+ "=",
+ "ne",
+ "!=",
+ "contains",
+ "not_contains",
+ "startswith",
+ "endswith",
+ "not_startswith",
+ "not_endswith",
+ ">",
+ ">=",
+ "<",
+ "<=",
+ "gt",
+ "gte",
+ "lt",
+ "lte",
+ "between",
+ "not_between",
+ ]:
+ # For these operators with mutators, use exists query to get all docs with the field
+ # The actual filtering will happen in post-processing
+ return {"exists": {"field": opensearch_field}}

  # Handle special wildcard conversion for keyword fields
  if use_wildcard and operator == "contains":
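In short, when a filtering operator is paired with a transform mutator (lowercase, refang, b64decode, ...), the converted OpenSearch query only asserts that the field exists, and the real comparison is deferred to post-processing. A hedged sketch of that shape (field name and TQL phrasing invented for the example):

    # e.g. a TQL filter that lowercases a field before comparing it (syntax illustrative)
    # The broadened query returned by the branch above:
    broadened = {"exists": {"field": "user.name"}}
    # The eq-after-lowercase comparison is then re-applied in post-processing,
    # because OpenSearch cannot run the mutator server-side in this query.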
@@ -162,23 +226,38 @@ class QueryConverter:

  # Convert operator to OpenSearch query
  if operator in ["eq", "="]:
- # Check if we're using a text field
- is_text_field = self._is_text_field(field_name, opensearch_field)
-
- # Use match query for text fields, term for others
- if is_text_field:
- return {"match": {opensearch_field: value}}
+ # For fields with mappings, use the optimized query type
+ if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+ # Check if we're using a text field
+ is_text_field = self._is_text_field(field_name, opensearch_field)
+ if is_text_field:
+ return {"match": {opensearch_field: value}}
+ else:
+ return {"term": {opensearch_field: value}}
  else:
- return {"term": {opensearch_field: value}}
+ # For unmapped fields, use match_phrase for strings (safer default)
+ # This ensures compatibility with both text and keyword fields
+ if isinstance(value, str):
+ return {"match_phrase": {opensearch_field: value}}
+ else:
+ # For non-string values (numbers, booleans), use term query
+ return {"term": {opensearch_field: value}}
  elif operator in ["ne", "!="]:
- # Check if we're using a text field
- is_text_field = self._is_text_field(field_name, opensearch_field)
-
- # Use match query for text fields, term for others
- if is_text_field:
- return {"bool": {"must_not": {"match": {opensearch_field: value}}}}
+ # For fields with mappings, use the optimized query type
+ if field_name in self.intelligent_mappings or field_name in self.simple_mappings:
+ # Check if we're using a text field
+ is_text_field = self._is_text_field(field_name, opensearch_field)
+ if is_text_field:
+ return {"bool": {"must_not": {"match": {opensearch_field: value}}}}
+ else:
+ return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
  else:
- return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
+ # For unmapped fields, use match_phrase for strings (safer default)
+ if isinstance(value, str):
+ return {"bool": {"must_not": {"match_phrase": {opensearch_field: value}}}}
+ else:
+ # For non-string values (numbers, booleans), use term query
+ return {"bool": {"must_not": {"term": {opensearch_field: value}}}}
  elif operator in ["gt", ">"]:
  return {"range": {opensearch_field: {"gt": value}}}
  elif operator in ["gte", ">="]:
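A sketch of the query shapes produced by the new mapped/unmapped split for equality (field names and values invented for the example):

    mapped_text     = {"match": {"message": "error"}}        # field in mappings, resolved to a text field
    mapped_keyword  = {"term": {"status.keyword": "error"}}  # field in mappings, non-text field
    unmapped_string = {"match_phrase": {"status": "error"}}  # unmapped field, string value (safer default)
    unmapped_number = {"term": {"retries": 3}}                # unmapped field, non-string value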
@@ -434,8 +513,25 @@ class QueryConverter:
  right_query = self.convert_node(node["right"])

  if operator == "and":
- return {"bool": {"must": [left_query, right_query]}}
+ # Collect all must clauses, flattening where appropriate
+ must_clauses = []
+
+ # Helper function to extract clauses
+ def extract_must_clauses(query):
+ if isinstance(query, dict) and "bool" in query:
+ bool_query = query["bool"]
+ # If it only has must clauses, extract them
+ if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
+ return bool_query["must"]
+ return [query]
+
+ # Extract and flatten must clauses
+ must_clauses.extend(extract_must_clauses(left_query))
+ must_clauses.extend(extract_must_clauses(right_query))
+
+ return {"bool": {"must": must_clauses}}
  elif operator == "or":
+ # OR still needs should clause
  return {"bool": {"should": [left_query, right_query], "minimum_should_match": 1}}
  else:
  raise TQLUnsupportedOperationError(f"Logical operator '{operator}' not supported for OpenSearch")
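A small runnable sketch of the flattening this change introduces: two queries whose bool body contains only a must list are merged into a single must list instead of being nested (the example queries are invented):

    def extract_must_clauses(query):
        # Same idea as the helper above: unwrap a bool query that only has a must list.
        if isinstance(query, dict) and "bool" in query:
            bool_query = query["bool"]
            if set(bool_query.keys()) == {"must"} and isinstance(bool_query["must"], list):
                return bool_query["must"]
        return [query]

    left = {"bool": {"must": [{"term": {"a": 1}}, {"term": {"b": 2}}]}}
    right = {"term": {"c": 3}}
    merged = {"bool": {"must": extract_must_clauses(left) + extract_must_clauses(right)}}
    # merged == {"bool": {"must": [{"term": {"a": 1}}, {"term": {"b": 2}}, {"term": {"c": 3}}]}}

So a chain like A and B and C now yields one bool.must with three clauses rather than nested bool queries.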
@@ -720,15 +816,28 @@ class QueryConverter:
  value: Value to convert

  Returns:
- Converted value (bool, None, or original)
+ Converted value (bool, None, numeric, or original)
  """
  if isinstance(value, str):
+ # Check for boolean values
  if value.lower() == "true":
  return True
  elif value.lower() == "false":
  return False
  elif value.lower() == "null":
  return None
+ # Check if it's a numeric string
+ elif value.isdigit() or (value.startswith("-") and value[1:].isdigit()):
+ # Convert to integer
+ return int(value)
+ else:
+ # Try to parse as float
+ try:
+ # Check if it has a decimal point
+ if "." in value:
+ return float(value)
+ except ValueError:
+ pass
  return value

  def _is_text_field(self, field_name: str, opensearch_field: str) -> bool:
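Based on the branches above, the extended value coercion should behave roughly as follows (illustrative, not captured from the package):

    # "true"  -> True        "false" -> False       "null" -> None
    # "42"    -> 42          "-7"    -> -7
    # "3.14"  -> 3.14        "1e6"   -> "1e6"   (no decimal point, left as a string)
    # "1.2.3" -> "1.2.3"     (float() raises ValueError, which is swallowed; original string returned)
    # "abc"   -> "abc"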
@@ -52,7 +52,7 @@ def extract_field_mappings_from_opensearch(
  try:
  # Extract field names from the TQL query
  field_names = tql_instance.extract_fields(tql_query)
- logger.info(f"Extracted {len(field_names)} fields from TQL query: {field_names}")
+ logger.debug(f"Extracted {len(field_names)} fields from TQL query: {field_names}")

  if not field_names:
  logger.warning("No fields found in TQL query")
@@ -68,7 +68,7 @@ def extract_field_mappings_from_opensearch(
  # Extract and convert mappings to TQL format
  tql_mappings = _convert_opensearch_mappings_to_tql_format(mapping_response, field_names)

- logger.info(f"Successfully converted mappings for {len(tql_mappings)} fields")
+ logger.debug(f"Successfully converted mappings for {len(tql_mappings)} fields")
  return tql_mappings

  except Exception as e:
tql/opensearch_stats.py CHANGED
@@ -4,6 +4,7 @@ This module translates TQL stats queries to OpenSearch aggregation DSL.
  """

  from typing import Any, Dict, List, Optional, Union
+ import json

  from .exceptions import TQLError

@@ -84,6 +85,9 @@ class OpenSearchStatsTranslator:
  if group_by_fields:
  # Build nested terms aggregations for grouping
  aggs_dsl = self._build_grouped_aggregations(aggregations, group_by_fields, field_mappings)
+ print(
+ f"\n=== OpenSearch Aggregation Query ===\nGroup by: {group_by_fields}\nAggregation DSL: {json.dumps(aggs_dsl, indent=2)}\n"
+ )
  else:
  # Simple aggregations without grouping
  aggs_dsl = self._build_simple_aggregations(aggregations, field_mappings)
@@ -163,14 +167,14 @@ class OpenSearchStatsTranslator:
  def _build_grouped_aggregations(
  self,
  aggregations: List[Dict[str, Any]],
- group_by_fields: List[str],
+ group_by_fields: List[Any],
  field_mappings: Optional[Dict[str, str]] = None,
  ) -> Dict[str, Any]:
  """Build aggregations with grouping.

  Args:
  aggregations: List of aggregation specifications
- group_by_fields: Fields to group by
+ group_by_fields: Fields to group by (can be strings or dicts with bucket_size)
  field_mappings: Optional field mappings

  Returns:
@@ -193,18 +197,38 @@ class OpenSearchStatsTranslator:
  size = agg.get("limit", 10)
  break

+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name, use default bucket size
+ normalized_fields.append({"field": field, "bucket_size": 10})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ bucket_size = field.get("bucket_size", 10) if field.get("bucket_size") is not None else 10
+ normalized_fields.append({"field": field["field"], "bucket_size": bucket_size})
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": 10})
+
  # Build nested terms aggregations for each group_by field
  current_aggs = inner_aggs

  # Process group_by fields in reverse order to build proper nesting
- for field in reversed(group_by_fields):
- terms_agg = {"terms": {"field": field, "size": size}}
+ for i, field_spec in enumerate(reversed(normalized_fields)):
+ field_name = field_spec["field"]
+ bucket_size = field_spec["bucket_size"]
+
+ # Always respect user-specified bucket sizes
+ # The user has explicitly set limits with "top N" syntax
+
+ terms_agg = {"terms": {"field": field_name, "size": bucket_size}}

  # Add ordering if this is the outermost aggregation and we have order field
- if field == group_by_fields[0] and order_field:
+ if field_name == normalized_fields[0]["field"] and order_field:
  # For nested aggregations, we need the full path
  order_path = order_field
- if len(group_by_fields) > 1:
+ if len(normalized_fields) > 1:
  # Multi-level grouping requires special handling
  # OpenSearch doesn't support ordering by sub-aggregations in nested terms
  # We'll need to handle this in post-processing
@@ -217,7 +241,7 @@ class OpenSearchStatsTranslator:
  terms_agg["aggs"] = current_aggs

  # Wrap for next level
- current_aggs = {f"group_by_{field}": terms_agg}
+ current_aggs = {f"group_by_{field_name}": terms_agg}
  return current_aggs
@@ -314,14 +338,14 @@ class OpenSearchStatsTranslator:
  return {"type": "multiple_aggregations", "results": results}

  def _transform_grouped_response(
- self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[str]
+ self, response: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
  ) -> Dict[str, Any]:
  """Transform grouped aggregation response.

  Args:
  response: OpenSearch response
  aggregations: Aggregation specifications
- group_by_fields: Grouping fields
+ group_by_fields: Grouping fields (can be strings or dicts with bucket_size)

  Returns:
  Transformed response
@@ -329,61 +353,78 @@ class OpenSearchStatsTranslator:
  # Navigate to the grouped results
  aggs_data = response.get("aggregations", {})

- # Get the outermost grouping
- first_group_key = f"group_by_{group_by_fields[0]}"
+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name
+ normalized_fields.append({"field": field, "bucket_size": None})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ normalized_fields.append(field)
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": None})
+
+ # Get the outermost grouping - use the field name from the normalized structure
+ first_field_name = normalized_fields[0]["field"]
+ first_group_key = f"group_by_{first_field_name}"
  grouped_data = aggs_data.get(first_group_key, {})

  # Extract buckets
  buckets = grouped_data.get("buckets", [])

- # Transform buckets
+ # Transform buckets - handle multi-level grouping recursively
  results = []
+ print(
+ f"\n=== OpenSearch Response Debug ===\nTotal buckets at top level: {len(buckets)}\nGroup by fields: {group_by_fields}\n"
+ )
  for bucket in buckets:
- result = self._transform_bucket(bucket, aggregations, group_by_fields, 0)
+ result = self._transform_bucket_recursive(bucket, aggregations, normalized_fields, 0)
  if result:
- results.append(result)
+ # Handle the case where recursive transformation returns a list (multi-level)
+ if isinstance(result, list):
+ results.extend(result)
+ else:
+ results.append(result)

- return {"type": "grouped_aggregation", "group_by": group_by_fields, "results": results}
+ return {"type": "stats_grouped", "group_by": group_by_fields, "results": results}

  def _transform_bucket(
- self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[str], level: int
+ self, bucket: Dict[str, Any], aggregations: List[Dict[str, Any]], group_by_fields: List[Any], level: int
  ) -> Optional[Dict[str, Any]]:
  """Transform a single bucket from grouped aggregation.

  Args:
  bucket: OpenSearch bucket
  aggregations: Aggregation specifications
- group_by_fields: Grouping fields
+ group_by_fields: Grouping fields (can be strings or dicts with bucket_size)
  level: Current nesting level

  Returns:
  Transformed bucket or None
  """
+ # Normalize group_by_fields to handle both old (string) and new (dict) formats
+ normalized_fields = []
+ for field in group_by_fields:
+ if isinstance(field, str):
+ # Old format: just field name
+ normalized_fields.append({"field": field, "bucket_size": None})
+ elif isinstance(field, dict):
+ # New format: {"field": "name", "bucket_size": N}
+ normalized_fields.append(field)
+ else:
+ # Shouldn't happen but handle gracefully
+ normalized_fields.append({"field": str(field), "bucket_size": None})
+
  result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}

  # Add current level key
- if level < len(group_by_fields):
- field = group_by_fields[level]
- result["key"][field] = bucket.get("key")
-
- # Check if there are more levels
- if level + 1 < len(group_by_fields):
- # Navigate to next level
- next_field = group_by_fields[level + 1]
- next_group_key = f"group_by_{next_field}"
-
- if next_group_key in bucket:
- # This is a nested grouping, we need to aggregate the sub-buckets
- # For now, we'll just take the first sub-bucket
- # TODO: Handle proper multi-level grouping
- sub_buckets = bucket[next_group_key].get("buckets", [])
- if sub_buckets:
- sub_result = self._transform_bucket(sub_buckets[0], aggregations, group_by_fields, level + 1)
- if sub_result:
- # Merge keys
- result["key"].update(sub_result["key"])
-
- # Extract aggregation values
+ if level < len(normalized_fields):
+ field_name = normalized_fields[level]["field"]
+ result["key"][field_name] = bucket.get("key")
+
+ # Extract aggregation values at the innermost level
  if len(aggregations) == 1:
  # Single aggregation
  agg = aggregations[0]
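For orientation, a sketch of the transformed shape for a single-level grouping with one count aggregation (keys, counts, and the query that would produce it are all illustrative):

    example = {
        "type": "stats_grouped",
        "group_by": ["src_ip"],
        "results": [
            {"key": {"src_ip": "10.0.0.1"}, "doc_count": 42, "count": 42},
            {"key": {"src_ip": "10.0.0.2"}, "doc_count": 17, "count": 17},
        ],
    }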
@@ -402,6 +443,96 @@ class OpenSearchStatsTranslator:

  return result

+ def _transform_bucket_recursive(
+ self,
+ bucket: Dict[str, Any],
+ aggregations: List[Dict[str, Any]],
+ normalized_fields: List[Dict[str, Any]],
+ level: int,
+ ) -> Optional[Dict[str, Any]]:
+ """Transform a bucket recursively for multi-level grouping.
+
+ Args:
+ bucket: OpenSearch bucket
+ aggregations: Aggregation specifications
+ normalized_fields: Normalized group by fields with field names and bucket sizes
+ level: Current nesting level
+
+ Returns:
+ Transformed bucket or None
+ """
+ result = {"key": {}, "doc_count": bucket.get("doc_count", 0)}
+
+ # Add current level key
+ if level < len(normalized_fields):
+ field_name = normalized_fields[level]["field"]
+ result["key"][field_name] = bucket.get("key")
+
+ # Check if we're at the deepest level (have aggregation values)
+ is_leaf_level = True
+ for agg in aggregations:
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+ if alias in bucket:
+ is_leaf_level = True
+ break
+
+ # Check if there are sub-buckets (nested grouping)
+ next_level_field = None
+ if level + 1 < len(normalized_fields):
+ next_level_field = f"group_by_{normalized_fields[level + 1]['field']}"
+ if next_level_field in bucket and "buckets" in bucket[next_level_field]:
+ is_leaf_level = False
+
+ if is_leaf_level:
+ # Extract aggregation values at the leaf level
+ if len(aggregations) == 1:
+ # Single aggregation
+ agg = aggregations[0]
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
+ value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+ agg_key = agg.get("alias") or agg["function"]
+ result[agg_key] = value
+ else:
+ # Multiple aggregations
+ result["aggregations"] = {}
+ for i, agg in enumerate(aggregations):
+ alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_{i}"
+ value = self._extract_aggregation_value(bucket.get(alias, {}), agg["function"])
+ agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+ result["aggregations"][agg_key] = value
+ else:
+ # Handle nested buckets
+ sub_buckets = bucket[next_level_field].get("buckets", [])
+ sub_results = []
+ print(
+ f" Level {level}: Processing {len(sub_buckets)} sub-buckets for field {normalized_fields[level + 1]['field']}"
+ )
+ for sub_bucket in sub_buckets:
+ sub_result_data = self._transform_bucket_recursive(
+ sub_bucket, aggregations, normalized_fields, level + 1
+ )
+ if sub_result_data:
+ # Handle the case where sub_result_data might be a list (deeper nesting)
+ if isinstance(sub_result_data, list):
+ for sub_item in sub_result_data:
+ # Merge the keys from current level with sub-level keys
+ merged_key = dict(result["key"])
+ merged_key.update(sub_item["key"])
+ sub_item["key"] = merged_key
+ sub_results.append(sub_item)
+ else:
+ # Merge the keys from different levels
+ merged_key = dict(result["key"])
+ merged_key.update(sub_result_data["key"])
+ sub_result_data["key"] = merged_key
+ sub_results.append(sub_result_data)
+
+ # For multi-level grouping, we return the sub-results as separate entries
+ # This flattens the nested structure into a list of results
+ return sub_results if sub_results else None
+
+ return result
+
  def _extract_aggregation_value( # noqa: C901
  self, agg_result: Dict[str, Any], function: str
  ) -> Union[int, float, Dict[str, Any], List[Any], None]:
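A sketch of how the recursive transform flattens a two-level grouping: every leaf bucket becomes its own result row with the keys from both levels merged (bucket values invented for the example):

    # Hypothetical OpenSearch buckets for grouping by src_ip, then dest_port:
    #   src_ip=10.0.0.1 (doc_count=50)
    #     dest_port=443 (doc_count=30), dest_port=80 (doc_count=20)
    #
    # _transform_bucket_recursive returns one row per leaf bucket, keys merged:
    #   {"key": {"src_ip": "10.0.0.1", "dest_port": 443}, "doc_count": 30, ...}
    #   {"key": {"src_ip": "10.0.0.1", "dest_port": 80},  "doc_count": 20, ...}
    # The outer bucket's doc_count (50) is not emitted as a separate row.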