tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
tql/mutator_analyzer.py CHANGED
@@ -30,6 +30,7 @@ MUTATOR_CLASSIFICATIONS: Dict[str, MutatorType] = {
30
30
  "uppercase": MutatorType.POST_PROCESSABLE, # Always post-process (transforms result)
31
31
  "trim": MutatorType.POST_PROCESSABLE, # Always post-process (transforms result)
32
32
  "split": MutatorType.POST_PROCESSABLE, # Always post-process (returns array)
33
+ "replace": MutatorType.POST_PROCESSABLE, # Always post-process (transforms result)
33
34
  "nslookup": MutatorType.POST_PROCESSABLE, # Always post-process (enrichment)
34
35
  "geoip_lookup": MutatorType.POST_PROCESSABLE, # Always post-process (enrichment)
35
36
  "geo": MutatorType.POST_PROCESSABLE, # Always post-process (enrichment)
@@ -41,6 +42,18 @@ MUTATOR_CLASSIFICATIONS: Dict[str, MutatorType] = {
41
42
  "urldecode": MutatorType.POST_PROCESSABLE, # Always post-process (modifies value)
42
43
  "is_private": MutatorType.POST_PROCESSABLE, # Always post-process (returns bool)
43
44
  "is_global": MutatorType.POST_PROCESSABLE, # Always post-process (returns bool)
45
+ "any": MutatorType.POST_PROCESSABLE, # Always post-process (array evaluation)
46
+ "all": MutatorType.POST_PROCESSABLE, # Always post-process (array evaluation)
47
+ "none": MutatorType.POST_PROCESSABLE, # Always post-process (array evaluation)
48
+ "avg": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
49
+ "average": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
50
+ "sum": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
51
+ "min": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
52
+ "max": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
53
+ "count": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
54
+ "unique": MutatorType.POST_PROCESSABLE, # Always post-process (array computation)
55
+ "first": MutatorType.POST_PROCESSABLE, # Always post-process (array access)
56
+ "last": MutatorType.POST_PROCESSABLE, # Always post-process (array access)
44
57
  }
45
58
 
46
59
 
@@ -52,8 +65,8 @@ class PostProcessingRequirement:
52
65
  mapped_field_name: str # Field name used in OpenSearch query
53
66
  mutators: List[Dict[str, Any]] # List of mutator specifications
54
67
  applies_to: Literal[
55
- "field", "value", "geo_expr", "nslookup_expr"
56
- ] # Whether this applies to field, value mutators, geo, or nslookup expressions
68
+ "field", "value", "geo_expr", "nslookup_expr", "logical_expression"
69
+ ] # Whether this applies to field, value mutators, geo, nslookup, or logical expressions
57
70
  metadata: Optional[Dict[str, Any]] = None # Additional metadata for special processing
58
71
 
59
72
 
@@ -73,6 +86,8 @@ class MutatorAnalysisResult:
73
86
  class MutatorAnalyzer:
74
87
  """Analyzes TQL queries to determine mutator processing requirements."""
75
88
 
89
+ context: Optional[str] = None # Temporary storage for execution context
90
+
76
91
  def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):
77
92
  """Initialize the analyzer.
78
93
 
@@ -134,9 +149,27 @@ class MutatorAnalyzer:
134
149
  # Track if enrichment saving is requested
135
150
  save_enrichment_requested = False
136
151
 
152
+ # Store context temporarily for use in _analyze_node
153
+ self.context = context
154
+
155
+ # Check if this is a stats query
156
+ is_stats_query = ast.get("type") in ["stats_expr", "query_with_stats"]
157
+
137
158
  # Analyze the AST recursively
138
159
  self._analyze_node(optimized_ast, post_processing_requirements, health_reasons, optimizations_applied)
139
160
 
161
+ # Clean up context
162
+ self.context = None
163
+
164
+ # Clean up nodes marked for removal
165
+ cleaned_ast = self._clean_ast(optimized_ast)
166
+
167
+ # If the entire AST was removed (e.g., just "field | any eq value"), return match_all
168
+ if cleaned_ast is None:
169
+ optimized_ast = {"type": "match_all"}
170
+ else:
171
+ optimized_ast = cleaned_ast
172
+
140
173
  # Check if any mutator requested enrichment saving
141
174
  for req in post_processing_requirements:
142
175
  for mutator in req.mutators:
@@ -158,7 +191,29 @@ class MutatorAnalyzer:
158
191
 
159
192
  # Determine overall health status based on context
160
193
  health_status: Literal["green", "yellow", "red"] = "green"
161
- if post_processing_requirements:
194
+
195
+ # Special handling for stats queries with post-processing in OpenSearch context
196
+ if is_stats_query and context == "opensearch" and post_processing_requirements:
197
+ # Stats queries that require post-processing have extremely poor performance
198
+ health_status = "red"
199
+ health_reasons.append(
200
+ {
201
+ "status": "red",
202
+ "query_part": "stats with post-processing",
203
+ "reason": "Stats query requires fetching all documents for post-processing mutators. "
204
+ "This will have extremely poor performance on large datasets. "
205
+ "Consider pre-processing data or using OpenSearch-compatible operations.",
206
+ }
207
+ )
208
+
209
+ # For in_memory context, we need to evaluate health considering ALL mutators
210
+ # (both those in post-processing and those remaining in the AST)
211
+ elif context == "in_memory":
212
+ # Pass the optimized AST to health evaluation for in_memory context
213
+ health_eval = self._evaluate_health_for_context(post_processing_requirements, context, optimized_ast)
214
+ health_status = health_eval["health_status"] # type: ignore[assignment]
215
+ health_reasons.extend(health_eval["health_reasons"])
216
+ elif post_processing_requirements:
162
217
  # Evaluate health based on context
163
218
  health_eval = self._evaluate_health_for_context(post_processing_requirements, context)
164
219
  health_status = health_eval["health_status"] # type: ignore[assignment]
@@ -179,6 +234,166 @@ class MutatorAnalyzer:
179
234
  save_enrichment_requested=save_enrichment_requested,
180
235
  )
181
236
 
237
+ def _clean_ast(self, node: Any) -> Any: # noqa: C901
238
+ """Remove nodes marked for removal from the AST.
239
+
240
+ Args:
241
+ node: AST node to clean
242
+
243
+ Returns:
244
+ Cleaned AST node or None if node should be removed
245
+ """
246
+ if not isinstance(node, dict):
247
+ return node
248
+
249
+ # Check if this node should be removed
250
+ if node.get("_remove_from_query"):
251
+ return None
252
+
253
+ # Clean child nodes
254
+ if node.get("type") == "logical_op":
255
+ operator = node.get("operator", "").lower()
256
+ left = self._clean_ast(node.get("left"))
257
+ right = self._clean_ast(node.get("right"))
258
+
259
+ # Special handling for OR with removed nodes
260
+ if operator == "or" and (left is None or right is None):
261
+ # If either side of OR has array operators (was removed),
262
+ # we need to return match_all and handle everything in post-processing
263
+ if left is None or right is None:
264
+ return {"type": "match_all"}
265
+
266
+ # Regular handling for AND
267
+ if left is None and right is None:
268
+ return None
269
+ elif left is None:
270
+ return right
271
+ elif right is None:
272
+ return left
273
+ else:
274
+ node["left"] = left
275
+ node["right"] = right
276
+ return node
277
+ elif node.get("type") == "unary_op":
278
+ operand = self._clean_ast(node.get("operand"))
279
+ if operand is None:
280
+ return None
281
+ node["operand"] = operand
282
+ return node
283
+
284
+ # For other node types, check if it should be converted to match_all
285
+ if node.get("_convert_to_match_all"):
286
+ return {"type": "match_all"}
287
+
288
+ return node
289
+
290
+ def _has_array_operators(self, node: Any) -> bool:
291
+ """Check if an AST node contains array operators (any, all, none).
292
+
293
+ Args:
294
+ node: AST node to check
295
+
296
+ Returns:
297
+ True if node contains array operators
298
+ """
299
+ if not isinstance(node, dict):
300
+ return False
301
+
302
+ node_type = node.get("type")
303
+
304
+ if node_type == "comparison":
305
+ # Check field mutators for array operators
306
+ field_mutators = node.get("field_mutators", [])
307
+ for mutator in field_mutators:
308
+ if mutator.get("name", "").lower() in ["any", "all", "none"]:
309
+ return True
310
+ return False
311
+ elif node_type == "logical_op":
312
+ # Check both sides
313
+ return self._has_array_operators(node.get("left", {})) or self._has_array_operators(node.get("right", {}))
314
+ elif node_type == "unary_op":
315
+ # Check operand
316
+ return self._has_array_operators(node.get("operand", {}))
317
+
318
+ return False
319
+
320
+ def _has_transform_mutators_with_filtering(self, node: Any) -> bool:
321
+ """Check if an AST node contains transform mutators with filtering operations.
322
+
323
+ Args:
324
+ node: AST node to check
325
+
326
+ Returns:
327
+ True if node contains transform mutators with filtering operations
328
+ """
329
+ if not isinstance(node, dict):
330
+ return False
331
+
332
+ node_type = node.get("type")
333
+
334
+ if node_type == "comparison":
335
+ # Check if this is a filtering operation
336
+ operator = node.get("operator", "")
337
+ is_filtering = operator in [
338
+ "eq",
339
+ "=",
340
+ "ne",
341
+ "!=",
342
+ "gt",
343
+ ">",
344
+ "gte",
345
+ ">=",
346
+ "lt",
347
+ "<",
348
+ "lte",
349
+ "<=",
350
+ "contains",
351
+ "not_contains",
352
+ "startswith",
353
+ "endswith",
354
+ "not_startswith",
355
+ "not_endswith",
356
+ "in",
357
+ "not_in",
358
+ ]
359
+
360
+ if not is_filtering:
361
+ return False
362
+
363
+ # Check field mutators for transform mutators or type-changing mutators that need post-processing
364
+ field_mutators = node.get("field_mutators", [])
365
+ for mutator in field_mutators:
366
+ mutator_name = mutator.get("name", "").lower()
367
+ # Transform mutators that modify the value OR type-changing mutators
368
+ if mutator_name in [
369
+ "lowercase",
370
+ "uppercase",
371
+ "trim",
372
+ "replace",
373
+ "refang",
374
+ "defang",
375
+ "b64encode",
376
+ "b64decode",
377
+ "urldecode",
378
+ # Type-changing mutators that need post-processing
379
+ "length",
380
+ "is_private",
381
+ "is_global",
382
+ "split",
383
+ ]:
384
+ return True
385
+ return False
386
+ elif node_type == "logical_op":
387
+ # Check both sides
388
+ return self._has_transform_mutators_with_filtering(
389
+ node.get("left", {})
390
+ ) or self._has_transform_mutators_with_filtering(node.get("right", {}))
391
+ elif node_type == "unary_op":
392
+ # Check operand
393
+ return self._has_transform_mutators_with_filtering(node.get("operand", {}))
394
+
395
+ return False
396
+
182
397
  def _analyze_node( # noqa: C901
183
398
  self,
184
399
  node: Dict[str, Any],
@@ -204,98 +419,191 @@ class MutatorAnalyzer:
204
419
  elif node_type == "collection_op":
205
420
  self._analyze_collection_node(node, post_processing_reqs, health_reasons, optimizations)
206
421
  elif node_type == "logical_op":
207
- # Recursively analyze both sides
422
+ operator = node.get("operator", "").lower()
423
+
424
+ # Check if this is an OR with array operators OR transform mutators with filtering
425
+ # BEFORE analyzing children (because analyzing children might modify the nodes)
426
+ needs_logical_expression = False
427
+ metadata_type = None
428
+
429
+ if operator == "or":
430
+ if self._has_array_operators(node):
431
+ needs_logical_expression = True
432
+ metadata_type = "or_with_array_operators"
433
+ elif self._has_transform_mutators_with_filtering(node):
434
+ needs_logical_expression = True
435
+ metadata_type = "or_with_transform_mutators"
436
+
437
+ if needs_logical_expression:
438
+ # We need to evaluate the entire OR in post-processing
439
+ # But we still want the base query to run (without array operators)
440
+
441
+ # Deep copy the original expression before it gets modified
442
+ original_expression = copy.deepcopy(node)
443
+
444
+ # Add a special requirement for the entire logical expression
445
+ post_processing_reqs.append(
446
+ PostProcessingRequirement(
447
+ field_name="_logical_expression",
448
+ mapped_field_name="_logical_expression",
449
+ mutators=[],
450
+ applies_to="logical_expression",
451
+ metadata={"expression": original_expression, "type": metadata_type},
452
+ )
453
+ )
454
+
455
+ # Always analyze both sides
208
456
  self._analyze_node(node.get("left", {}), post_processing_reqs, health_reasons, optimizations)
209
457
  self._analyze_node(node.get("right", {}), post_processing_reqs, health_reasons, optimizations)
210
458
  elif node_type == "unary_op":
459
+ operator = node.get("operator", "").lower()
460
+
461
+ # Check if this is a NOT with transform mutators that need filtering
462
+ if operator == "not" and self._has_transform_mutators_with_filtering(node.get("operand", {})):
463
+ # We need to evaluate the entire NOT in post-processing
464
+ # Deep copy the original expression before it gets modified
465
+ original_expression = copy.deepcopy(node)
466
+
467
+ # Add a special requirement for the entire logical expression
468
+ post_processing_reqs.append(
469
+ PostProcessingRequirement(
470
+ field_name="_logical_expression",
471
+ mapped_field_name="_logical_expression",
472
+ mutators=[],
473
+ applies_to="logical_expression",
474
+ metadata={"expression": original_expression, "type": "not_with_transform_mutators"},
475
+ )
476
+ )
477
+
211
478
  # Analyze the operand
212
479
  self._analyze_node(node.get("operand", {}), post_processing_reqs, health_reasons, optimizations)
213
480
  elif node_type == "geo_expr":
214
- # Geo expressions always require post-processing since they involve geoip_lookup
215
481
  field_name = node.get("field")
216
482
  conditions = node.get("conditions")
217
483
  geo_params = node.get("geo_params", {})
218
484
 
219
485
  if field_name:
220
- # Create a special post-processing requirement for geo expressions
221
- # that includes both the enrichment and the filtering
222
-
223
- # Build mutator params list from geo_params
224
- mutator_params = []
225
- for param_name, param_value in geo_params.items():
226
- mutator_params.append([param_name, param_value])
227
-
228
- geo_requirement = PostProcessingRequirement(
229
- field_name=field_name,
230
- mapped_field_name=field_name,
231
- mutators=(
232
- [{"name": "geoip_lookup", "params": mutator_params}]
233
- if mutator_params
234
- else [{"name": "geoip_lookup"}]
235
- ),
236
- applies_to="geo_expr", # Special type for geo expressions
237
- metadata={
238
- "conditions": conditions, # Include the conditions for filtering
239
- "node_type": "geo_expr",
240
- "geo_params": geo_params, # Include geo parameters
241
- },
242
- )
243
- post_processing_reqs.append(geo_requirement)
244
-
245
- # Mark the node for post-processing
246
- node["requires_post_processing"] = True
247
- node["post_process_type"] = "geo_expr"
248
-
249
- if conditions:
250
- optimizations.append(
251
- f"Geo expression on field '{field_name}' with conditions requires post-processing"
486
+ # For OpenSearch context, geo expressions require post-processing
487
+ if self.context == "opensearch":
488
+ # Create a post-processing requirement for the geo expression
489
+ # Build the geoip_lookup mutator
490
+ mutator_params = []
491
+ for param_name, param_value in geo_params.items():
492
+ mutator_params.append([param_name, param_value])
493
+
494
+ geo_mutator = {"name": "geoip_lookup"}
495
+ if mutator_params:
496
+ geo_mutator["params"] = mutator_params
497
+
498
+ # Create the requirement
499
+ req = PostProcessingRequirement(
500
+ field_name=field_name,
501
+ mapped_field_name=field_name, # Will be mapped during processing
502
+ mutators=[geo_mutator],
503
+ applies_to="geo_expr",
504
+ metadata={"conditions": conditions, "geo_params": geo_params},
252
505
  )
506
+ post_processing_reqs.append(req)
507
+
508
+ if conditions:
509
+ optimizations.append(
510
+ f"Geo expression on field '{field_name}' with conditions requires post-processing"
511
+ )
512
+ else:
513
+ optimizations.append(
514
+ f"Geo expression on field '{field_name}' for enrichment requires post-processing"
515
+ )
253
516
  else:
254
- optimizations.append(f"Geo expression on field '{field_name}' for enrichment only")
517
+ # For in-memory evaluation, handled during evaluation phase
518
+ if conditions:
519
+ optimizations.append(
520
+ f"Geo expression on field '{field_name}' with conditions handled during evaluation"
521
+ )
522
+ else:
523
+ optimizations.append(
524
+ f"Geo expression on field '{field_name}' for enrichment handled during evaluation"
525
+ )
255
526
 
256
527
  # Don't analyze conditions recursively - they're part of the geo expression
257
528
  elif node_type == "nslookup_expr":
258
- # NSLookup expressions always require post-processing since they involve DNS lookups
259
529
  field_name = node.get("field")
260
530
  conditions = node.get("conditions")
261
531
  nslookup_params = node.get("nslookup_params", {})
262
532
 
263
533
  if field_name:
264
- # Create a special post-processing requirement for nslookup expressions
265
- # that includes both the enrichment and the filtering
266
-
267
- # Build mutator params list from nslookup_params
268
- mutator_params = []
269
- for param_name, param_value in nslookup_params.items():
270
- mutator_params.append([param_name, param_value])
271
-
272
- nslookup_requirement = PostProcessingRequirement(
273
- field_name=field_name,
274
- mapped_field_name=field_name,
275
- mutators=(
276
- [{"name": "nslookup", "params": mutator_params}] if mutator_params else [{"name": "nslookup"}]
277
- ),
278
- applies_to="nslookup_expr", # Special type for nslookup expressions
279
- metadata={
280
- "conditions": conditions, # Include the conditions for filtering
281
- "node_type": "nslookup_expr",
282
- "nslookup_params": nslookup_params, # Include nslookup parameters
283
- },
284
- )
285
- post_processing_reqs.append(nslookup_requirement)
286
-
287
- # Mark the node for post-processing
288
- node["requires_post_processing"] = True
289
- node["post_process_type"] = "nslookup_expr"
290
-
291
- if conditions:
292
- optimizations.append(
293
- f"NSLookup expression on field '{field_name}' with conditions requires post-processing"
534
+ # For OpenSearch context, nslookup expressions require post-processing
535
+ if self.context == "opensearch":
536
+ # Create a post-processing requirement for the nslookup expression
537
+ # Build the nslookup mutator
538
+ mutator_params = []
539
+ for param_name, param_value in nslookup_params.items():
540
+ mutator_params.append([param_name, param_value])
541
+
542
+ nslookup_mutator = {"name": "nslookup"}
543
+ if mutator_params:
544
+ nslookup_mutator["params"] = mutator_params
545
+
546
+ # Create the requirement
547
+ req = PostProcessingRequirement(
548
+ field_name=field_name,
549
+ mapped_field_name=field_name, # Will be mapped during processing
550
+ mutators=[nslookup_mutator],
551
+ applies_to="nslookup_expr",
552
+ metadata={"conditions": conditions, "nslookup_params": nslookup_params},
294
553
  )
554
+ post_processing_reqs.append(req)
555
+
556
+ if conditions:
557
+ optimizations.append(
558
+ f"NSLookup expression on field '{field_name}' with conditions requires post-processing"
559
+ )
560
+ else:
561
+ optimizations.append(
562
+ f"NSLookup expression on field '{field_name}' for enrichment requires post-processing"
563
+ )
295
564
  else:
296
- optimizations.append(f"NSLookup expression on field '{field_name}' for enrichment only")
565
+ # For in-memory evaluation, handled during evaluation phase
566
+ if conditions:
567
+ optimizations.append(
568
+ f"NSLookup expression on field '{field_name}' with conditions handled during evaluation"
569
+ )
570
+ else:
571
+ optimizations.append(
572
+ f"NSLookup expression on field '{field_name}' for enrichment handled during evaluation"
573
+ )
297
574
 
298
575
  # Don't analyze conditions recursively - they're part of the nslookup expression
576
+ elif node_type == "query_with_stats":
577
+ # Handle query_with_stats node by analyzing the filter part
578
+ filter_node = node.get("filter")
579
+ if filter_node:
580
+ self._analyze_node(filter_node, post_processing_reqs, health_reasons, optimizations)
581
+
582
+ # Analyze the stats part if it contains mutators (though this is rare)
583
+ stats_node = node.get("stats")
584
+ if stats_node:
585
+ self._analyze_node(stats_node, post_processing_reqs, health_reasons, optimizations)
586
+
587
+ elif node_type == "stats_expr":
588
+ # Handle pure stats expressions - they typically don't have mutators
589
+ # but check aggregations and group_by fields for any field transformations
590
+ aggregations = node.get("aggregations", [])
591
+ for agg in aggregations:
592
+ # In case aggregations have field mutators in the future
593
+ if isinstance(agg, dict) and agg.get("field_mutators"):
594
+ # Analyze field mutators within aggregations if they exist
595
+ field_mutators = agg.get("field_mutators", [])
596
+ if field_mutators:
597
+ field_name = agg.get("field", "*")
598
+ # Add post-processing requirement for mutators in aggregations
599
+ post_processing_reqs.append(
600
+ PostProcessingRequirement(
601
+ field_name=field_name,
602
+ mapped_field_name=field_name,
603
+ mutators=field_mutators,
604
+ applies_to="field",
605
+ )
606
+ )
299
607
 
300
608
  def _analyze_comparison_node( # noqa: C901
301
609
  self,
@@ -315,37 +623,226 @@ class MutatorAnalyzer:
315
623
  field_name = node.get("field")
316
624
  operator = node.get("operator")
317
625
  field_mutators = node.get("field_mutators", [])
318
- value_mutators = node.get("value_mutators", [])
319
626
 
320
627
  if not field_name or not operator:
321
628
  return
322
629
 
323
630
  # Analyze field mutators
324
631
  if field_mutators:
325
- result = self._analyze_field_mutators(field_name, field_mutators, operator)
326
-
327
- # Update node with optimized mutators
328
- if result.optimized_mutators != field_mutators:
329
- if result.optimized_mutators:
330
- node["field_mutators"] = result.optimized_mutators
632
+ # Special case: if the last mutator is any/all/none and we have a comparison operator,
633
+ # treat it as an array comparison operator, not a regular mutator
634
+ last_mutator = field_mutators[-1] if field_mutators else None
635
+ if (
636
+ last_mutator
637
+ and last_mutator.get("name", "").lower() in ["any", "all", "none"]
638
+ and operator
639
+ in [
640
+ "eq",
641
+ "=",
642
+ "ne",
643
+ "!=",
644
+ "gt",
645
+ ">",
646
+ "lt",
647
+ "<",
648
+ "gte",
649
+ ">=",
650
+ "lte",
651
+ "<=",
652
+ "contains",
653
+ "not_contains",
654
+ "startswith",
655
+ "endswith",
656
+ "not_startswith",
657
+ "not_endswith",
658
+ ]
659
+ ):
660
+
661
+ # Extract the array operator
662
+ array_operator = last_mutator["name"].lower()
663
+
664
+ # Process any mutators before the array operator
665
+ remaining_mutators = field_mutators[:-1]
666
+ if remaining_mutators:
667
+ result = self._analyze_field_mutators(field_name, remaining_mutators, operator)
668
+
669
+ # Update node with optimized mutators
670
+ if result.optimized_mutators != remaining_mutators:
671
+ if result.optimized_mutators:
672
+ node["field_mutators"] = result.optimized_mutators
673
+ else:
674
+ # Remove field_mutators if all were optimized away
675
+ node.pop("field_mutators", None)
676
+ optimizations.extend(result.optimizations)
677
+
678
+ # Add post-processing requirements for the remaining mutators
679
+ if result.post_processing_mutators:
680
+ post_processing_reqs.append(
681
+ PostProcessingRequirement(
682
+ field_name=field_name,
683
+ mapped_field_name=result.selected_field or field_name,
684
+ mutators=result.post_processing_mutators,
685
+ applies_to="field",
686
+ )
687
+ )
331
688
  else:
332
- # Remove field_mutators if all were optimized away
689
+ # No other mutators, remove field_mutators from node
333
690
  node.pop("field_mutators", None)
334
691
 
335
- optimizations.extend(result.optimizations)
336
-
337
- # Add post-processing requirements
338
- if result.post_processing_mutators:
692
+ # Add post-processing requirement for the array comparison
339
693
  post_processing_reqs.append(
340
694
  PostProcessingRequirement(
341
695
  field_name=field_name,
342
- mapped_field_name=result.selected_field or field_name,
343
- mutators=result.post_processing_mutators,
696
+ mapped_field_name=field_name,
697
+ mutators=[], # No mutators, just operator-based filtering
344
698
  applies_to="field",
345
- metadata={"operator": operator, "value": node.get("value")},
699
+ metadata={
700
+ "operator": array_operator,
701
+ "comparison_operator": operator,
702
+ "value": node.get("value"),
703
+ },
346
704
  )
347
705
  )
348
706
 
707
+ # Array operators should not affect the OpenSearch query at all
708
+ # They are purely post-processing filters
709
+ # Store the original node info in the post-processing requirement
710
+ if post_processing_reqs and post_processing_reqs[-1].metadata is not None:
711
+ post_processing_reqs[-1].metadata["original_node"] = {
712
+ "type": "comparison",
713
+ "field": field_name,
714
+ "operator": operator,
715
+ "value": node.get("value"),
716
+ "field_mutators": [{"name": array_operator}],
717
+ }
718
+
719
+ # Array operators should be completely removed from the OpenSearch query
720
+ # Mark this node for removal
721
+ node["_remove_from_query"] = True
722
+
723
+ # Don't mark the node for post-processing - let the query be generated normally
724
+ # The array operator is applied as a post-processing filter on top of the results
725
+
726
+ optimizations.append(
727
+ f"Array operator '{array_operator}' with '{operator}' will be applied in post-processing"
728
+ )
729
+
730
+ # Skip the regular mutator processing that follows
731
+ return
732
+
733
+ else:
734
+ # Regular mutator processing
735
+ result = self._analyze_field_mutators(field_name, field_mutators, operator)
736
+
737
+ # For in-memory context, keep mutators in AST for evaluation
738
+ if self.context == "in_memory":
739
+ # Don't remove mutators from AST for in-memory queries
740
+ # They need to be applied during evaluation
741
+ pass
742
+ else:
743
+ # Update node with optimized mutators for OpenSearch context
744
+ if result.optimized_mutators != field_mutators:
745
+ if result.optimized_mutators:
746
+ node["field_mutators"] = result.optimized_mutators
747
+ else:
748
+ # Remove field_mutators if all were optimized away
749
+ node.pop("field_mutators", None)
750
+
751
+ optimizations.extend(result.optimizations)
752
+
753
+ # Add post-processing requirements
754
+ if result.post_processing_mutators:
755
+ # For in-memory context, we need special handling
756
+ if self.context == "in_memory":
757
+ # Check if any mutators are transform mutators that need to be applied to results
758
+ transform_mutators = []
759
+ for mutator in result.post_processing_mutators:
760
+ mutator_name = mutator.get("name", "").lower()
761
+ # Transform mutators that modify the result
762
+ if mutator_name in [
763
+ "split",
764
+ "lowercase",
765
+ "uppercase",
766
+ "trim",
767
+ "replace",
768
+ "refang",
769
+ "defang",
770
+ ]:
771
+ transform_mutators.append(mutator)
772
+
773
+ # If we have transform mutators, add them as post-processing for result transformation
774
+ if transform_mutators:
775
+ post_processing_reqs.append(
776
+ PostProcessingRequirement(
777
+ field_name=field_name,
778
+ mapped_field_name=field_name,
779
+ mutators=transform_mutators,
780
+ applies_to="field",
781
+ metadata={"transform_only": True}, # Mark as transform-only
782
+ )
783
+ )
784
+ else:
785
+ # Always include operator and value in metadata for post-processing filtering
786
+ metadata = {"operator": operator, "value": node.get("value")}
787
+ # Include original comparison info if it exists
788
+ if node.get("_original_comparison"):
789
+ metadata["_original_comparison"] = node["_original_comparison"]
790
+
791
+ post_processing_reqs.append(
792
+ PostProcessingRequirement(
793
+ field_name=field_name,
794
+ mapped_field_name=result.selected_field or field_name,
795
+ mutators=result.post_processing_mutators,
796
+ applies_to="field",
797
+ metadata=metadata,
798
+ )
799
+ )
800
+
801
+ # Check if we have transform mutators with filtering operators
802
+ # These need special handling in query conversion
803
+ TRANSFORM_MUTATORS = {
804
+ "lowercase",
805
+ "uppercase",
806
+ "trim",
807
+ "replace",
808
+ "refang",
809
+ "defang",
810
+ "b64encode",
811
+ "b64decode",
812
+ "urldecode",
813
+ }
814
+
815
+ has_transform_with_filter = False
816
+ for mutator in result.post_processing_mutators:
817
+ if mutator.get("name", "").lower() in TRANSFORM_MUTATORS:
818
+ has_transform_with_filter = True
819
+ break
820
+
821
+ if has_transform_with_filter and operator in [
822
+ "eq",
823
+ "=",
824
+ "ne",
825
+ "!=",
826
+ "contains",
827
+ "not_contains",
828
+ "startswith",
829
+ "endswith",
830
+ "not_startswith",
831
+ "not_endswith",
832
+ ">",
833
+ ">=",
834
+ "<",
835
+ "<=",
836
+ "gt",
837
+ "gte",
838
+ "lt",
839
+ "lte",
840
+ "between",
841
+ "not_between",
842
+ ]:
843
+ # Mark the node so query converter knows to use exists query
844
+ node["has_transform_mutators_with_filter"] = True
845
+
349
846
  # Check if any mutators change the field type
350
847
  has_type_changing_mutator = any(
351
848
  mutator.get("name", "").lower()
@@ -374,6 +871,12 @@ class MutatorAnalyzer:
374
871
  # Also mark if we have type-changing mutators
375
872
  if has_type_changing_mutator:
376
873
  node["has_type_changing_mutators"] = True
874
+
875
+ # For in-memory queries with type-changing mutators, DON'T convert to exists check
876
+ # The mutators should be applied during evaluation
877
+ if self.context == "in_memory":
878
+ # Keep the original comparison intact for in-memory evaluation
879
+ pass
377
880
  elif has_type_changing_mutator:
378
881
  # For type-changing mutators with numeric operators, mark for special handling
379
882
  node["has_type_changing_mutators"] = True
@@ -385,53 +888,11 @@ class MutatorAnalyzer:
385
888
  if result.selected_field and result.selected_field != field_name:
386
889
  node["field"] = result.selected_field
387
890
 
388
- # Check if operator requires post-processing (e.g., ALL operator on arrays)
389
- if operator in ["all", "not_all"]:
390
- # These operators need post-processing for array fields
391
- post_processing_reqs.append(
392
- PostProcessingRequirement(
393
- field_name=field_name,
394
- mapped_field_name=field_name,
395
- mutators=[], # No mutators, just operator-based filtering
396
- applies_to="field",
397
- metadata={"operator": operator, "value": node.get("value")},
398
- )
399
- )
400
- # Mark for special handling in OpenSearch
401
- node["post_process_value"] = True
402
-
403
- # Analyze value mutators (these are typically post-processing)
404
- if value_mutators:
405
- post_processing_value_mutators = []
406
-
407
- for mutator in value_mutators:
408
- mutator_name = mutator.get("name", "").lower()
409
- classification = MUTATOR_CLASSIFICATIONS.get(mutator_name, MutatorType.POST_PROCESSABLE)
410
-
411
- if classification in [MutatorType.POST_PROCESSABLE, MutatorType.CONDITIONAL]:
412
- post_processing_value_mutators.append(mutator)
413
-
414
- if post_processing_value_mutators:
415
- post_processing_reqs.append(
416
- PostProcessingRequirement(
417
- field_name=field_name,
418
- mapped_field_name=field_name, # Value mutators don't affect field mapping
419
- mutators=post_processing_value_mutators,
420
- applies_to="value",
421
- )
422
- )
891
+ # Note: ALL and NOT_ALL operators are handled during evaluation, not post-processing
423
892
 
424
- # For value mutators on equality operations, we need to make the query less restrictive
425
- # This allows post-processing to correctly filter results
426
- if operator in ["eq", "=", "ne", "!="]:
427
- # Mark the node to indicate it needs special handling in OpenSearch
428
- node["post_process_value"] = True
429
- # Keep the original value for reference
430
- node["original_value"] = node.get("value")
431
-
432
- # Remove value mutators from AST since they'll be post-processed
433
- node.pop("value_mutators", None)
434
- optimizations.append(f"Moved {len(post_processing_value_mutators)} value mutator(s) to post-processing")
893
+ # Value mutators are handled during evaluation, not post-processing
894
+ # The evaluator's _evaluate_comparison method applies value mutators before comparison
895
+ # So we don't need to treat them as post-processing requirements
435
896
 
436
897
  def _analyze_collection_node(
437
898
  self,
@@ -450,7 +911,6 @@ class MutatorAnalyzer:
450
911
  """
451
912
  field_name = node.get("field")
452
913
  field_mutators = node.get("field_mutators", [])
453
- value_mutators = node.get("value_mutators", [])
454
914
 
455
915
  if not field_name:
456
916
  return
@@ -487,21 +947,14 @@ class MutatorAnalyzer:
487
947
  f"post-processing for collection operation"
488
948
  )
489
949
 
490
- if value_mutators:
491
- # Value mutators always go to post-processing for collection operations
492
- post_processing_reqs.append(
493
- PostProcessingRequirement(
494
- field_name=field_name, mapped_field_name=field_name, mutators=value_mutators, applies_to="value"
495
- )
496
- )
497
-
498
- node.pop("value_mutators", None)
499
- optimizations.append(
500
- f"Moved {len(value_mutators)} value mutator(s) to post-processing for collection operation"
501
- )
950
+ # Value mutators are handled during evaluation for collection operations too
951
+ # The evaluator applies them before comparison in _evaluate_collection_comparison
502
952
 
503
953
  def _evaluate_health_for_context( # noqa: C901
504
- self, post_processing_requirements: List[PostProcessingRequirement], context: str
954
+ self,
955
+ post_processing_requirements: List[PostProcessingRequirement],
956
+ context: str,
957
+ ast: Optional[Dict[str, Any]] = None,
505
958
  ) -> Dict[str, Any]:
506
959
  """Evaluate health status based on context and mutator performance characteristics.
507
960
 
@@ -518,9 +971,10 @@ class MutatorAnalyzer:
518
971
  slow_mutators = []
519
972
  all_mutators = []
520
973
 
521
- # Collect all mutators and their performance classes
522
- for req in post_processing_requirements:
523
- for mutator_spec in req.mutators:
974
+ # Helper function to process mutators
975
+ def process_mutators(mutator_list):
976
+ nonlocal fast_count, moderate_count, slow_count
977
+ for mutator_spec in mutator_list:
524
978
  mutator_name = mutator_spec.get("name", "")
525
979
  all_mutators.append(mutator_name)
526
980
 
@@ -540,6 +994,28 @@ class MutatorAnalyzer:
540
994
  # If we can't create the mutator, assume moderate performance
541
995
  moderate_count += 1
542
996
 
997
+ # Collect all mutators from post-processing requirements
998
+ for req in post_processing_requirements:
999
+ process_mutators(req.mutators)
1000
+
1001
+ # For in_memory context with AST, also collect mutators from the AST
1002
+ if context == "in_memory" and ast:
1003
+
1004
+ def collect_ast_mutators(node):
1005
+ if isinstance(node, dict):
1006
+ # Check for field mutators
1007
+ if "field_mutators" in node:
1008
+ process_mutators(node["field_mutators"])
1009
+ # Check for value mutators
1010
+ if "value_mutators" in node:
1011
+ process_mutators(node["value_mutators"])
1012
+ # Recurse into child nodes
1013
+ for key, value in node.items():
1014
+ if key in ["left", "right", "operand"]:
1015
+ collect_ast_mutators(value)
1016
+
1017
+ collect_ast_mutators(ast)
1018
+
543
1019
  # Determine health status based on context
544
1020
  health_status = "green"
545
1021
  health_reasons = []
@@ -692,85 +1168,19 @@ class FieldMutatorAnalyzer:
692
1168
  self, field_mapping: FieldMapping, operator: str, mutator: Dict[str, Any]
693
1169
  ) -> "MutatorOptimizationResult":
694
1170
  """Try to optimize a lowercase mutator using field mappings."""
695
- # Check if we have a text field with lowercase analyzer
696
- lowercase_field = field_mapping.text_fields.get("lowercase")
697
- standard_field = field_mapping.text_fields.get("standard")
698
-
699
- if lowercase_field:
700
- # Perfect match - we have a lowercase analyzer
701
- return MutatorOptimizationResult(
702
- can_optimize=True,
703
- selected_field=lowercase_field,
704
- post_process_mutator=None,
705
- optimization_description=f"Using field '{lowercase_field}' with lowercase analyzer instead of mutator",
706
- )
707
- elif standard_field:
708
- # Standard analyzer might handle lowercase - use it but also post-process
709
- return MutatorOptimizationResult(
710
- can_optimize=False,
711
- selected_field=standard_field,
712
- post_process_mutator=mutator,
713
- optimization_description=f"Using text field '{standard_field}' but post-processing lowercase mutator",
714
- )
715
- elif field_mapping.keyword_field:
716
- # Only keyword field available - check operator compatibility
717
- if operator in [
718
- "eq",
719
- "=",
720
- "ne",
721
- "!=",
722
- "in",
723
- "not_in",
724
- "contains",
725
- "not_contains",
726
- "startswith",
727
- "endswith",
728
- "not_startswith",
729
- "not_endswith",
730
- ]:
731
- # These operators will work with post-processing
732
- return MutatorOptimizationResult(
733
- can_optimize=False,
734
- selected_field=field_mapping.keyword_field,
735
- post_process_mutator=mutator,
736
- optimization_description=f"Using keyword field '{field_mapping.keyword_field}' "
737
- f"with post-processing",
738
- health_issue={
739
- "status": "yellow",
740
- "query_part": f"{field_mapping.base_field_name} | lowercase",
741
- "reason": "Keyword field used with lowercase mutator requires post-processing",
742
- },
743
- )
744
- else:
745
- # Range operators don't make sense with lowercase
746
- return MutatorOptimizationResult(
747
- can_optimize=False,
748
- selected_field=None,
749
- post_process_mutator=None,
750
- optimization_description="",
751
- health_issue={
752
- "status": "red",
753
- "query_part": f"{field_mapping.base_field_name} | lowercase {operator}",
754
- "reason": (
755
- f"Field '{field_mapping.base_field_name}' does not support case-insensitive "
756
- f"searching with operator '{operator}'. Available: {field_mapping.keyword_field} (keyword)"
757
- ),
758
- },
759
- )
760
- else:
761
- # No suitable fields
762
- return MutatorOptimizationResult(
763
- can_optimize=False,
764
- selected_field=None,
765
- post_process_mutator=mutator,
766
- optimization_description="No suitable field mappings for lowercase optimization",
767
- )
1171
+ # Per requirement: lowercase should always be post-processing
1172
+ # Even if we have a lowercase analyzer field, we don't optimize
1173
+ return MutatorOptimizationResult(
1174
+ can_optimize=False,
1175
+ selected_field=None,
1176
+ post_process_mutator=mutator,
1177
+ optimization_description="Lowercase mutator always requires post-processing",
1178
+ )
768
1179
 
769
1180
  def _optimize_uppercase_mutator(
770
1181
  self, field_mapping: FieldMapping, operator: str, mutator: Dict[str, Any]
771
1182
  ) -> "MutatorOptimizationResult":
772
1183
  """Try to optimize an uppercase mutator using field mappings."""
773
- # Check if we actually have an uppercase analyzer
774
1184
  # We need to check the text_fields dict directly to ensure we have the specific analyzer
775
1185
  if "uppercase" in field_mapping.text_fields:
776
1186
  uppercase_field = field_mapping.text_fields["uppercase"]
@@ -793,26 +1203,14 @@ class FieldMutatorAnalyzer:
793
1203
  self, field_mapping: FieldMapping, operator: str, mutator: Dict[str, Any]
794
1204
  ) -> "MutatorOptimizationResult":
795
1205
  """Try to optimize a trim mutator using field mappings."""
796
- # Check if any text field might handle trimming
797
- # Most analyzers include trimming by default, but we can't be sure
798
- text_field = field_mapping.text_fields.get("standard")
799
-
800
- if text_field:
801
- # Assume standard analyzer handles trimming (common case)
802
- return MutatorOptimizationResult(
803
- can_optimize=True,
804
- selected_field=text_field,
805
- post_process_mutator=None,
806
- optimization_description=f"Assuming field '{text_field}' analyzer handles trimming",
807
- )
808
- else:
809
- # No text field - requires post-processing
810
- return MutatorOptimizationResult(
811
- can_optimize=False,
812
- selected_field=None,
813
- post_process_mutator=mutator,
814
- optimization_description="No text field available for trim optimization",
815
- )
1206
+ # Trim should always require post-processing to ensure consistent behavior
1207
+ # We can't reliably know if an analyzer trims whitespace
1208
+ return MutatorOptimizationResult(
1209
+ can_optimize=False,
1210
+ selected_field=None,
1211
+ post_process_mutator=mutator,
1212
+ optimization_description="Trim mutator always requires post-processing",
1213
+ )
816
1214
 
817
1215
 
818
1216
  @dataclass