tellaro-query-language 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. tellaro_query_language-0.1.0.dist-info/LICENSE +21 -0
  2. tellaro_query_language-0.1.0.dist-info/METADATA +401 -0
  3. tellaro_query_language-0.1.0.dist-info/RECORD +56 -0
  4. tellaro_query_language-0.1.0.dist-info/WHEEL +4 -0
  5. tellaro_query_language-0.1.0.dist-info/entry_points.txt +7 -0
  6. tql/__init__.py +47 -0
  7. tql/analyzer.py +385 -0
  8. tql/cache/__init__.py +7 -0
  9. tql/cache/base.py +25 -0
  10. tql/cache/memory.py +63 -0
  11. tql/cache/redis.py +68 -0
  12. tql/core.py +929 -0
  13. tql/core_components/README.md +92 -0
  14. tql/core_components/__init__.py +20 -0
  15. tql/core_components/file_operations.py +113 -0
  16. tql/core_components/opensearch_operations.py +869 -0
  17. tql/core_components/stats_operations.py +200 -0
  18. tql/core_components/validation_operations.py +599 -0
  19. tql/evaluator.py +379 -0
  20. tql/evaluator_components/README.md +131 -0
  21. tql/evaluator_components/__init__.py +17 -0
  22. tql/evaluator_components/field_access.py +176 -0
  23. tql/evaluator_components/special_expressions.py +296 -0
  24. tql/evaluator_components/value_comparison.py +315 -0
  25. tql/exceptions.py +160 -0
  26. tql/geoip_normalizer.py +233 -0
  27. tql/mutator_analyzer.py +830 -0
  28. tql/mutators/__init__.py +222 -0
  29. tql/mutators/base.py +78 -0
  30. tql/mutators/dns.py +316 -0
  31. tql/mutators/encoding.py +218 -0
  32. tql/mutators/geo.py +363 -0
  33. tql/mutators/list.py +212 -0
  34. tql/mutators/network.py +163 -0
  35. tql/mutators/security.py +225 -0
  36. tql/mutators/string.py +165 -0
  37. tql/opensearch.py +78 -0
  38. tql/opensearch_components/README.md +130 -0
  39. tql/opensearch_components/__init__.py +17 -0
  40. tql/opensearch_components/field_mapping.py +399 -0
  41. tql/opensearch_components/lucene_converter.py +305 -0
  42. tql/opensearch_components/query_converter.py +775 -0
  43. tql/opensearch_mappings.py +309 -0
  44. tql/opensearch_stats.py +451 -0
  45. tql/parser.py +1363 -0
  46. tql/parser_components/README.md +72 -0
  47. tql/parser_components/__init__.py +20 -0
  48. tql/parser_components/ast_builder.py +162 -0
  49. tql/parser_components/error_analyzer.py +101 -0
  50. tql/parser_components/field_extractor.py +112 -0
  51. tql/parser_components/grammar.py +473 -0
  52. tql/post_processor.py +737 -0
  53. tql/scripts.py +124 -0
  54. tql/stats_evaluator.py +444 -0
  55. tql/stats_transformer.py +184 -0
  56. tql/validators.py +110 -0
tql/post_processor.py ADDED
@@ -0,0 +1,737 @@
1
+ """Post-processor for applying mutators to OpenSearch query results.
2
+
3
+ This module handles the application of mutators that cannot be pre-processed
4
+ by OpenSearch field mappings/analyzers and must be applied to results after
5
+ they are returned from OpenSearch.
6
+ """
7
+
8
+ import copy
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from .mutator_analyzer import PostProcessingRequirement
12
+ from .mutators import apply_mutators
13
+
14
+
15
class QueryPostProcessor:
    """Applies post-processing mutators to OpenSearch query results.

    Mutators that cannot be pushed down into OpenSearch field mappings or
    analyzers (enrichment lookups such as geo/DNS, type-changing transforms,
    operator re-evaluation) are applied here, to the result records returned
    by OpenSearch.
    """

    # Mutators whose output has a different type/shape than the input field
    # (e.g. ``length`` maps a string to an int).  Their results are written to
    # a temporary ``__<field>_mutated__`` key so filtering can re-evaluate them
    # without clobbering the original field value.  Hoisted to a class-level
    # frozenset so it is not rebuilt on every _apply_field_mutators call.
    _TYPE_CHANGING_FILTER_MUTATORS = frozenset(
        {
            "is_private",
            "is_global",
            "length",
            "any",
            "all",
            "avg",
            "average",
            "sum",
            "max",
            "min",
            "split",
        }
    )

    # Operators that indicate the mutated value is used for filtering only,
    # never for replacing the stored field value.
    _FILTERING_OPERATORS = frozenset(
        {
            "contains",
            "not_contains",
            "startswith",
            "not_startswith",
            "endswith",
            "not_endswith",
            "eq",
            "=",
            "ne",
            "!=",
            ">",
            ">=",
            "<",
            "<=",
            "gt",
            "gte",
            "lt",
            "lte",
        }
    )

    def __init__(self):
        """Initialize the post-processor (stateless; nothing to set up)."""

    def filter_results(
        self, results: List[Dict[str, Any]], requirements: List[PostProcessingRequirement]
    ) -> List[Dict[str, Any]]:
        """Filter results based on post-processing requirements.

        Handles operator-based filtering for all operators that require
        post-processing evaluation.  A record is kept only if it satisfies
        every requirement that carries an ``operator`` in its metadata.

        Args:
            results: List of result records from OpenSearch.
            requirements: List of post-processing requirements.

        Returns:
            Filtered list of results.
        """
        if not requirements:
            return results

        filtered_results = []

        for result in results:
            should_include = True

            for requirement in requirements:
                if requirement.metadata and "operator" in requirement.metadata:
                    operator = requirement.metadata["operator"]
                    value = requirement.metadata.get("value")

                    # Prefer a mutated value stashed in the temp field by
                    # _apply_field_mutators; fall back to the original field.
                    temp_field_name = f"__{requirement.field_name}_mutated__"
                    if temp_field_name in result:
                        field_value = result[temp_field_name]
                    else:
                        field_value = self._get_field_value(result, requirement.field_name)

                    if not self._check_operator(field_value, operator, value):
                        should_include = False
                        break

            if should_include:
                filtered_results.append(result)

        return filtered_results

    def _check_operator(self, field_value: Any, operator: str, value: Any) -> bool:  # noqa: C901
        """Check whether ``field_value`` matches ``operator`` / ``value``.

        String operators compare case-insensitively; numeric comparison
        operators fall back to lexical string comparison when either side
        cannot be coerced to float.

        Args:
            field_value: The field value to check.
            operator: The operator to apply.
            value: The value to compare against.

        Returns:
            True if the operator check passes, False otherwise (including
            for unknown operators).
        """
        # Unwrap single-element lists for comparison.
        if isinstance(value, list) and len(value) == 1:
            value = value[0]

        # Missing fields fail every operator.
        # NOTE(review): this makes negative operators (ne, not_contains)
        # also fail on missing fields — presumably intentional; confirm.
        if field_value is None:
            return False

        # String operators (case-insensitive).
        if operator == "contains":
            return str(value).lower() in str(field_value).lower()
        elif operator == "not_contains":
            return str(value).lower() not in str(field_value).lower()
        elif operator == "startswith":
            return str(field_value).lower().startswith(str(value).lower())
        elif operator == "not_startswith":
            return not str(field_value).lower().startswith(str(value).lower())
        elif operator == "endswith":
            return str(field_value).lower().endswith(str(value).lower())
        elif operator == "not_endswith":
            return not str(field_value).lower().endswith(str(value).lower())

        # Equality operators.
        elif operator in ["eq", "="]:
            # Compare a boolean field against the strings "true"/"false".
            if isinstance(field_value, bool) and isinstance(value, str):
                if value.lower() == "true":
                    return field_value is True
                elif value.lower() == "false":
                    return field_value is False
            return field_value == value
        elif operator in ["ne", "!="]:
            if isinstance(field_value, bool) and isinstance(value, str):
                if value.lower() == "true":
                    return field_value is not True
                elif value.lower() == "false":
                    return field_value is not False
            return field_value != value

        # Comparison operators: numeric first, lexical fallback.
        elif operator in ["gt", ">"]:
            try:
                return float(field_value) > float(value)
            except (ValueError, TypeError):
                return str(field_value) > str(value)
        elif operator in ["gte", ">="]:
            try:
                return float(field_value) >= float(value)
            except (ValueError, TypeError):
                return str(field_value) >= str(value)
        elif operator in ["lt", "<"]:
            try:
                return float(field_value) < float(value)
            except (ValueError, TypeError):
                return str(field_value) < str(value)
        elif operator in ["lte", "<="]:
            try:
                return float(field_value) <= float(value)
            except (ValueError, TypeError):
                return str(field_value) <= str(value)

        # Array operators.
        elif operator == "all":
            if isinstance(field_value, (list, tuple)):
                # ALL elements must equal the value; empty arrays fail.
                return len(field_value) > 0 and all(elem == value for elem in field_value)
            else:
                # Single values degrade to simple equality.
                return field_value == value
        elif operator == "not_all":
            if isinstance(field_value, (list, tuple)):
                # Fails only if every element equals the value; empty arrays pass.
                return len(field_value) == 0 or not all(elem == value for elem in field_value)
            else:
                return field_value != value

        # Unknown operators never match.
        return False

    def process_results(
        self,
        results: List[Dict[str, Any]],
        requirements: List[PostProcessingRequirement],
        track_enrichments: bool = False,
    ) -> List[Dict[str, Any]]:
        """Apply post-processing mutators to query results.

        Args:
            results: List of result records from OpenSearch.
            requirements: List of post-processing requirements.
            track_enrichments: If True, each output record carries an
                ``_enriched`` boolean flag.

        Returns:
            List of processed results with mutators applied.  Input records
            are never modified; each output record is a deep copy.
        """
        if not requirements:
            return results

        processed_results = []

        for result in results:
            # Deep copy so the caller's records stay untouched.
            processed_result = copy.deepcopy(result)
            enriched = False

            for requirement in requirements:
                try:
                    was_enriched = self._apply_requirement(processed_result, requirement)
                    if was_enriched:
                        enriched = True
                except Exception:
                    # Best-effort: a failing requirement must not abort the
                    # whole record.  A production system would log this.
                    continue

            if track_enrichments:
                processed_result["_enriched"] = enriched

            processed_results.append(processed_result)

        return processed_results

    def _apply_requirement(self, result: Dict[str, Any], requirement: PostProcessingRequirement) -> bool:
        """Dispatch a single requirement to the appropriate handler.

        Args:
            result: The result record to modify in place.
            requirement: The post-processing requirement to apply.

        Returns:
            True if the record was enriched, False otherwise (including for
            unknown ``applies_to`` values).
        """
        if requirement.applies_to == "field":
            return self._apply_field_mutators(result, requirement)
        elif requirement.applies_to == "value":
            return self._apply_value_mutators(result, requirement)
        elif requirement.applies_to == "geo_expr":
            return self._apply_geo_expression(result, requirement)
        elif requirement.applies_to == "nslookup_expr":
            return self._apply_nslookup_expression(result, requirement)
        return False

    def _apply_field_mutators(self, result: Dict[str, Any], requirement: PostProcessingRequirement) -> bool:
        """Apply field mutators to a result record.

        Type-changing mutators and filtering operations store their output in
        a ``__<field>_mutated__`` temp key; all other mutators overwrite the
        original field.

        Args:
            result: The result record to modify in place.
            requirement: The field mutator requirement.

        Returns:
            True if an enrichment mutator was applied, False otherwise.
        """
        # Operator-only requirements (e.g. ALL with no mutators) are handled
        # in filter_results, not here.
        if requirement.metadata and "operator" in requirement.metadata and not requirement.mutators:
            return False

        # Read the value via the mapped (storage-side) field name.
        field_value = self._get_field_value(result, requirement.mapped_field_name)

        if field_value is None:
            return False

        try:
            mutated_value = apply_mutators(field_value, requirement.mutators, requirement.field_name, result)

            mutator_names = {m.get("name", "").lower() for m in requirement.mutators}

            # Filtering-only operations must not replace the field value.
            operator = requirement.metadata.get("operator", "") if requirement.metadata else ""
            is_filtering_operation = operator in self._FILTERING_OPERATORS

            if mutator_names.intersection(self._TYPE_CHANGING_FILTER_MUTATORS) or is_filtering_operation:
                # Stash in a temp field so re-evaluation works without
                # losing the original value.
                temp_field_name = f"__{requirement.field_name}_mutated__"
                self._set_field_value(result, temp_field_name, mutated_value)
            else:
                # Write back under the original (query-side) field name.
                self._set_field_value(result, requirement.field_name, mutated_value)

            # Report enrichment if any applied mutator is an enrichment one.
            from .mutators import ENRICHMENT_MUTATORS

            for mutator in requirement.mutators:
                if mutator.get("name", "").lower() in ENRICHMENT_MUTATORS:
                    return True

        except Exception:
            # If mutation fails, leave the original value in place.
            pass

        return False

    def _apply_value_mutators(self, result: Dict[str, Any], requirement: PostProcessingRequirement) -> bool:
        """Apply value mutators to a result record.

        Value mutators are applied to query values during evaluation, not to
        results; this hook exists for completeness.

        Args:
            result: The result record (unused).
            requirement: The value mutator requirement (unused).

        Returns:
            False (value mutators never enrich records).
        """
        return False

    @staticmethod
    def _extract_custom_field(mutators: List[Dict[str, Any]]) -> Optional[str]:
        """Return the ``field`` param of the first mutator that declares one.

        Mutator params may arrive either as a dict or as a list of
        ``[key, value]`` pairs; both forms are handled.

        Args:
            mutators: Mutator specs to inspect.

        Returns:
            The custom output field path, or None if no mutator declares one.
        """
        for mutator in mutators:
            if "params" not in mutator:
                continue
            params = mutator["params"]
            if isinstance(params, list):
                params = {pair[0]: pair[1] for pair in params if len(pair) == 2}
            if "field" in params:
                return params["field"]
        return None

    def _apply_geo_expression(  # noqa: C901
        self, result: Dict[str, Any], requirement: PostProcessingRequirement
    ) -> bool:
        """Apply geo expression enrichment to a result.

        Filtering on the enriched data happens later in filter_results;
        this method only attaches ``geo``/``as`` data to the record.

        Args:
            result: The result record to modify in place.
            requirement: The geo expression requirement (mutators contain
                the geoip_lookup mutator).

        Returns:
            True if geo enrichment occurred, False otherwise.
        """
        ip_value = self._get_field_value(result, requirement.field_name)

        if not ip_value:
            # No IP value, nothing to enrich.
            return False

        try:
            geo_data = apply_mutators(ip_value, requirement.mutators, requirement.field_name, result)

            # geo_data is a dict holding "geo" and/or "as" sub-dicts; nest it
            # next to the IP field (or at a caller-specified location).
            if isinstance(geo_data, dict) and geo_data:
                custom_field = self._extract_custom_field(requirement.mutators)

                if custom_field:
                    # Store geo data directly at the custom location.
                    parent = self._get_or_create_parent(result, custom_field)
                    if "geo" in geo_data:
                        parent.update(geo_data["geo"])
                    # AS data lives as a sibling of the custom location.
                    if "as" in geo_data:
                        if "." in custom_field:
                            as_parent_path = custom_field.rsplit(".", 1)[0]
                            as_parent = self._get_or_create_parent(result, as_parent_path)
                            as_parent["as"] = geo_data["as"]
                        else:
                            result["as"] = geo_data["as"]
                else:
                    # Default: store under <parent>.geo and <parent>.as.
                    if "." in requirement.field_name:
                        # Nested field like destination.ip or source.ip.
                        parent_path = requirement.field_name.rsplit(".", 1)[0]
                        parent = self._get_or_create_parent(result, parent_path)

                        if "geo" in geo_data:
                            parent["geo"] = geo_data["geo"]
                        if "as" in geo_data:
                            parent["as"] = geo_data["as"]
                    else:
                        # Top-level field like 'ip' — use a generic
                        # 'enrichment' parent object.
                        if "enrichment" not in result:
                            result["enrichment"] = {}

                        if "geo" in geo_data:
                            result["enrichment"]["geo"] = geo_data["geo"]
                        if "as" in geo_data:
                            result["enrichment"]["as"] = geo_data["as"]

                return True  # Geo enrichment occurred.

        except Exception:
            # If the geo lookup fails, continue without enrichment.
            pass

        return False

    def _apply_nslookup_expression(  # noqa: C901
        self, result: Dict[str, Any], requirement: PostProcessingRequirement
    ) -> bool:
        """Apply nslookup expression enrichment to a result.

        Filtering on the enriched data happens later in filter_results;
        this method only attaches DNS data to the record.

        Args:
            result: The result record to modify in place.
            requirement: The nslookup expression requirement (mutators
                contain the nslookup mutator).

        Returns:
            True if DNS enrichment occurred, False otherwise.
        """
        field_value = self._get_field_value(result, requirement.field_name)

        if not field_value:
            # No value, nothing to enrich.
            return False

        try:
            dns_data = apply_mutators(field_value, requirement.mutators, requirement.field_name, result)

            # dns_data maps the queried value to ECS-compliant DNS data.
            if isinstance(dns_data, dict) and dns_data:
                ecs_dns_data = None
                if field_value in dns_data:
                    ecs_dns_data = dns_data[field_value]
                elif len(dns_data) == 1:
                    # Single entry: use it regardless of key.
                    ecs_dns_data = next(iter(dns_data.values()))

                if ecs_dns_data:
                    custom_field = self._extract_custom_field(requirement.mutators)

                    if custom_field:
                        # Store DNS data directly at the custom location.
                        parent = self._get_or_create_parent(result, custom_field)
                        parent.update(ecs_dns_data)
                    else:
                        # Default: store at <parent>.domain.
                        if "." in requirement.field_name:
                            parent_path = requirement.field_name.rsplit(".", 1)[0]
                            parent = self._get_or_create_parent(result, parent_path)
                            parent["domain"] = ecs_dns_data
                        else:
                            # Top-level field — use a generic 'enrichment'
                            # parent object.
                            if "enrichment" not in result:
                                result["enrichment"] = {}
                            result["enrichment"]["domain"] = ecs_dns_data

                    return True  # DNS enrichment occurred.

        except Exception:
            # If the DNS lookup fails, continue without enrichment.
            pass

        return False

    def _get_or_create_parent(self, record: Dict[str, Any], parent_path: str) -> Dict[str, Any]:
        """Get or create a nested parent object in the record.

        Args:
            record: The record to modify.
            parent_path: Dot-separated path to the parent.

        Returns:
            The parent dictionary.

        Raises:
            ValueError: If a path component exists but is not a dict.
        """
        parts = parent_path.split(".")
        current = record

        for part in parts:
            if part not in current:
                current[part] = {}
            elif not isinstance(current[part], dict):
                # Cannot descend into a non-dict value.
                raise ValueError(f"Cannot add geo data: {parent_path} is not an object")
            current = current[part]

        return current

    def _get_field_value(self, record: Dict[str, Any], field_path: str) -> Any:
        """Get a field value from a record, supporting nested fields.

        A literal (flat) key takes precedence over dot-path traversal, so
        records with literal dotted keys still resolve.

        Args:
            record: The record dictionary.
            field_path: Dot-separated field path or literal field name.

        Returns:
            The field value, or None if not found.
        """
        if isinstance(record, dict) and field_path in record:
            return record[field_path]

        parts = field_path.split(".")
        current = record

        for part in parts:
            if isinstance(current, dict) and part in current:
                current = current[part]
            else:
                return None

        return current

    def _set_field_value(self, record: Dict[str, Any], field_path: str, value: Any) -> None:
        """Set a field value in a record, supporting nested fields.

        Intermediate objects are created as needed along the dot path.

        Args:
            record: The record dictionary to modify.
            field_path: Dot-separated field path or literal field name.
            value: The value to set.
        """
        parts = field_path.split(".")
        current = record

        # Walk (and create) everything up to the final component.
        for part in parts[:-1]:
            if part not in current:
                current[part] = {}
            current = current[part]

        current[parts[-1]] = value
586
+
587
+
588
class PostProcessingContext:
    """Context information for post-processing operations."""

    def __init__(self, query: str, field_mappings: Dict[str, Any], requirements: List[PostProcessingRequirement]):
        """Initialize post-processing context.

        Args:
            query: Original TQL query string.
            field_mappings: Field mappings used in the query.
            requirements: Post-processing requirements.
        """
        self.query = query
        self.field_mappings = field_mappings
        self.requirements = requirements
        self.stats = PostProcessingStats()

    def get_performance_impact(self) -> Dict[str, Any]:
        """Estimate the performance impact of the pending post-processing.

        Returns:
            Dictionary describing requirement counts, affected fields,
            mutator usage, and an overall overhead estimate.
        """
        # Count mutator usage while remembering first-seen order of names.
        usage: Dict[str, int] = {}
        ordered_names: List[str] = []
        for requirement in self.requirements:
            for mutator in requirement.mutators:
                name = mutator.get("name", "unknown")
                if name not in usage:
                    ordered_names.append(name)
                usage[name] = usage.get(name, 0) + 1

        # Expensive lookups dominate; otherwise overhead scales with the
        # number of requirements.
        if {"geoip_lookup", "nslookup", "geo"} & usage.keys():
            overhead = "high"
        elif len(self.requirements) > 5:
            overhead = "medium"
        else:
            overhead = "low"

        impact: Dict[str, Any] = {
            "has_post_processing": bool(self.requirements),
            "requirement_count": len(self.requirements),
            "impacted_fields": list({req.field_name for req in self.requirements}),
            "mutator_types": ordered_names,
            "estimated_overhead": overhead,
            "mutator_usage": usage,
        }
        return impact
637
+
638
+
639
class PostProcessingStats:
    """Statistics tracking for post-processing operations."""

    def __init__(self):
        """Start all counters at zero with no recorded errors."""
        self.processed_records = 0
        self.failed_records = 0
        self.mutator_applications = 0
        self.errors = []

    def record_success(self):
        """Count one successfully processed record."""
        self.processed_records += 1

    def record_failure(self, error: str):
        """Count one failed record and remember its error message."""
        self.failed_records += 1
        self.errors.append(error)

    def record_mutator_application(self):
        """Count one mutator application."""
        self.mutator_applications += 1

    def get_summary(self) -> Dict[str, Any]:
        """Summarize the processing statistics so far.

        Returns:
            Dictionary with totals, success rate, and the most recent errors.
        """
        total = self.processed_records + self.failed_records
        rate = (self.processed_records / total * 100) if total > 0 else 0

        summary: Dict[str, Any] = {
            "total_records": total,
            "processed_successfully": self.processed_records,
            "failed_records": self.failed_records,
            "success_rate_percent": round(rate, 2),
            "mutator_applications": self.mutator_applications,
            "error_count": len(self.errors),
            # Keep the payload small: only the last five errors.
            "recent_errors": self.errors[-5:] if self.errors else [],
        }
        return summary
680
+
681
+
682
class PostProcessingError(Exception):
    """Exception raised during post-processing operations."""

    def __init__(self, message: str, field_name: Optional[str] = None, mutator_name: Optional[str] = None):
        """Create the error, recording where it happened.

        Args:
            message: Human-readable error description.
            field_name: Field being processed when the error occurred, if any.
            mutator_name: Mutator responsible for the error, if any.
        """
        super().__init__(message)
        # Kept as attributes so callers can report the failing field/mutator.
        self.field_name = field_name
        self.mutator_name = mutator_name
696
+
697
+
698
class BatchPostProcessor(QueryPostProcessor):
    """Post-processor optimized for large batches of results."""

    def __init__(self, batch_size: int = 1000):
        """Initialize batch post-processor.

        Args:
            batch_size: Number of records to process in each batch.
        """
        super().__init__()
        self.batch_size = batch_size

    def process_results(
        self,
        results: List[Dict[str, Any]],
        requirements: List[PostProcessingRequirement],
        track_enrichments: bool = False,
    ) -> List[Dict[str, Any]]:
        """Process results batch by batch for better memory behavior.

        Args:
            results: List of result records from OpenSearch.
            requirements: List of post-processing requirements.
            track_enrichments: Whether to track enrichment operations.

        Returns:
            List of processed results with mutators applied.
        """
        if not requirements:
            return results

        processed: List[Dict[str, Any]] = []

        # Walk the result list one slice at a time, delegating each slice
        # to the base-class implementation.
        start = 0
        total = len(results)
        while start < total:
            chunk = results[start : start + self.batch_size]
            processed.extend(super().process_results(chunk, requirements, track_enrichments))
            start += self.batch_size

        return processed