tellaro-query-language 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tellaro_query_language-0.2.5.dist-info/LICENSE +72 -0
- tellaro_query_language-0.2.5.dist-info/METADATA +806 -0
- {tellaro_query_language-0.2.2.dist-info → tellaro_query_language-0.2.5.dist-info}/RECORD +25 -22
- {tellaro_query_language-0.2.2.dist-info → tellaro_query_language-0.2.5.dist-info}/entry_points.txt +1 -0
- tql/__init__.py +1 -1
- tql/cache/base.py +79 -7
- tql/cache/memory.py +126 -18
- tql/cli.py +484 -0
- tql/core.py +261 -5
- tql/core_components/opensearch_operations.py +23 -4
- tql/evaluator.py +3 -1
- tql/evaluator_components/special_expressions.py +62 -10
- tql/evaluator_components/value_comparison.py +70 -12
- tql/exceptions.py +6 -4
- tql/field_type_inference.py +285 -0
- tql/mutator_analyzer.py +2 -2
- tql/mutators/geo.py +57 -20
- tql/opensearch_components/query_converter.py +1 -1
- tql/opensearch_stats.py +10 -7
- tql/parser.py +56 -21
- tql/post_processor.py +44 -11
- tql/scripts.py +19 -2
- tql/stats_evaluator.py +361 -7
- tql/streaming_file_processor.py +335 -0
- tellaro_query_language-0.2.2.dist-info/LICENSE +0 -21
- tellaro_query_language-0.2.2.dist-info/METADATA +0 -433
- {tellaro_query_language-0.2.2.dist-info → tellaro_query_language-0.2.5.dist-info}/WHEEL +0 -0
tql/core.py
CHANGED
```diff
@@ -4,12 +4,19 @@ This module provides the main TQL class that serves as the primary interface
 for parsing and executing TQL queries against different backends.
 """
 
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Generator, List, Optional, Union
 
 from .analyzer import EnhancedFieldMapping
 from .core_components import FileOperations, OpenSearchOperations, StatsOperations, ValidationOperations
 from .evaluator import TQLEvaluator
-from .exceptions import
+from .exceptions import (
+    TQLExecutionError,
+    TQLOperatorError,
+    TQLParseError,
+    TQLSyntaxError,
+    TQLTypeError,
+    TQLValidationError,
+)
 from .mutator_analyzer import MutatorAnalysisResult
 from .parser import TQLParser
 from .stats_evaluator import TQLStatsEvaluator
```
```diff
@@ -27,7 +34,7 @@ class TQL:
         >>> results = tql.query(data, query)
     """
 
-    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):
+    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):  # noqa: C901
         """Initialize TQL instance.
 
         Args:
```
```diff
@@ -100,8 +107,26 @@ class TQL:
                     # This is an OpenSearch-style mapping, map field to itself
                     self._simple_mappings[k] = k
                 else:
-                    #
-
+                    # Intelligent field mapping extraction for complex mappings
+                    # Priority: 1) Key matching field name, 2) Key without dots (primary field), 3) First key
+
+                    if k in v:
+                        # Field name exists as key in mapping (e.g., {"username": {"username": "keyword", ...}})
+                        self._simple_mappings[k] = k
+                    else:
+                        # Find primary field (keys without dots, not starting with underscore)
+                        primary_fields = [
+                            field_key
+                            for field_key in v.keys()
+                            if "." not in field_key and not field_key.startswith("_")
+                        ]
+
+                        if primary_fields:
+                            # Use first primary field
+                            self._simple_mappings[k] = primary_fields[0]
+                        else:
+                            # Fallback to first key (maintain backward compatibility)
+                            self._simple_mappings[k] = next(iter(v.keys()))
             else:
                 # Default to mapping field to itself
                 self._simple_mappings[k] = k
```
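The resolution priority added above can be exercised as a standalone sketch. The helper below mirrors the `__init__` logic; the mapping dictionaries are hypothetical:

```python
def resolve_simple_mapping(field: str, mapping: dict) -> str:
    """Pick the concrete field name a complex mapping should resolve to."""
    if field in mapping:
        return field  # 1) the queried field name itself appears as a key
    primary = [k for k in mapping if "." not in k and not k.startswith("_")]
    if primary:
        return primary[0]  # 2) first "primary" key: no dots, no leading underscore
    return next(iter(mapping))  # 3) fall back to the first key

# Hypothetical complex mappings:
assert resolve_simple_mapping("username", {"username": "keyword", "username.text": "text"}) == "username"
assert resolve_simple_mapping("user", {"user.name": "text", "uid": "keyword"}) == "uid"
```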
```diff
@@ -1032,6 +1057,237 @@ class TQL:
         """
         return self.stats_ops.analyze_stats_query(query)
 
+    def query_file_streaming(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Generator[Dict[str, Any], None, None]:
+        """Execute a TQL query against a file in streaming mode.
+
+        This method processes files line-by-line with minimal memory usage,
+        yielding matching records as they are found.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (filter query only, not stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Yields:
+            Matching records as dictionaries
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+
+        # Validate query type (only filter queries supported for streaming)
+        query_type = ast.get("type")
+        if query_type in ["stats_expr", "query_with_stats"]:
+            raise TQLExecutionError("Stats queries not supported in streaming mode. Use query_file_stats() instead.")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process file and evaluate query on each record
+        for record in processor.process_file(file_path, input_format):
+            if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                yield record
+
+    def query_file_stats(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL stats query against a file in streaming mode.
+
+        This method processes files line-by-line with accumulator-based stats
+        calculations for memory efficiency.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (can include filters and stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing aggregation results
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Handle different query types
+        if query_type == "stats_expr":
+            # Pure stats query - process all records
+            record_iter = processor.process_file(file_path, input_format)
+            return self.stats_evaluator.evaluate_stats_streaming(record_iter, ast, self.field_mappings)
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            # Create filtered iterator
+            def filtered_records():
+                for record in processor.process_file(file_path, input_format):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            return self.stats_evaluator.evaluate_stats_streaming(filtered_records(), stats_ast, self.field_mappings)
+
+        else:
+            # Regular filter query - shouldn't use stats method
+            raise TQLExecutionError("Use query_file_streaming() for filter queries without stats aggregations.")
+
+    def query_folder(
+        self,
+        folder_path: str,
+        query: str,
+        pattern: str = "*",
+        input_format: str = "auto",
+        recursive: bool = False,
+        parallel: int = 4,
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL query against multiple files in a folder.
+
+        This method processes all matching files and aggregates results,
+        supporting both filter queries (with records) and stats queries.
+
+        Args:
+            folder_path: Path to folder
+            query: TQL query string
+            pattern: Glob pattern for file matching
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            recursive: Process subdirectories recursively
+            parallel: Number of parallel workers
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing results and/or stats aggregated across all files
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If folder processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process folder based on query type
+        if query_type == "stats_expr":
+            # Pure stats query - aggregate across all files
+
+            def all_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(all_records(), ast, self.field_mappings)
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            def filtered_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(
+                filtered_records(), stats_ast, self.field_mappings
+            )
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        else:
+            # Regular filter query - collect matching records from all files
+            matched_records = []
+            files_processed = 0
+            files_with_matches = 0
+
+            for file_path, record in processor.process_folder(folder_path, pattern, input_format, recursive, parallel):
+                files_processed += 1
+                if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                    matched_records.append({"_source_file": file_path, **record})
+                    files_with_matches += 1
+
+            return {
+                "results": matched_records,
+                "total": len(matched_records),
+                "files_processed": files_processed,
+                "files_with_matches": files_with_matches,
+            }
+
     def _apply_mutators_to_record(self, ast: Dict[str, Any], record: Dict[str, Any]) -> Dict[str, Any]:
         """Apply any mutators in the AST to enrich the record.
 
```
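A minimal usage sketch of the new streaming entry points, assuming the package exposes `TQL` from the top-level `tql` module; file paths and query strings here are hypothetical, and actual TQL query syntax is documented in the package METADATA:

```python
from tql import TQL  # assumed top-level export

tql = TQL()

# Stream matching records from a large JSONL file without loading it into memory
for record in tql.query_file_streaming("events.jsonl", "status = 'error'"):
    print(record)

# Accumulator-based aggregation over the same file (stats syntax is illustrative)
stats = tql.query_file_stats("events.jsonl", "status = 'error' | stats count() by host")

# Filter across every .jsonl file under logs/, recursing into subdirectories
result = tql.query_folder("logs/", "status = 'error'", pattern="*.jsonl", recursive=True)
print(result["total"], result["files_processed"], result["files_with_matches"])
```

Note that `query_file_streaming` yields records lazily, while `query_folder` materializes all matches in memory and annotates each with a `_source_file` key.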
tql/core_components/opensearch_operations.py
CHANGED

```diff
@@ -239,7 +239,7 @@ class OpenSearchOperations:
         analysis_result = self.analyze_opensearch_query(query)
         has_mutators = isinstance(analysis_result, MutatorAnalysisResult)
         needs_post_processing_for_stats = (
-            has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False
+            has_mutators and bool(analysis_result.post_processing_requirements) if has_mutators else False  # type: ignore[union-attr]
         )
 
         # Handle stats queries differently
@@ -258,7 +258,7 @@ class OpenSearchOperations:
             if filter_ast:
                 # Use the optimized AST if we have mutators
                 if has_mutators and needs_post_processing_for_stats:
-                    filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]
+                    filter_query = backend.convert(analysis_result.optimized_ast.get("filter", filter_ast))["query"]  # type: ignore[union-attr]
                 else:
                     filter_query = backend.convert(filter_ast)["query"]
             else:
@@ -529,6 +529,8 @@ class OpenSearchOperations:
             stats_evaluator = TQLStatsEvaluator()
 
             # Execute the stats aggregation in memory
+            if stats_ast_for_post_processing is None:
+                raise ValueError("Stats AST is None but phase2 processing was requested")
             stats_results = stats_evaluator.evaluate_stats(filtered_docs, stats_ast_for_post_processing, {})
 
             # Format response for stats-only (no documents)
@@ -547,7 +549,7 @@ class OpenSearchOperations:
                 "performance_impact": {
                     "overhead_ms": 0,  # Would need timing to calculate
                     "documents_processed": len(all_documents),
-                    "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,
+                    "mutators_applied": len(analysis_result.post_processing_requirements) if has_mutators else 0,  # type: ignore[union-attr]
                 },
                 "opensearch_query": complete_opensearch_query,
             }
@@ -580,6 +582,8 @@ class OpenSearchOperations:
             translator = OpenSearchStatsTranslator()
 
             # Transform the response using the translator
+            if stats_ast is None:
+                raise ValueError("Stats AST is None but grouping was detected")
             transformed_response = translator.transform_response(response, stats_ast)
 
             # The transformed response already has the correct structure
@@ -925,6 +929,21 @@ class OpenSearchOperations:
         # Get opensearch total before filtering
        opensearch_total = total_hits
 
+        # Track optimization features used in this query
+        optimizations_applied = []
+        if scan_all:
+            optimizations_applied.append("scroll_api")
+        if needs_phase2 and pagination_stats and pagination_stats.get("pages_checked", 0) > 1:
+            optimizations_applied.append("auto_pagination")
+        if request_cache:
+            optimizations_applied.append("request_cache")
+        if preference:
+            optimizations_applied.append("preference_routing")
+        if routing:
+            optimizations_applied.append("custom_routing")
+        if terminate_after:
+            optimizations_applied.append("early_termination")
+
         result = {
             "results": results,
             "total": len(results),
@@ -934,7 +953,7 @@ class OpenSearchOperations:
             "health_status": health_status,
             "health_reasons": health_reasons,
             "performance_impact": performance_impact,
-            "optimizations_applied":
+            "optimizations_applied": optimizations_applied,
             "opensearch_query": (
                 complete_opensearch_query if "complete_opensearch_query" in locals() else {}
             ),  # Include the full query body
```
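The two added `None` guards follow the standard pattern for narrowing an `Optional` value before use, which is the alternative to the `# type: ignore[union-attr]` escape hatches added in the same file. A generic sketch (the function name and return shape are hypothetical):

```python
from typing import Any, Dict, Optional

def transform_stats(stats_ast: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    # After this explicit check, type checkers narrow stats_ast to Dict[str, Any]
    if stats_ast is None:
        raise ValueError("Stats AST is None but grouping was detected")
    return {"aggs": stats_ast}  # access is now provably safe, no ignore comment needed
```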
tql/evaluator.py
CHANGED
```diff
@@ -67,7 +67,7 @@ class TQLEvaluator:
         field_mappings = field_mappings or {}
         return self._evaluate_node(ast, record, field_mappings)
 
-    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:
+    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:  # noqa: C901
         """Evaluate a single AST node against a record.
 
         Args:
@@ -350,6 +350,8 @@ class TQLEvaluator:
             return left_missing or right_missing
         elif node_type == "unary_op":
             # Don't recurse through NOT operators - they handle missing fields themselves
+            # The NOT operator has special logic at lines 213-254 that handles missing fields correctly
+            # Recursing here would cause double-handling and incorrect results
             return False
         elif node_type == "collection_op":
             field_name = node["field"]
```
tql/evaluator_components/special_expressions.py
CHANGED

```diff
@@ -15,15 +15,27 @@ class SpecialExpressionEvaluator:
     # Sentinel value to distinguish missing fields from None values
     _MISSING_FIELD = object()
 
-    def __init__(self, get_field_value_func, evaluate_node_func):
+    def __init__(self, get_field_value_func, evaluate_node_func, set_field_value_func=None):
         """Initialize the special expression evaluator.
 
         Args:
             get_field_value_func: Function to get field values from records
             evaluate_node_func: Function to evaluate AST nodes
+            set_field_value_func: Optional function to set field values in records
         """
         self._get_field_value = get_field_value_func
         self._evaluate_node = evaluate_node_func
+        self._set_field_value = set_field_value_func or self._default_set_field_value
+
+    def _default_set_field_value(self, record: Dict[str, Any], field_path: str, value: Any) -> None:
+        """Default implementation of set_field_value for nested field assignment."""
+        parts = field_path.split(".")
+        current = record
+        for part in parts[:-1]:
+            if part not in current:
+                current[part] = {}
+            current = current[part]
+        current[parts[-1]] = value
 
     def evaluate_geo_expr(  # noqa: C901
         self, node: Dict[str, Any], record: Dict[str, Any], field_mappings: Dict[str, str]
@@ -106,19 +118,26 @@ class SpecialExpressionEvaluator:
             elif "as" in record:
                 geo_data["as"] = record["as"]
         else:
-            # Default locations
+            # Default locations (ECS style)
             if "." in actual_field:
-                # For nested fields like destination.ip, check destination.geo
+                # For nested fields like destination.ip, check destination.geo and destination.as
                 parent_path = actual_field.rsplit(".", 1)[0]
                 parent = self._get_field_value(record, parent_path)
-                if isinstance(parent, dict) and "geo" in parent:
-                    # Found geo data under parent
-                    geo_data =
+                if isinstance(parent, dict) and ("geo" in parent or "as" in parent):
+                    # Found geo/as data under parent
+                    geo_data = {}
+                    if "geo" in parent:
+                        geo_data["geo"] = parent["geo"]
+                    if "as" in parent:
+                        geo_data["as"] = parent["as"]
             else:
-                # For top-level fields, check
-                if "
-
-
+                # For top-level fields like ip, check top-level geo and as fields (ECS style)
+                if "geo" in record or "as" in record:
+                    geo_data = {}
+                    if "geo" in record:
+                        geo_data["geo"] = record["geo"]
+                    if "as" in record:
+                        geo_data["as"] = record["as"]
 
         # Check if we should use existing geo data or force a new lookup
         force_lookup = geo_params.get("force", False)
@@ -148,6 +167,39 @@ class SpecialExpressionEvaluator:
             # Apply geo lookup
             geo_data = apply_mutators(field_value, [geo_mutator], actual_field, record)
 
+            # Always include enrichment in query results (save=True adds to record for output)
+            # Note: This does not modify source files - enrichment only appears in query results
+            save_enrichment = geo_params.get("save", True)
+            if save_enrichment and geo_data and isinstance(geo_data, dict):
+                # Determine where to save the enrichment
+                if custom_field:
+                    # Save to custom field location
+                    self._set_field_value(record, custom_field, geo_data.get("geo"))
+                    if "as" in geo_data:
+                        # Save AS data as sibling to geo field
+                        if "." in custom_field:
+                            as_parent_path = custom_field.rsplit(".", 1)[0]
+                            parent = self._get_field_value(record, as_parent_path)
+                            if isinstance(parent, dict):
+                                parent["as"] = geo_data["as"]
+                        else:
+                            record["as"] = geo_data["as"]
+                elif "." in actual_field:
+                    # For nested fields like destination.ip, save to destination.geo and destination.as (ECS style)
+                    parent_path = actual_field.rsplit(".", 1)[0]
+                    parent = self._get_field_value(record, parent_path)
+                    if isinstance(parent, dict):
+                        if "geo" in geo_data:
+                            parent["geo"] = geo_data["geo"]
+                        if "as" in geo_data:
+                            parent["as"] = geo_data["as"]
+                else:
+                    # For top-level fields like ip, save to top-level geo and as fields (ECS style)
+                    if "geo" in geo_data:
+                        record["geo"] = geo_data["geo"]
+                    if "as" in geo_data:
+                        record["as"] = geo_data["as"]
+
         # Now evaluate the conditions against the geo data
         if conditions:
             # Handle None geo_data (e.g., private IPs or lookup failures)
```
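The `_default_set_field_value` helper is self-contained, so its dotted-path behavior is easy to demonstrate in isolation (the record below is hypothetical):

```python
from typing import Any, Dict

def set_field_value(record: Dict[str, Any], field_path: str, value: Any) -> None:
    """Walk a dotted path, creating intermediate dicts, then assign the leaf."""
    parts = field_path.split(".")
    current = record
    for part in parts[:-1]:
        if part not in current:
            current[part] = {}
        current = current[part]
    current[parts[-1]] = value

record = {"destination": {"ip": "8.8.8.8"}}
set_field_value(record, "destination.geo.country_name", "United States")
# record == {"destination": {"ip": "8.8.8.8", "geo": {"country_name": "United States"}}}
```

This is the mechanism the geo-enrichment path above uses when a `custom_field` target is given.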
tql/evaluator_components/value_comparison.py
CHANGED

```diff
@@ -6,6 +6,7 @@ operator implementations, and special cases like CIDR matching.
 
 import ipaddress
 import re
+from functools import lru_cache
 from typing import Any
 
 
@@ -15,6 +16,23 @@ class ValueComparator:
     # Sentinel value to distinguish missing fields from None values
     _MISSING_FIELD = object()
 
+    @staticmethod
+    @lru_cache(maxsize=256)
+    def _compile_regex(pattern: str) -> re.Pattern:
+        """Compile and cache regex patterns for performance.
+
+        Args:
+            pattern: Regex pattern string
+
+        Returns:
+            Compiled regex pattern
+
+        Note:
+            Uses LRU cache with max 256 patterns. This significantly improves
+            performance when the same regex patterns are used repeatedly in queries.
+        """
+        return re.compile(pattern)
+
     def compare_values(self, field_value: Any, operator: str, expected_value: Any) -> bool:  # noqa: C901
         """Compare a field value against an expected value using the given operator.
 
```
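A standalone sketch of the caching idea: `functools.lru_cache` memoizes compilation, so repeated patterns return the same compiled object instead of recompiling:

```python
import re
from functools import lru_cache

@lru_cache(maxsize=256)
def compile_regex(pattern: str) -> re.Pattern:
    # Compilation runs once per distinct pattern; later calls hit the cache
    return re.compile(pattern)

p1 = compile_regex(r"\d+\.\d+\.\d+\.\d+")
p2 = compile_regex(r"\d+\.\d+\.\d+\.\d+")
assert p1 is p2                     # same cached object
print(compile_regex.cache_info())   # CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)
```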
```diff
@@ -49,9 +67,17 @@ class ValueComparator:
             return False
 
         # Handle None field values (field exists but is None)
+        # IMPORTANT: None is a valid value, distinct from missing fields.
+        # For 'exists' operator: This code path should NOT be reached because 'exists'
+        # checks field presence in the record, not the value. The evaluator handles
+        # 'exists' before calling compare_values. If we reach here with None, it means
+        # the field exists but has None value, which should NOT match 'exists'.
         if field_value is None:
             if operator in ["exists"]:
-
+                # Field key exists in record but value is None
+                # Semantics: 'exists' means "field has a non-null value"
+                # This matches database behavior where NULL != EXISTS
+                return False  # None value does not satisfy 'exists'
             elif operator in ["is"]:
                 # Check for null comparison - expected_value can be None or "null"
                 return expected_value is None or (isinstance(expected_value, str) and expected_value.lower() == "null")
@@ -68,6 +94,20 @@ class ValueComparator:
         if isinstance(field_value, str) and field_value.lower() in ["true", "false"]:
             field_value = field_value.lower() == "true"
 
+        # Type compatibility check for numeric operators
+        # If operator requires numeric comparison, both values must be numeric
+        # Exception: Arrays are handled specially in the operator logic below
+        if operator in ["gt", "gte", "lt", "lte", ">", ">=", "<", "<="]:
+            # Skip check if field_value is an array - handled by array logic below
+            if not isinstance(field_value, (list, tuple)):
+                field_is_numeric = isinstance(field_value, (int, float)) and not isinstance(field_value, bool)
+                expected_is_numeric = isinstance(expected_value, (int, float)) and not isinstance(expected_value, bool)
+
+                if not (field_is_numeric and expected_is_numeric):
+                    # At least one value failed numeric conversion
+                    # Cannot perform numeric comparison - return False
+                    return False
+
         try:
             if operator in ["eq", "="]:
                 # Handle array fields - check if ANY element equals expected value
```
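The numeric-compatibility rule reduces to a small predicate; the sketch below mirrors the check added above, including the deliberate exclusion of `bool` (which subclasses `int` in Python):

```python
def is_numeric(value) -> bool:
    # bool is a subclass of int, so exclude it explicitly
    return isinstance(value, (int, float)) and not isinstance(value, bool)

assert is_numeric(5) and is_numeric(3.2)   # both sides numeric: comparison proceeds
assert not is_numeric("5")                 # strings do not qualify for gt/lt
assert not is_numeric(True)                # booleans are not numbers here
```

With this check in place, a comparison such as `"5" gt 3` now returns `False` before the numeric operator logic runs, rather than relying on string coercion.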
```diff
@@ -104,27 +144,30 @@ class ValueComparator:
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
                 # Handle list fields by checking if ANY element contains the expected value
+                # Case-insensitive comparison to match post-processor behavior
                 if isinstance(field_value, list):
                     # For arrays, check if ANY element contains the expected value
-                    return any(str(expected_value) in str(elem) for elem in field_value)
+                    return any(str(expected_value).lower() in str(elem).lower() for elem in field_value)
                 else:
-                    return str(expected_value) in str(field_value)
+                    return str(expected_value).lower() in str(field_value).lower()
             elif operator == "startswith":
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
                 # Handle array fields - check if ANY element starts with expected value
+                # Case-insensitive comparison to match post-processor behavior
                 if isinstance(field_value, (list, tuple)):
-                    return any(str(elem).startswith(str(expected_value)) for elem in field_value)
-                return str(field_value).startswith(str(expected_value))
+                    return any(str(elem).lower().startswith(str(expected_value).lower()) for elem in field_value)
+                return str(field_value).lower().startswith(str(expected_value).lower())
             elif operator == "endswith":
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
                 # Handle array fields - check if ANY element ends with expected value
+                # Case-insensitive comparison to match post-processor behavior
                 if isinstance(field_value, (list, tuple)):
-                    return any(str(elem).endswith(str(expected_value)) for elem in field_value)
-                return str(field_value).endswith(str(expected_value))
+                    return any(str(elem).lower().endswith(str(expected_value).lower()) for elem in field_value)
+                return str(field_value).lower().endswith(str(expected_value).lower())
             elif operator == "in":
                 if isinstance(expected_value, list):
                     if len(expected_value) == 1 and isinstance(field_value, list):
@@ -143,7 +186,13 @@ class ValueComparator:
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
-
+                # Use cached regex compilation for performance
+                try:
+                    pattern = self._compile_regex(str(expected_value))
+                    return bool(pattern.search(str(field_value)))
+                except (re.error, TypeError):
+                    # Invalid regex pattern, fall back to no match
+                    return False
             elif operator == "cidr":
                 # Unwrap single-element lists for CIDR
                 if isinstance(expected_value, list) and len(expected_value) == 1:
@@ -194,22 +243,31 @@ class ValueComparator:
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
-
+                # Case-insensitive comparison to match post-processor behavior
+                return str(expected_value).lower() not in str(field_value).lower()
             elif operator == "not_startswith":
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
-
+                # Case-insensitive comparison to match post-processor behavior
+                return not str(field_value).lower().startswith(str(expected_value).lower())
             elif operator == "not_endswith":
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
-
+                # Case-insensitive comparison to match post-processor behavior
+                return not str(field_value).lower().endswith(str(expected_value).lower())
             elif operator == "not_regexp":
                 # Unwrap single-element lists for string operators
                 if isinstance(expected_value, list) and len(expected_value) == 1:
                     expected_value = expected_value[0]
-
+                # Use cached regex compilation for performance
+                try:
+                    pattern = self._compile_regex(str(expected_value))
+                    return not bool(pattern.search(str(field_value)))
+                except (re.error, TypeError):
+                    # Invalid regex pattern, fall back to match (not regexp succeeds)
+                    return True
             elif operator == "not_cidr":
                 # Unwrap single-element lists for CIDR
                 if isinstance(expected_value, list) and len(expected_value) == 1:
```
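The case-insensitivity change is easy to verify in isolation; a minimal sketch of the new `contains`/`startswith` semantics shown above:

```python
def contains(field_value, expected_value) -> bool:
    # Case-insensitive substring check, matching the 0.2.5 behavior
    return str(expected_value).lower() in str(field_value).lower()

def startswith(field_value, expected_value) -> bool:
    return str(field_value).lower().startswith(str(expected_value).lower())

assert contains("Mozilla/5.0 (Windows NT 10.0)", "mozilla")   # case differs, still matches
assert startswith("ERROR: disk full", "error")                # previously case-sensitive
```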