tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to its public registry. The information is provided for informational purposes only.
tql/core.py CHANGED
@@ -4,12 +4,19 @@ This module provides the main TQL class that serves as the primary interface
 for parsing and executing TQL queries against different backends.
 """
 
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Generator, List, Optional, Union
 
 from .analyzer import EnhancedFieldMapping
 from .core_components import FileOperations, OpenSearchOperations, StatsOperations, ValidationOperations
 from .evaluator import TQLEvaluator
-from .exceptions import TQLOperatorError, TQLParseError, TQLSyntaxError, TQLTypeError, TQLValidationError
+from .exceptions import (
+    TQLExecutionError,
+    TQLOperatorError,
+    TQLParseError,
+    TQLSyntaxError,
+    TQLTypeError,
+    TQLValidationError,
+)
 from .mutator_analyzer import MutatorAnalysisResult
 from .parser import TQLParser
 from .stats_evaluator import TQLStatsEvaluator
@@ -27,7 +34,7 @@ class TQL:
     >>> results = tql.query(data, query)
     """
 
-    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):
+    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):  # noqa: C901
        """Initialize TQL instance.
 
        Args:
@@ -109,8 +116,9 @@ class TQL:
            else:
                # Find primary field (keys without dots, not starting with underscore)
                primary_fields = [
-                   field_key for field_key in v.keys()
-                   if '.' not in field_key and not field_key.startswith('_')
+                   field_key
+                   for field_key in v.keys()
+                   if "." not in field_key and not field_key.startswith("_")
                ]
 
                if primary_fields:
@@ -1049,6 +1057,237 @@ class TQL:
        """
        return self.stats_ops.analyze_stats_query(query)
 
+    def query_file_streaming(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Generator[Dict[str, Any], None, None]:
+        """Execute a TQL query against a file in streaming mode.
+
+        This method processes files line-by-line with minimal memory usage,
+        yielding matching records as they are found.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (filter query only, not stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Yields:
+            Matching records as dictionaries
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+
+        # Validate query type (only filter queries supported for streaming)
+        query_type = ast.get("type")
+        if query_type in ["stats_expr", "query_with_stats"]:
+            raise TQLExecutionError("Stats queries not supported in streaming mode. Use query_file_stats() instead.")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process file and evaluate query on each record
+        for record in processor.process_file(file_path, input_format):
+            if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                yield record
+
+    def query_file_stats(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL stats query against a file in streaming mode.
+
+        This method processes files line-by-line with accumulator-based stats
+        calculations for memory efficiency.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (can include filters and stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing aggregation results
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Handle different query types
+        if query_type == "stats_expr":
+            # Pure stats query - process all records
+            record_iter = processor.process_file(file_path, input_format)
+            return self.stats_evaluator.evaluate_stats_streaming(record_iter, ast, self.field_mappings)
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            # Create filtered iterator
+            def filtered_records():
+                for record in processor.process_file(file_path, input_format):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            return self.stats_evaluator.evaluate_stats_streaming(filtered_records(), stats_ast, self.field_mappings)
+
+        else:
+            # Regular filter query - shouldn't use stats method
+            raise TQLExecutionError("Use query_file_streaming() for filter queries without stats aggregations.")
+
+    def query_folder(
+        self,
+        folder_path: str,
+        query: str,
+        pattern: str = "*",
+        input_format: str = "auto",
+        recursive: bool = False,
+        parallel: int = 4,
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL query against multiple files in a folder.
+
+        This method processes all matching files and aggregates results,
+        supporting both filter queries (with records) and stats queries.
+
+        Args:
+            folder_path: Path to folder
+            query: TQL query string
+            pattern: Glob pattern for file matching
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            recursive: Process subdirectories recursively
+            parallel: Number of parallel workers
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing results and/or stats aggregated across all files
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If folder processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process folder based on query type
+        if query_type == "stats_expr":
+            # Pure stats query - aggregate across all files
+
+            def all_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(all_records(), ast, self.field_mappings)
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            def filtered_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(
+                filtered_records(), stats_ast, self.field_mappings
+            )
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        else:
+            # Regular filter query - collect matching records from all files
+            matched_records = []
+            files_processed = 0
+            files_with_matches = 0
+
+            for file_path, record in processor.process_folder(folder_path, pattern, input_format, recursive, parallel):
+                files_processed += 1
+                if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                    matched_records.append({"_source_file": file_path, **record})
+                    files_with_matches += 1
+
+            return {
+                "results": matched_records,
+                "total": len(matched_records),
+                "files_processed": files_processed,
+                "files_with_matches": files_with_matches,
+            }
+
     def _apply_mutators_to_record(self, ast: Dict[str, Any], record: Dict[str, Any]) -> Dict[str, Any]:
        """Apply any mutators in the AST to enrich the record.
 
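
A minimal usage sketch of the three new file-query entry points added above. This is not part of the diff: the `from tql import TQL` import path, the file paths, and the query strings are assumptions/placeholders, since the TQL query syntax itself is not shown here.

from tql import TQL  # assumed import path for the package's main class

tql = TQL()
filter_query = "<a TQL filter expression>"  # placeholder, not real syntax
stats_query = "<a TQL stats expression>"    # placeholder, not real syntax

# query_file_streaming(): constant-memory generator of matching records.
for record in tql.query_file_streaming("events.jsonl", filter_query, input_format="jsonl"):
    print(record)

# query_file_stats(): streaming aggregation; returns a single dict of results.
aggregations = tql.query_file_stats("events.jsonl", stats_query)

# query_folder(): runs the query across every matching file and reports counters
# using the result keys shown in the diff above.
result = tql.query_folder("logs/", filter_query, pattern="*.jsonl", recursive=True, parallel=4)
print(result["total"], result["files_processed"], result["files_with_matches"])
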
tql/evaluator.py CHANGED
@@ -67,7 +67,7 @@ class TQLEvaluator:
        field_mappings = field_mappings or {}
        return self._evaluate_node(ast, record, field_mappings)
 
-    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:
+    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:  # noqa: C901
        """Evaluate a single AST node against a record.
 
        Args:
@@ -15,15 +15,27 @@ class SpecialExpressionEvaluator:
    # Sentinel value to distinguish missing fields from None values
    _MISSING_FIELD = object()
 
-    def __init__(self, get_field_value_func, evaluate_node_func):
+    def __init__(self, get_field_value_func, evaluate_node_func, set_field_value_func=None):
        """Initialize the special expression evaluator.
 
        Args:
            get_field_value_func: Function to get field values from records
            evaluate_node_func: Function to evaluate AST nodes
+           set_field_value_func: Optional function to set field values in records
        """
        self._get_field_value = get_field_value_func
        self._evaluate_node = evaluate_node_func
+       self._set_field_value = set_field_value_func or self._default_set_field_value
+
+   def _default_set_field_value(self, record: Dict[str, Any], field_path: str, value: Any) -> None:
+       """Default implementation of set_field_value for nested field assignment."""
+       parts = field_path.split(".")
+       current = record
+       for part in parts[:-1]:
+           if part not in current:
+               current[part] = {}
+           current = current[part]
+       current[parts[-1]] = value
 
    def evaluate_geo_expr(  # noqa: C901
        self, node: Dict[str, Any], record: Dict[str, Any], field_mappings: Dict[str, str]
@@ -106,19 +118,26 @@
        elif "as" in record:
            geo_data["as"] = record["as"]
        else:
-           # Default locations
+           # Default locations (ECS style)
            if "." in actual_field:
-               # For nested fields like destination.ip, check destination.geo
+               # For nested fields like destination.ip, check destination.geo and destination.as
                parent_path = actual_field.rsplit(".", 1)[0]
                parent = self._get_field_value(record, parent_path)
-               if isinstance(parent, dict) and "geo" in parent:
-                   # Found geo data under parent
-                   geo_data = parent
+               if isinstance(parent, dict) and ("geo" in parent or "as" in parent):
+                   # Found geo/as data under parent
+                   geo_data = {}
+                   if "geo" in parent:
+                       geo_data["geo"] = parent["geo"]
+                   if "as" in parent:
+                       geo_data["as"] = parent["as"]
            else:
-               # For top-level fields, check enrichment.geo
-               if "enrichment" in record and isinstance(record["enrichment"], dict):
-                   if "geo" in record["enrichment"]:
-                       geo_data = record["enrichment"]
+               # For top-level fields like ip, check top-level geo and as fields (ECS style)
+               if "geo" in record or "as" in record:
+                   geo_data = {}
+                   if "geo" in record:
+                       geo_data["geo"] = record["geo"]
+                   if "as" in record:
+                       geo_data["as"] = record["as"]
 
        # Check if we should use existing geo data or force a new lookup
        force_lookup = geo_params.get("force", False)
@@ -148,6 +167,39 @@
            # Apply geo lookup
            geo_data = apply_mutators(field_value, [geo_mutator], actual_field, record)
 
+       # Always include enrichment in query results (save=True adds to record for output)
+       # Note: This does not modify source files - enrichment only appears in query results
+       save_enrichment = geo_params.get("save", True)
+       if save_enrichment and geo_data and isinstance(geo_data, dict):
+           # Determine where to save the enrichment
+           if custom_field:
+               # Save to custom field location
+               self._set_field_value(record, custom_field, geo_data.get("geo"))
+               if "as" in geo_data:
+                   # Save AS data as sibling to geo field
+                   if "." in custom_field:
+                       as_parent_path = custom_field.rsplit(".", 1)[0]
+                       parent = self._get_field_value(record, as_parent_path)
+                       if isinstance(parent, dict):
+                           parent["as"] = geo_data["as"]
+                   else:
+                       record["as"] = geo_data["as"]
+           elif "." in actual_field:
+               # For nested fields like destination.ip, save to destination.geo and destination.as (ECS style)
+               parent_path = actual_field.rsplit(".", 1)[0]
+               parent = self._get_field_value(record, parent_path)
+               if isinstance(parent, dict):
+                   if "geo" in geo_data:
+                       parent["geo"] = geo_data["geo"]
+                   if "as" in geo_data:
+                       parent["as"] = geo_data["as"]
+           else:
+               # For top-level fields like ip, save to top-level geo and as fields (ECS style)
+               if "geo" in geo_data:
+                   record["geo"] = geo_data["geo"]
+               if "as" in geo_data:
+                   record["as"] = geo_data["as"]
+
        # Now evaluate the conditions against the geo data
        if conditions:
            # Handle None geo_data (e.g., private IPs or lookup failures)
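
The save logic above follows ECS-style placement: for a dotted field such as destination.ip the geo/as enrichment is written as siblings under the same parent, using the nested setter added earlier. Below is a small, self-contained sketch of that behavior; the record contents, enrichment payload, and the standalone set_field_value helper are invented for illustration and are not part of the package.

from typing import Any, Dict


def set_field_value(record: Dict[str, Any], field_path: str, value: Any) -> None:
    # Mirrors _default_set_field_value: walk/create nested dicts along a dotted
    # path and assign the leaf value.
    parts = field_path.split(".")
    current = record
    for part in parts[:-1]:
        if part not in current:
            current[part] = {}
        current = current[part]
    current[parts[-1]] = value


record = {"destination": {"ip": "203.0.113.10"}}
# Hypothetical lookup result; a real payload would come from the geo mutator.
enrichment = {"geo": {"country_iso_code": "US"}, "as": {"number": 64496}}

# ECS-style placement for a nested field: geo and as land next to the queried field.
set_field_value(record, "destination.geo", enrichment["geo"])
set_field_value(record, "destination.as", enrichment["as"])

assert record["destination"]["geo"]["country_iso_code"] == "US"
assert record["destination"]["as"]["number"] == 64496
# A top-level field (e.g. "ip") would instead receive top-level "geo" and "as" keys.
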
@@ -85,10 +85,6 @@ class ValueComparator:
            return False
 
        # Convert numeric strings to numbers for comparison
-       # IMPORTANT: Store original values to check if conversion succeeded
-       field_value_original = field_value
-       expected_value_original = expected_value
-
        field_value = self._convert_numeric(field_value)
        expected_value = self._convert_numeric(expected_value)
 
tql/exceptions.py CHANGED
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
 class TQLError(Exception):
    """Base exception class for all TQL errors."""
 
-    def __init__(
+    def __init__(  # noqa: B042
        self,
        message: str,
        position: Optional[int] = None,
@@ -72,7 +72,7 @@ class TQLParseError(TQLError):
 class TQLTypeError(TQLError):
    """Raised when an operator is incompatible with a field's data type."""
 
-    def __init__(
+    def __init__(  # noqa: B042
        self, field: str, field_type: str, operator: str, valid_operators: Optional[List[str]] = None, **kwargs
    ):
        """Initialize type error with field and operator context."""
@@ -98,7 +98,7 @@ class TQLTypeError(TQLError):
 class TQLFieldError(TQLError):
    """Raised when referencing invalid or non-existent fields."""
 
-    def __init__(self, field: str, available_fields: Optional[List[str]] = None, **kwargs):
+    def __init__(self, field: str, available_fields: Optional[List[str]] = None, **kwargs):  # noqa: B042
        """Initialize field error with available fields context."""
        message = f"Unknown field '{field}'."
 
@@ -147,7 +147,9 @@ class TQLConfigError(TQLError):
 class TQLMutatorError(TQLError):
    """Raised when there's an error applying a mutator."""
 
-    def __init__(self, mutator_name: str, field_name: str, value_type: str, message: Optional[str] = None, **kwargs):
+    def __init__(  # noqa: B042
+        self, mutator_name: str, field_name: str, value_type: str, message: Optional[str] = None, **kwargs
+    ):
        """Initialize mutator error with context."""
        if not message:
            message = (