tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tellaro_query_language-0.2.6.dist-info/LICENSE +72 -0
- tellaro_query_language-0.2.6.dist-info/METADATA +806 -0
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/RECORD +23 -20
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/entry_points.txt +1 -0
- tql/cache/base.py +36 -2
- tql/cache/memory.py +53 -6
- tql/cache/redis.py +52 -11
- tql/cli.py +484 -0
- tql/core.py +244 -5
- tql/evaluator.py +1 -1
- tql/evaluator_components/special_expressions.py +62 -10
- tql/evaluator_components/value_comparison.py +0 -4
- tql/exceptions.py +6 -4
- tql/field_type_inference.py +285 -0
- tql/mutators/geo.py +57 -20
- tql/opensearch_components/query_converter.py +1 -1
- tql/opensearch_stats.py +7 -6
- tql/parser.py +7 -3
- tql/post_processor.py +8 -4
- tql/scripts.py +3 -3
- tql/stats_evaluator.py +357 -5
- tql/streaming_file_processor.py +335 -0
- tellaro_query_language-0.2.3.dist-info/LICENSE +0 -21
- tellaro_query_language-0.2.3.dist-info/METADATA +0 -433
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/WHEEL +0 -0
tql/stats_evaluator.py
CHANGED
@@ -6,7 +6,7 @@ aggregation queries against data records in memory.
 
 import statistics
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from .exceptions import TQLError
 
@@ -58,6 +58,354 @@ class TQLStatsEvaluator:
     def __init__(self):
         """Initialize the stats evaluator."""
 
+    def evaluate_stats_streaming(
+        self,
+        record_iterator: Any,
+        stats_ast: Dict[str, Any],
+        field_mappings: Optional[Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]]] = None,
+    ) -> Dict[str, Any]:
+        """Evaluate stats query against streaming records.
+
+        This method processes records incrementally using accumulators to minimize
+        memory usage for large datasets.
+
+        Args:
+            record_iterator: Iterator/generator yielding records
+            stats_ast: Stats AST from parser
+            field_mappings: Optional field type mappings
+
+        Returns:
+            Aggregated results in UI-friendly format
+        """
+        aggregations = stats_ast.get("aggregations", [])
+        group_by_fields = stats_ast.get("group_by", [])
+
+        # Validate aggregation types against field mappings if provided
+        if field_mappings:
+            self._validate_aggregations(aggregations, field_mappings)
+
+        if not group_by_fields:
+            # Simple aggregation without grouping (streaming accumulators)
+            return self._streaming_simple_aggregation(record_iterator, aggregations)
+        else:
+            # Grouped aggregation (still needs to track groups)
+            return self._streaming_grouped_aggregation(record_iterator, aggregations, group_by_fields)
+
+    def _streaming_simple_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Perform streaming aggregation without grouping.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+
+        Returns:
+            Aggregated results
+        """
+        # Initialize accumulators for each aggregation
+        accumulators = {}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            accumulators[key] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],  # For unique, values, percentiles
+                "unique_set": set(),  # For unique_count
+            }
+
+        # Process records
+        for record in record_iterator:
+            for agg in aggregations:
+                func = agg["function"]
+                field = agg["field"]
+                key = f"{func}_{field}"
+                acc = accumulators[key]
+
+                # Handle count(*)
+                if func == "count" and field == "*":
+                    acc["count"] += 1
+                    continue
+
+                # Get field value
+                value = self._get_field_value(record, field)
+                if value is None:
+                    continue
+
+                # Update accumulator based on function
+                if func == "count":
+                    acc["count"] += 1
+                elif func == "unique_count":
+                    try:
+                        acc["unique_set"].add(value)
+                    except TypeError:
+                        # Unhashable type, use string representation
+                        acc["unique_set"].add(str(value))
+                elif func in ["sum", "min", "max", "average", "avg"]:
+                    numeric_value = self._to_numeric(value)
+                    acc["sum"] += numeric_value
+                    acc["count"] += 1
+                    if acc["min"] is None or numeric_value < acc["min"]:
+                        acc["min"] = numeric_value
+                    if acc["max"] is None or numeric_value > acc["max"]:
+                        acc["max"] = numeric_value
+                elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                    # Need to store all values for these
+                    acc["values"].append(self._to_numeric(value))
+                elif func in ["values", "unique", "cardinality"]:
+                    # Store unique values
+                    if value not in acc["values"]:
+                        acc["values"].append(value)
+
+        # Calculate final results
+        if len(aggregations) == 1:
+            agg = aggregations[0]
+            value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+            return {
+                "type": "simple_aggregation",
+                "function": agg["function"],
+                "field": agg["field"],
+                "alias": agg.get("alias"),
+                "value": value,
+            }
+        else:
+            results = {}
+            for agg in aggregations:
+                value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+                key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                results[key] = value
+            return {"type": "multiple_aggregations", "results": results}
+
+    def _streaming_grouped_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
+    ) -> Dict[str, Any]:
+        """Perform streaming grouped aggregation.
+
+        For grouped aggregations, we still need to track groups in memory,
+        but we process records one at a time.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+            group_by_fields: Fields to group by
+
+        Returns:
+            Grouped aggregation results
+        """
+        # Normalize group_by_fields
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append({"field": field, "bucket_size": None})
+            elif isinstance(field, dict):
+                normalized_fields.append(field)
+            else:
+                normalized_fields.append({"field": str(field), "bucket_size": None})
+
+        # Track groups with accumulators
+        groups: Dict[Tuple[Any, ...], Dict[str, Any]] = defaultdict(
+            lambda: self._create_group_accumulators(aggregations)
+        )
+        key_mapping: Dict[Tuple[Any, ...], List[Tuple[str, Any]]] = {}
+
+        # Process records
+        for record in record_iterator:
+            # Build group key
+            key_parts = []
+            for field_spec in normalized_fields:
+                field_name = field_spec.get("field")
+                if field_name is None:
+                    continue
+                value = self._get_field_value(record, field_name)
+                key_parts.append((field_name, value))
+
+            hashable_key = self._make_hashable_key(key_parts)
+
+            # Store key mapping
+            if hashable_key not in key_mapping:
+                key_mapping[hashable_key] = key_parts
+
+            # Update accumulators for this group
+            group_accs = groups[hashable_key]
+            self._update_group_accumulators(group_accs, record, aggregations)
+
+        # Finalize results
+        results = []
+        for hashable_key, group_accs in groups.items():
+            original_key = key_mapping[hashable_key]
+            group_result = {"key": dict(original_key), "doc_count": group_accs["doc_count"]}
+
+            if len(aggregations) == 1:
+                agg = aggregations[0]
+                value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                agg_key = agg.get("alias") or agg["function"]
+                group_result[agg_key] = value
+            else:
+                group_result["aggregations"] = {}
+                for agg in aggregations:
+                    value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                    agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                    group_result["aggregations"][agg_key] = value
+
+            results.append(group_result)
+
+        # Apply modifiers and bucket limits
+        results = self._apply_modifiers(results, aggregations)
+        results = self._apply_bucket_limits(results, normalized_fields)
+
+        # Extract field names for response
+        group_by_field_names = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                group_by_field_names.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                group_by_field_names.append(field["field"])
+            else:
+                group_by_field_names.append(str(field))
+
+        return {"type": "grouped_aggregation", "group_by": group_by_field_names, "results": results}
+
+    def _create_group_accumulators(self, aggregations: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Create accumulator structure for a single group.
+
+        Args:
+            aggregations: Aggregation specifications
+
+        Returns:
+            Dictionary of accumulators
+        """
+        accumulators: Dict[str, Any] = {"doc_count": 0}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            acc_value: Dict[str, Any] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],
+                "unique_set": set(),
+            }
+            accumulators[key] = acc_value
+        return accumulators
+
+    def _update_group_accumulators(  # noqa: C901
+        self, group_accs: Dict[str, Any], record: Dict[str, Any], aggregations: List[Dict[str, Any]]
+    ) -> None:
+        """Update group accumulators with a new record.
+
+        Args:
+            group_accs: Group accumulators dictionary
+            record: Record to process
+            aggregations: Aggregation specifications
+        """
+        group_accs["doc_count"] += 1
+
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+            acc = group_accs[key]
+
+            # Handle count(*)
+            if func == "count" and field == "*":
+                acc["count"] += 1
+                continue
+
+            # Get field value
+            value = self._get_field_value(record, field)
+            if value is None:
+                continue
+
+            # Update accumulator
+            if func == "count":
+                acc["count"] += 1
+            elif func == "unique_count":
+                try:
+                    acc["unique_set"].add(value)
+                except TypeError:
+                    acc["unique_set"].add(str(value))
+            elif func in ["sum", "min", "max", "average", "avg"]:
+                numeric_value = self._to_numeric(value)
+                acc["sum"] += numeric_value
+                acc["count"] += 1
+                if acc["min"] is None or numeric_value < acc["min"]:
+                    acc["min"] = numeric_value
+                if acc["max"] is None or numeric_value > acc["max"]:
+                    acc["max"] = numeric_value
+            elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                acc["values"].append(self._to_numeric(value))
+            elif func in ["values", "unique", "cardinality"]:
+                if value not in acc["values"]:
+                    acc["values"].append(value)
+
+    def _finalize_accumulator(self, acc: Dict[str, Any], agg_spec: Dict[str, Any]) -> Any:  # noqa: C901
+        """Finalize an accumulator to produce the final aggregation value.
+
+        Args:
+            acc: Accumulator dictionary
+            agg_spec: Aggregation specification
+
+        Returns:
+            Final aggregated value
+        """
+        func = agg_spec["function"]
+
+        if func == "count":
+            return acc["count"]
+        elif func == "unique_count":
+            return len(acc["unique_set"])
+        elif func == "sum":
+            return acc["sum"]
+        elif func == "min":
+            return acc["min"]
+        elif func == "max":
+            return acc["max"]
+        elif func in ["average", "avg"]:
+            return acc["sum"] / acc["count"] if acc["count"] > 0 else None
+        elif func in ["median", "med"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            return statistics.median(sorted_values)
+        elif func in ["std", "standard_deviation"]:
+            if len(acc["values"]) < 2:
+                return None
+            return statistics.stdev(acc["values"])
+        elif func in ["percentile", "percentiles", "p", "pct"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            percentile_values = agg_spec.get("percentile_values", [50])
+
+            if len(percentile_values) == 1:
+                return self._calculate_percentile(sorted_values, percentile_values[0])
+            else:
+                result = {}
+                for p in percentile_values:
+                    result[f"p{int(p)}"] = self._calculate_percentile(sorted_values, p)
+                return result
+        elif func in ["values", "unique", "cardinality"]:
+            unique_values = acc["values"]
+            try:
+                unique_values.sort()
+            except TypeError:
+                pass
+            return unique_values
+        else:
+            return None
+
     def evaluate_stats(
         self, records: List[Dict[str, Any]], stats_ast: Dict[str, Any], field_mappings: Optional[Dict[str, str]] = None
     ) -> Dict[str, Any]:
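For context, a minimal sketch of how the new streaming entry point can be driven. The hand-built stats AST below is an assumption for illustration: it only mirrors the keys evaluate_stats_streaming reads ("aggregations", "group_by", "function", "field", "alias"); in real use the AST comes from the TQL parser, and the file name and record fields are hypothetical.

import json

from tql.stats_evaluator import TQLStatsEvaluator


def read_records(path):
    """Yield one record at a time so only the accumulators stay in memory."""
    with open(path) as fh:
        for line in fh:
            yield json.loads(line)


evaluator = TQLStatsEvaluator()
result = evaluator.evaluate_stats_streaming(
    record_iterator=read_records("events.jsonl"),  # hypothetical NDJSON file
    stats_ast={
        "aggregations": [
            {"function": "unique_count", "field": "src_ip", "alias": None},
            {"function": "average", "field": "bytes", "alias": "avg_bytes"},
        ],
        "group_by": [],  # empty, so this takes the _streaming_simple_aggregation path
    },
)
# With two aggregations and no grouping, the result shape is:
# {"type": "multiple_aggregations",
#  "results": {"unique_count_src_ip": <int>, "avg_bytes": <float or None>}}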
@@ -85,7 +433,11 @@ class TQLStatsEvaluator:
         # Grouped aggregation
         return self._grouped_aggregation(records, aggregations, group_by_fields)
 
-    def _validate_aggregations(self, aggregations: List[Dict[str, Any]], field_mappings: Dict[str, str]) -> None:
+    def _validate_aggregations(
+        self,
+        aggregations: List[Dict[str, Any]],
+        field_mappings: Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]],
+    ) -> None:
         """Validate that aggregation functions are compatible with field types.
 
         Args:
@@ -146,7 +498,7 @@ class TQLStatsEvaluator:
 
         return {"type": "multiple_aggregations", "results": results}
 
-    def _grouped_aggregation(
+    def _grouped_aggregation(  # noqa: C901
         self, records: List[Dict[str, Any]], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
     ) -> Dict[str, Any]:
         """Perform aggregation with grouping.
@@ -373,7 +725,7 @@ class TQLStatsEvaluator:
 
         return results
 
-    def _apply_bucket_limits(
+    def _apply_bucket_limits(  # noqa: C901
         self, results: List[Dict[str, Any]], normalized_fields: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
         """Apply per-field bucket size limits to results.
@@ -410,7 +762,7 @@ class TQLStatsEvaluator:
 
         # Track unique values at each level
         level_values: Dict[int, Dict[Any, Set[Any]]] = {}
-        for level, field_spec in enumerate(normalized_fields):
+        for level, _field_spec in enumerate(normalized_fields):
             level_values[level] = {}
 
         for result in results: