tellaro-query-language 0.2.2__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tql/stats_evaluator.py CHANGED
@@ -6,7 +6,7 @@ aggregation queries against data records in memory.
 
 import statistics
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from .exceptions import TQLError
 
@@ -58,6 +58,354 @@ class TQLStatsEvaluator:
     def __init__(self):
         """Initialize the stats evaluator."""
 
+    def evaluate_stats_streaming(
+        self,
+        record_iterator: Any,
+        stats_ast: Dict[str, Any],
+        field_mappings: Optional[Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]]] = None,
+    ) -> Dict[str, Any]:
+        """Evaluate stats query against streaming records.
+
+        This method processes records incrementally using accumulators to minimize
+        memory usage for large datasets.
+
+        Args:
+            record_iterator: Iterator/generator yielding records
+            stats_ast: Stats AST from parser
+            field_mappings: Optional field type mappings
+
+        Returns:
+            Aggregated results in UI-friendly format
+        """
+        aggregations = stats_ast.get("aggregations", [])
+        group_by_fields = stats_ast.get("group_by", [])
+
+        # Validate aggregation types against field mappings if provided
+        if field_mappings:
+            self._validate_aggregations(aggregations, field_mappings)
+
+        if not group_by_fields:
+            # Simple aggregation without grouping (streaming accumulators)
+            return self._streaming_simple_aggregation(record_iterator, aggregations)
+        else:
+            # Grouped aggregation (still needs to track groups)
+            return self._streaming_grouped_aggregation(record_iterator, aggregations, group_by_fields)
+
+    def _streaming_simple_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Perform streaming aggregation without grouping.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+
+        Returns:
+            Aggregated results
+        """
+        # Initialize accumulators for each aggregation
+        accumulators = {}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            accumulators[key] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],  # For unique, values, percentiles
+                "unique_set": set(),  # For unique_count
+            }
+
+        # Process records
+        for record in record_iterator:
+            for agg in aggregations:
+                func = agg["function"]
+                field = agg["field"]
+                key = f"{func}_{field}"
+                acc = accumulators[key]
+
+                # Handle count(*)
+                if func == "count" and field == "*":
+                    acc["count"] += 1
+                    continue
+
+                # Get field value
+                value = self._get_field_value(record, field)
+                if value is None:
+                    continue
+
+                # Update accumulator based on function
+                if func == "count":
+                    acc["count"] += 1
+                elif func == "unique_count":
+                    try:
+                        acc["unique_set"].add(value)
+                    except TypeError:
+                        # Unhashable type, use string representation
+                        acc["unique_set"].add(str(value))
+                elif func in ["sum", "min", "max", "average", "avg"]:
+                    numeric_value = self._to_numeric(value)
+                    acc["sum"] += numeric_value
+                    acc["count"] += 1
+                    if acc["min"] is None or numeric_value < acc["min"]:
+                        acc["min"] = numeric_value
+                    if acc["max"] is None or numeric_value > acc["max"]:
+                        acc["max"] = numeric_value
+                elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                    # Need to store all values for these
+                    acc["values"].append(self._to_numeric(value))
+                elif func in ["values", "unique", "cardinality"]:
+                    # Store unique values
+                    if value not in acc["values"]:
+                        acc["values"].append(value)
+
+        # Calculate final results
+        if len(aggregations) == 1:
+            agg = aggregations[0]
+            value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+            return {
+                "type": "simple_aggregation",
+                "function": agg["function"],
+                "field": agg["field"],
+                "alias": agg.get("alias"),
+                "value": value,
+            }
+        else:
+            results = {}
+            for agg in aggregations:
+                value = self._finalize_accumulator(accumulators[f"{agg['function']}_{agg['field']}"], agg)
+                key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                results[key] = value
+            return {"type": "multiple_aggregations", "results": results}
+
+    def _streaming_grouped_aggregation(  # noqa: C901
+        self, record_iterator: Any, aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
+    ) -> Dict[str, Any]:
+        """Perform streaming grouped aggregation.
+
+        For grouped aggregations, we still need to track groups in memory,
+        but we process records one at a time.
+
+        Args:
+            record_iterator: Iterator yielding records
+            aggregations: Aggregation specifications
+            group_by_fields: Fields to group by
+
+        Returns:
+            Grouped aggregation results
+        """
+        # Normalize group_by_fields
+        normalized_fields = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                normalized_fields.append({"field": field, "bucket_size": None})
+            elif isinstance(field, dict):
+                normalized_fields.append(field)
+            else:
+                normalized_fields.append({"field": str(field), "bucket_size": None})
+
+        # Track groups with accumulators
+        groups: Dict[Tuple[Any, ...], Dict[str, Any]] = defaultdict(
+            lambda: self._create_group_accumulators(aggregations)
+        )
+        key_mapping: Dict[Tuple[Any, ...], List[Tuple[str, Any]]] = {}
+
+        # Process records
+        for record in record_iterator:
+            # Build group key
+            key_parts = []
+            for field_spec in normalized_fields:
+                field_name = field_spec.get("field")
+                if field_name is None:
+                    continue
+                value = self._get_field_value(record, field_name)
+                key_parts.append((field_name, value))
+
+            hashable_key = self._make_hashable_key(key_parts)
+
+            # Store key mapping
+            if hashable_key not in key_mapping:
+                key_mapping[hashable_key] = key_parts
+
+            # Update accumulators for this group
+            group_accs = groups[hashable_key]
+            self._update_group_accumulators(group_accs, record, aggregations)
+
+        # Finalize results
+        results = []
+        for hashable_key, group_accs in groups.items():
+            original_key = key_mapping[hashable_key]
+            group_result = {"key": dict(original_key), "doc_count": group_accs["doc_count"]}
+
+            if len(aggregations) == 1:
+                agg = aggregations[0]
+                value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                agg_key = agg.get("alias") or agg["function"]
+                group_result[agg_key] = value
+            else:
+                group_result["aggregations"] = {}
+                for agg in aggregations:
+                    value = self._finalize_accumulator(group_accs[f"{agg['function']}_{agg['field']}"], agg)
+                    agg_key = agg.get("alias") or f"{agg['function']}_{agg['field']}"
+                    group_result["aggregations"][agg_key] = value
+
+            results.append(group_result)
+
+        # Apply modifiers and bucket limits
+        results = self._apply_modifiers(results, aggregations)
+        results = self._apply_bucket_limits(results, normalized_fields)
+
+        # Extract field names for response
+        group_by_field_names = []
+        for field in group_by_fields:
+            if isinstance(field, str):
+                group_by_field_names.append(field)
+            elif isinstance(field, dict) and "field" in field:
+                group_by_field_names.append(field["field"])
+            else:
+                group_by_field_names.append(str(field))
+
+        return {"type": "grouped_aggregation", "group_by": group_by_field_names, "results": results}
+
+    def _create_group_accumulators(self, aggregations: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Create accumulator structure for a single group.
+
+        Args:
+            aggregations: Aggregation specifications
+
+        Returns:
+            Dictionary of accumulators
+        """
+        accumulators: Dict[str, Any] = {"doc_count": 0}
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+
+            acc_value: Dict[str, Any] = {
+                "function": func,
+                "field": field,
+                "count": 0,
+                "sum": 0,
+                "min": None,
+                "max": None,
+                "values": [],
+                "unique_set": set(),
+            }
+            accumulators[key] = acc_value
+        return accumulators
+
+    def _update_group_accumulators(  # noqa: C901
+        self, group_accs: Dict[str, Any], record: Dict[str, Any], aggregations: List[Dict[str, Any]]
+    ) -> None:
+        """Update group accumulators with a new record.
+
+        Args:
+            group_accs: Group accumulators dictionary
+            record: Record to process
+            aggregations: Aggregation specifications
+        """
+        group_accs["doc_count"] += 1
+
+        for agg in aggregations:
+            func = agg["function"]
+            field = agg["field"]
+            key = f"{func}_{field}"
+            acc = group_accs[key]
+
+            # Handle count(*)
+            if func == "count" and field == "*":
+                acc["count"] += 1
+                continue
+
+            # Get field value
+            value = self._get_field_value(record, field)
+            if value is None:
+                continue
+
+            # Update accumulator
+            if func == "count":
+                acc["count"] += 1
+            elif func == "unique_count":
+                try:
+                    acc["unique_set"].add(value)
+                except TypeError:
+                    acc["unique_set"].add(str(value))
+            elif func in ["sum", "min", "max", "average", "avg"]:
+                numeric_value = self._to_numeric(value)
+                acc["sum"] += numeric_value
+                acc["count"] += 1
+                if acc["min"] is None or numeric_value < acc["min"]:
+                    acc["min"] = numeric_value
+                if acc["max"] is None or numeric_value > acc["max"]:
+                    acc["max"] = numeric_value
+            elif func in ["median", "med", "percentile", "percentiles", "p", "pct", "std", "standard_deviation"]:
+                acc["values"].append(self._to_numeric(value))
+            elif func in ["values", "unique", "cardinality"]:
+                if value not in acc["values"]:
+                    acc["values"].append(value)
+
+    def _finalize_accumulator(self, acc: Dict[str, Any], agg_spec: Dict[str, Any]) -> Any:  # noqa: C901
+        """Finalize an accumulator to produce the final aggregation value.
+
+        Args:
+            acc: Accumulator dictionary
+            agg_spec: Aggregation specification
+
+        Returns:
+            Final aggregated value
+        """
+        func = agg_spec["function"]
+
+        if func == "count":
+            return acc["count"]
+        elif func == "unique_count":
+            return len(acc["unique_set"])
+        elif func == "sum":
+            return acc["sum"]
+        elif func == "min":
+            return acc["min"]
+        elif func == "max":
+            return acc["max"]
+        elif func in ["average", "avg"]:
+            return acc["sum"] / acc["count"] if acc["count"] > 0 else None
+        elif func in ["median", "med"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            return statistics.median(sorted_values)
+        elif func in ["std", "standard_deviation"]:
+            if len(acc["values"]) < 2:
+                return None
+            return statistics.stdev(acc["values"])
+        elif func in ["percentile", "percentiles", "p", "pct"]:
+            if not acc["values"]:
+                return None
+            sorted_values = sorted(acc["values"])
+            percentile_values = agg_spec.get("percentile_values", [50])
+
+            if len(percentile_values) == 1:
+                return self._calculate_percentile(sorted_values, percentile_values[0])
+            else:
+                result = {}
+                for p in percentile_values:
+                    result[f"p{int(p)}"] = self._calculate_percentile(sorted_values, p)
+                return result
+        elif func in ["values", "unique", "cardinality"]:
+            unique_values = acc["values"]
+            try:
+                unique_values.sort()
+            except TypeError:
+                pass
+            return unique_values
+        else:
+            return None
+
     def evaluate_stats(
         self, records: List[Dict[str, Any]], stats_ast: Dict[str, Any], field_mappings: Optional[Dict[str, str]] = None
     ) -> Dict[str, Any]:
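
The new evaluate_stats_streaming entry point accepts any iterator or generator, so large datasets can be aggregated without first being materialized as a list. A minimal usage sketch (the stats AST is hand-built here with the "aggregations" and "group_by" keys the method reads; in practice it would come from the TQL parser, and the record shape and field names are hypothetical):

from tql.stats_evaluator import TQLStatsEvaluator

def read_records():
    # Stand-in for a large source (file, database cursor, message queue):
    # records are yielded one at a time, never held in memory together.
    for i in range(1_000_000):
        yield {"bytes": i % 4096}

stats_ast = {
    "aggregations": [{"function": "average", "field": "bytes", "alias": "avg_bytes"}],
    "group_by": [],
}

result = TQLStatsEvaluator().evaluate_stats_streaming(read_records(), stats_ast)
# With a single aggregation and no grouping this returns:
# {"type": "simple_aggregation", "function": "average", "field": "bytes",
#  "alias": "avg_bytes", "value": <sum / count over all records>}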
@@ -85,7 +433,11 @@ class TQLStatsEvaluator:
             # Grouped aggregation
             return self._grouped_aggregation(records, aggregations, group_by_fields)
 
-    def _validate_aggregations(self, aggregations: List[Dict[str, Any]], field_mappings: Dict[str, str]) -> None:
+    def _validate_aggregations(
+        self,
+        aggregations: List[Dict[str, Any]],
+        field_mappings: Union[Dict[str, str], Dict[str, Union[str, Dict[str, Any]]]],
+    ) -> None:
         """Validate that aggregation functions are compatible with field types.
 
         Args:
@@ -146,7 +498,7 @@ class TQLStatsEvaluator:
 
         return {"type": "multiple_aggregations", "results": results}
 
-    def _grouped_aggregation(
+    def _grouped_aggregation(  # noqa: C901
        self, records: List[Dict[str, Any]], aggregations: List[Dict[str, Any]], group_by_fields: List[Any]
     ) -> Dict[str, Any]:
         """Perform aggregation with grouping.
@@ -180,7 +532,9 @@ class TQLStatsEvaluator:
             # Build group key
             key_parts = []
             for field_spec in normalized_fields:
-                field_name = field_spec["field"]
+                field_name = field_spec.get("field")
+                if field_name is None:
+                    continue
                 value = self._get_field_value(record, field_name)
                 key_parts.append((field_name, value))
 
@@ -371,7 +725,7 @@ class TQLStatsEvaluator:
 
         return results
 
-    def _apply_bucket_limits(
+    def _apply_bucket_limits(  # noqa: C901
         self, results: List[Dict[str, Any]], normalized_fields: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
         """Apply per-field bucket size limits to results.
@@ -407,8 +761,8 @@ class TQLStatsEvaluator:
         filtered_results = []
 
         # Track unique values at each level
-        level_values = {}
-        for level, field_spec in enumerate(normalized_fields):
+        level_values: Dict[int, Dict[Any, Set[Any]]] = {}
+        for level, _field_spec in enumerate(normalized_fields):
             level_values[level] = {}
 
         for result in results:
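
For grouped queries the streaming path keeps one accumulator per distinct group key, so peak memory scales with the number of groups rather than the number of records. A sketch under the same assumptions as the example above (hypothetical host/bytes records):

from tql.stats_evaluator import TQLStatsEvaluator

events = iter([
    {"host": "web-1", "bytes": 100},
    {"host": "web-2", "bytes": 250},
    {"host": "web-1", "bytes": 300},
])

stats_ast = {
    "aggregations": [{"function": "sum", "field": "bytes"}],
    # Plain strings are normalized internally to {"field": "host", "bucket_size": None}.
    "group_by": ["host"],
}

result = TQLStatsEvaluator().evaluate_stats_streaming(events, stats_ast)
# result["results"] holds one bucket per distinct host, e.g.
# {"key": {"host": "web-1"}, "doc_count": 2, "sum": 400}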