prismiq 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
prismiq/timeseries.py ADDED
@@ -0,0 +1,410 @@
1
+ """Time series bucketing utilities for Prismiq analytics.
2
+
3
+ This module provides utilities for grouping data by time intervals,
4
+ generating time buckets, and filling missing data points.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import datetime, timedelta
10
+ from enum import Enum
11
+ from typing import Any
12
+
13
+ from pydantic import BaseModel, ConfigDict
14
+
15
+
16
class TimeInterval(str, Enum):
    """Granularities that time series data can be bucketed by.

    Members are declared from the finest to the coarsest granularity.
    Because the class also derives from ``str``, every member compares
    equal to its lowercase value (``TimeInterval.DAY == "day"``).
    """

    # Sub-day granularities.
    MINUTE = "minute"
    HOUR = "hour"

    # Calendar granularities.
    DAY = "day"
    WEEK = "week"
    MONTH = "month"
    QUARTER = "quarter"
    YEAR = "year"
26
+
27
+
28
class TimeBucket(BaseModel):
    """One labelled slice of a time range.

    The model is configured with ``strict=True``.
    """

    model_config = ConfigDict(strict=True)

    start: datetime
    """First instant that belongs to the bucket (inclusive)."""

    end: datetime
    """End of the bucket.

    ``generate_time_buckets`` sets this to the last microsecond before the
    next bucket starts.
    """

    label: str
    """Display label for the bucket, e.g. 'Jan 2024'."""
41
+
42
+
43
def get_date_trunc_sql(interval: TimeInterval, column: str) -> str:
    """Build a PostgreSQL ``date_trunc`` call for a column.

    The column is emitted as a double-quoted identifier. Any double quote
    inside the name is doubled so that the name cannot close the identifier
    early.

    Args:
        interval: Granularity to truncate to; its ``value`` becomes the
            first ``date_trunc`` argument.
        column: Name of the column to truncate.

    Returns:
        SQL fragment such as: date_trunc('day', "order_date")

    Example:
        >>> get_date_trunc_sql(TimeInterval.DAY, "order_date")
        'date_trunc(\\'day\\', "order_date")'
    """
    # Double every embedded quote, then wrap the whole name in quotes.
    identifier = '"{}"'.format(column.replace('"', '""'))
    return "date_trunc('{}', {})".format(interval.value, identifier)
62
+
63
+
64
def get_interval_format(interval: TimeInterval) -> str:
    """Look up the date format pattern associated with an interval.

    Args:
        interval: Time interval to look up.

    Returns:
        Pattern intended for datetime.strftime(). The quarter pattern
        contains ``%q``, which is used here as a placeholder marker and
        needs post-processing by the caller.

    Raises:
        KeyError: If ``interval`` is not one of the mapped intervals.

    Example:
        >>> get_interval_format(TimeInterval.DAY)
        '%Y-%m-%d'
    """
    patterns_by_interval: dict[TimeInterval, str] = {
        # Calendar granularities, coarsest first.
        TimeInterval.YEAR: "%Y",
        TimeInterval.QUARTER: "%Y-Q%q",  # "%q" is a marker to post-process
        TimeInterval.MONTH: "%Y-%m",
        TimeInterval.WEEK: "%Y-W%W",
        TimeInterval.DAY: "%Y-%m-%d",
        # Sub-day granularities.
        TimeInterval.HOUR: "%Y-%m-%d %H:00",
        TimeInterval.MINUTE: "%Y-%m-%d %H:%M",
    }
    return patterns_by_interval[interval]
88
+
89
+
90
def _format_bucket_label(dt: datetime, interval: TimeInterval) -> str:
    """Format a datetime as a human-readable bucket label.

    Args:
        dt: Datetime to format (normally the start of a bucket).
        interval: Time interval that selects the label style.

    Returns:
        Human-readable label. Minute, hour and day labels do not include
        the year. Week labels use the ISO week number together with the
        ISO year. Intervals that are not recognised fall back to
        ``dt.isoformat()``.
    """
    if interval == TimeInterval.MINUTE:
        return dt.strftime("%b %d, %H:%M")

    if interval == TimeInterval.HOUR:
        return dt.strftime("%b %d, %H:00")

    if interval == TimeInterval.DAY:
        return dt.strftime("%b %d")

    if interval == TimeInterval.WEEK:
        # ISO week numbers belong to the ISO year, which differs from the
        # calendar year around New Year: 2024-12-30 is week 1 of ISO year
        # 2025. Pairing the ISO week with ``dt.year`` would label that week
        # "Week 1, 2024" and collide with the week starting 2024-01-01.
        iso_year, iso_week, _ = dt.isocalendar()
        return f"Week {iso_week}, {iso_year}"

    if interval == TimeInterval.MONTH:
        return dt.strftime("%b %Y")

    if interval == TimeInterval.QUARTER:
        # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
        quarter = (dt.month - 1) // 3 + 1
        return f"Q{quarter} {dt.year}"

    if interval == TimeInterval.YEAR:
        return str(dt.year)

    # Fallback for unrecognised intervals.
    return dt.isoformat()
126
+
127
+
128
+ def _truncate_datetime(dt: datetime, interval: TimeInterval) -> datetime:
129
+ """Truncate datetime to the start of the given interval.
130
+
131
+ Args:
132
+ dt: Datetime to truncate.
133
+ interval: Interval to truncate to.
134
+
135
+ Returns:
136
+ Truncated datetime.
137
+ """
138
+ # Remove timezone info for consistent handling
139
+ if dt.tzinfo is not None:
140
+ dt = dt.replace(tzinfo=None)
141
+
142
+ if interval == TimeInterval.MINUTE:
143
+ return dt.replace(second=0, microsecond=0)
144
+
145
+ if interval == TimeInterval.HOUR:
146
+ return dt.replace(minute=0, second=0, microsecond=0)
147
+
148
+ if interval == TimeInterval.DAY:
149
+ return dt.replace(hour=0, minute=0, second=0, microsecond=0)
150
+
151
+ if interval == TimeInterval.WEEK:
152
+ # Week starts on Monday
153
+ days_since_monday = dt.weekday()
154
+ week_start = dt - timedelta(days=days_since_monday)
155
+ return week_start.replace(hour=0, minute=0, second=0, microsecond=0)
156
+
157
+ if interval == TimeInterval.MONTH:
158
+ return dt.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
159
+
160
+ if interval == TimeInterval.QUARTER:
161
+ quarter = (dt.month - 1) // 3
162
+ quarter_start_month = quarter * 3 + 1
163
+ return dt.replace(
164
+ month=quarter_start_month, day=1, hour=0, minute=0, second=0, microsecond=0
165
+ )
166
+
167
+ if interval == TimeInterval.YEAR:
168
+ return dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
169
+
170
+ return dt
171
+
172
+
173
+ def _get_next_bucket_start(dt: datetime, interval: TimeInterval) -> datetime:
174
+ """Get the start of the next bucket after the given datetime.
175
+
176
+ Args:
177
+ dt: Current bucket start.
178
+ interval: Time interval.
179
+
180
+ Returns:
181
+ Start of the next bucket.
182
+ """
183
+ if interval == TimeInterval.MINUTE:
184
+ return dt + timedelta(minutes=1)
185
+
186
+ if interval == TimeInterval.HOUR:
187
+ return dt + timedelta(hours=1)
188
+
189
+ if interval == TimeInterval.DAY:
190
+ return dt + timedelta(days=1)
191
+
192
+ if interval == TimeInterval.WEEK:
193
+ return dt + timedelta(weeks=1)
194
+
195
+ if interval == TimeInterval.MONTH:
196
+ # Move to next month
197
+ if dt.month == 12:
198
+ return dt.replace(year=dt.year + 1, month=1)
199
+ return dt.replace(month=dt.month + 1)
200
+
201
+ if interval == TimeInterval.QUARTER:
202
+ # Move to next quarter (3 months)
203
+ new_month = dt.month + 3
204
+ if new_month > 12:
205
+ return dt.replace(year=dt.year + 1, month=new_month - 12)
206
+ return dt.replace(month=new_month)
207
+
208
+ if interval == TimeInterval.YEAR:
209
+ return dt.replace(year=dt.year + 1)
210
+
211
+ return dt + timedelta(days=1)
212
+
213
+
214
def _get_bucket_end(bucket_start: datetime, interval: TimeInterval) -> datetime:
    """Compute the last instant that still belongs to a bucket.

    Args:
        bucket_start: Start of the bucket.
        interval: Time interval.

    Returns:
        The datetime one microsecond before the start of the next bucket.
    """
    smallest_step = timedelta(microseconds=1)
    return _get_next_bucket_start(bucket_start, interval) - smallest_step
227
+
228
+
229
def generate_time_buckets(
    start: datetime,
    end: datetime,
    interval: TimeInterval,
) -> list[TimeBucket]:
    """Build the consecutive buckets that cover a datetime range.

    Timezone information on ``start`` and ``end`` is dropped (wall-clock
    fields are kept), and ``start`` is snapped back to the beginning of its
    interval, so the first bucket may begin before ``start``.

    Args:
        start: Start datetime (inclusive).
        end: End datetime (inclusive).
        interval: Time interval for bucketing.

    Returns:
        List of TimeBucket objects in chronological order. The list is
        empty when the truncated start lies after ``end``.

    Example:
        >>> from datetime import datetime
        >>> start = datetime(2024, 1, 1)
        >>> end = datetime(2024, 1, 3)
        >>> buckets = generate_time_buckets(start, end, TimeInterval.DAY)
        >>> len(buckets)
        3
    """
    # Compare naive values only, so aware and naive inputs can be mixed.
    first_moment = start if start.tzinfo is None else start.replace(tzinfo=None)
    last_moment = end if end.tzinfo is None else end.replace(tzinfo=None)

    collected: list[TimeBucket] = []
    cursor = _truncate_datetime(first_moment, interval)

    # A bucket is emitted as long as it starts on or before the range end.
    while cursor <= last_moment:
        collected.append(
            TimeBucket(
                start=cursor,
                end=_get_bucket_end(cursor, interval),
                label=_format_bucket_label(cursor, interval),
            )
        )
        cursor = _get_next_bucket_start(cursor, interval)

    return collected
278
+
279
+
280
def _infer_bucket_interval(span: timedelta) -> TimeInterval:
    """Guess the bucketing interval from a bucket spacing or duration."""
    # Upper bounds are checked from the finest interval to the coarsest.
    # The month and quarter bounds are deliberately loose (32 and 100 days)
    # so that every calendar month and quarter length fits.
    upper_bounds = (
        (timedelta(minutes=1), TimeInterval.MINUTE),
        (timedelta(hours=1), TimeInterval.HOUR),
        (timedelta(days=1), TimeInterval.DAY),
        (timedelta(weeks=1), TimeInterval.WEEK),
        (timedelta(days=32), TimeInterval.MONTH),
        (timedelta(days=100), TimeInterval.QUARTER),
    )
    for upper_bound, candidate in upper_bounds:
        if span <= upper_bound:
            return candidate
    return TimeInterval.YEAR


def fill_missing_buckets(
    data: list[dict[str, Any]],
    date_column: str,
    buckets: list[TimeBucket],
    fill_value: Any = 0,
) -> list[dict[str, Any]]:
    """Fill missing time buckets with default values.

    Takes query result data and returns one or more rows per bucket, in
    bucket order. Buckets without data get a synthetic row: the date column
    holds the bucket start, columns that are numeric in the first data row
    hold ``fill_value``, and all other columns hold ``None``.

    The bucketing interval is not passed in. It is inferred from the
    spacing of the first two buckets, or from the duration of the only
    bucket when there is just one.

    Rows whose date value is ``None``, or whose truncated date does not
    match the start of any bucket, are not included in the result.

    Args:
        data: List of row dictionaries from query result.
        date_column: Name of the date/datetime column.
        buckets: List of time buckets to ensure coverage.
        fill_value: Value to use for missing numeric data (default: 0).

    Returns:
        List of row dictionaries with missing buckets filled. ``data`` is
        returned unchanged when ``buckets`` is empty. When ``data`` is
        empty, each bucket yields a row that contains only the date column.

    Example:
        >>> data = [
        ...     {"date": datetime(2024, 1, 1), "sales": 100},
        ...     {"date": datetime(2024, 1, 3), "sales": 150},
        ... ]
        >>> buckets = generate_time_buckets(
        ...     datetime(2024, 1, 1), datetime(2024, 1, 3), TimeInterval.DAY
        ... )
        >>> filled = fill_missing_buckets(data, "date", buckets)
        >>> len(filled)  # Now includes Jan 2
        3
    """
    if not buckets:
        return data

    if not data:
        # No data, create empty rows for all buckets.
        return [{date_column: bucket.start} for bucket in buckets]

    # The first row is the template: it decides which columns exist and
    # which of them count as numeric.
    template_row = data[0]

    # The interval is needed to truncate the data dates properly.
    if len(buckets) >= 2:
        span = buckets[1].start - buckets[0].start
    else:
        # Single bucket: fall back to its own duration.
        span = buckets[0].end - buckets[0].start
    interval = _infer_bucket_interval(span)

    # Group the existing rows by the start of the bucket they fall into.
    rows_by_bucket: dict[datetime, list[dict[str, Any]]] = {}
    for row in data:
        date_val = row.get(date_column)
        if date_val is None:
            continue

        if hasattr(date_val, "hour"):
            dt = date_val
        else:
            # A plain date: promote it to a datetime at midnight.
            dt = datetime.combine(date_val, datetime.min.time())

        # Bucket starts are naive, so drop any timezone before matching.
        if getattr(dt, "tzinfo", None) is not None:
            dt = dt.replace(tzinfo=None)

        bucket_start = _truncate_datetime(dt, interval)
        rows_by_bucket.setdefault(bucket_start, []).append(row)

    # The filler for each column depends only on the template row, so it
    # is computed once instead of once per missing bucket.
    fillers = {
        col: fill_value if isinstance(sample, int | float) else None
        for col, sample in template_row.items()
    }

    result: list[dict[str, Any]] = []
    for bucket in buckets:
        existing_rows = rows_by_bucket.get(bucket.start)
        if existing_rows:
            result.extend(existing_rows)
        else:
            result.append(
                {
                    col: bucket.start if col == date_column else fillers[col]
                    for col in template_row
                }
            )

    return result