tencent-wedata-feature-engineering-dev 0.1.49__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.49.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/tempo/intervals.py DELETED
@@ -1,1331 +0,0 @@
- from __future__ import annotations
-
- from functools import cached_property
- from itertools import islice
- from typing import Optional, Iterable, cast, Any, Callable
-
- import numpy as np
- import pandas as pd
- import pyspark.sql.functions as f
- from pyspark.sql.dataframe import DataFrame
- from pyspark.sql.types import (
-     # NB: NumericType is a non-public object, so we shouldn't import it directly
-     ByteType,
-     ShortType,
-     IntegerType,
-     LongType,
-     FloatType,
-     DoubleType,
-     DecimalType,
-     BooleanType,
-     StructField,
- )
- from pyspark.sql.window import Window, WindowSpec
-
-
- def is_metric_col(col: StructField) -> bool:
-     return isinstance(
-         col.dataType,
-         (
-             ByteType,
-             ShortType,
-             IntegerType,
-             LongType,
-             FloatType,
-             DoubleType,
-             DecimalType,
-         ),
-     ) or isinstance(col.dataType, BooleanType)
-
-
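# Editor's note: a minimal sketch, not part of the original file, of how
# is_metric_col classifies fields. The field names are hypothetical, and
# StringType would additionally need to be imported from pyspark.sql.types:
#
#   is_metric_col(StructField("metric_1", IntegerType()))   # True  (numeric)
#   is_metric_col(StructField("is_active", BooleanType()))  # True  (boolean)
#   is_metric_col(StructField("series_1", StringType()))    # False (dimension)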
- class IntervalsDF:
-     """
-     This object is the main wrapper over a `Spark DataFrame`_ which allows a
-     user to parallelize computations over snapshots of metrics for intervals
-     of time defined by a start and end timestamp and various dimensions.
-
-     The required dimensions are `series` (list of columns by which to
-     summarize), `metrics` (list of columns to analyze), `start_ts` (timestamp
-     column), and `end_ts` (timestamp column). `start_ts` and `end_ts` can be
-     epoch or TimestampType.
-
-     .. _`Spark DataFrame`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
-
-     """
-
-     def __init__(
-         self,
-         df: DataFrame,
-         start_ts: str,
-         end_ts: str,
-         series_ids: Optional[Iterable[str]] = None,
-     ) -> None:
-         """
-         Constructor for :class:`IntervalsDF`.
-
-         :param df:
-         :type df: `DataFrame`_
-         :param start_ts:
-         :type start_ts: str
-         :param end_ts:
-         :type end_ts: str
-         :param series_ids:
-         :type series_ids: list[str]
-         :rtype: None
-
-         :Example:
-
-         .. code-block::
-
-             df = spark.createDataFrame(
-                 [["2020-08-01 00:00:09", "2020-08-01 00:00:14", "v1", 5, 0]],
-                 "start_ts STRING, end_ts STRING, series_1 STRING, metric_1 INT, metric_2 INT",
-             )
-             idf = IntervalsDF(df, "start_ts", "end_ts", ["series_1"])
-             idf.df.collect()
-             [Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:14', series_1='v1', metric_1=5, metric_2=0)]
-
-         .. _`DataFrame`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
-
-         .. todo::
-             - create IntervalsSchema class to validate data types and column
-               existence
-             - check elements of series and identifiers to ensure all are str
-             - check if start_ts, end_ts, and the elements of series and
-               identifiers can be of type col
-
-         """
-
-         self.df = df
-
-         self.start_ts = start_ts
-         self.end_ts = end_ts
-
-         if not series_ids:
-             self.series_ids = []
-         elif isinstance(series_ids, str):
-             series_ids = series_ids.split(",")
-             self.series_ids = [s.strip() for s in series_ids]
-         elif isinstance(series_ids, Iterable):
-             self.series_ids = list(series_ids)
-         else:
-             raise ValueError(
-                 f"series_ids must be an Iterable or comma separated string"
-                 f" of column names, instead got {type(series_ids)}"
-             )
-
-         # self.make_disjoint = MakeDisjointBuilder()
-
-     @cached_property
-     def interval_boundaries(self) -> list[str]:
-         return [self.start_ts, self.end_ts]
-
-     @cached_property
-     def structural_columns(self) -> list[str]:
-         return self.interval_boundaries + self.series_ids
-
-     @cached_property
-     def observational_columns(self) -> list[str]:
-         return list(set(self.df.columns) - set(self.structural_columns))
-
-     @cached_property
-     def metric_columns(self) -> list[str]:
-         return [col.name for col in self.df.schema.fields if is_metric_col(col)]
-
-     @cached_property
-     def window(self) -> WindowSpec:
-         return Window.partitionBy(*self.series_ids).orderBy(*self.interval_boundaries)
-
-     @classmethod
-     def fromStackedMetrics(
-         cls,
-         df: DataFrame,
-         start_ts: str,
-         end_ts: str,
-         series: list[str],
-         metrics_name_col: str,
-         metrics_value_col: str,
-         metric_names: Optional[list[str]] = None,
-     ) -> "IntervalsDF":
-         """
-         Returns a new :class:`IntervalsDF` with metrics of the current DataFrame
-         pivoted by start and end timestamp and series.
-
-         There are two versions of `fromStackedMetrics`. One that requires the caller
-         to specify the list of distinct metric names to pivot on, and one that does
-         not. The latter is more concise but less efficient, because Spark needs to
-         first compute the list of distinct metric names internally.
-
-         :param df: :class:`DataFrame` to wrap with :class:`IntervalsDF`
-         :type df: `DataFrame`_
-         :param start_ts: Name of the column which denotes interval start
-         :type start_ts: str
-         :param end_ts: Name of the column which denotes interval end
-         :type end_ts: str
-         :param series: column names
-         :type series: list[str]
-         :param metrics_name_col: column name
-         :type metrics_name_col: str
-         :param metrics_value_col: column name
-         :type metrics_value_col: str
-         :param metric_names: List of metric names that will be translated to
-             columns in the output :class:`IntervalsDF`.
-         :type metric_names: list[str], optional
-         :return: A new :class:`IntervalsDF` with a column and respective
-             values per distinct metric in `metrics_name_col`.
-
-         :Example:
-
-         .. code-block::
-
-             df = spark.createDataFrame(
-                 [["2020-08-01 00:00:09", "2020-08-01 00:00:14", "v1", "metric_1", 5],
-                  ["2020-08-01 00:00:09", "2020-08-01 00:00:11", "v1", "metric_2", 0]],
-                 "start_ts STRING, end_ts STRING, series_1 STRING, metric_name STRING, metric_value INT",
-             )
-
-             # With distinct metric names specified
-
-             idf = IntervalsDF.fromStackedMetrics(
-                 df, "start_ts", "end_ts", ["series_1"], "metric_name", "metric_value", ["metric_1", "metric_2"],
-             )
-             idf.df.collect()
-             [Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:14', series_1='v1', metric_1=5, metric_2=None),
-              Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:11', series_1='v1', metric_1=None, metric_2=0)]
-
-             # Or without specifying metric names (less efficient)
-
-             idf = IntervalsDF.fromStackedMetrics(df, "start_ts", "end_ts", ["series_1"], "metric_name", "metric_value")
-             idf.df.collect()
-             [Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:14', series_1='v1', metric_1=5, metric_2=None),
-              Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:11', series_1='v1', metric_1=None, metric_2=0)]
-
-         .. _`DataFrame`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
-
-         .. todo::
-             - check elements of identifiers to ensure all are str
-             - check if start_ts, end_ts, and the elements of series and
-               identifiers can be of type col
-
-         """
-
-         if not isinstance(series, list):
-             raise ValueError
-
-         df = (
-             df.groupBy(start_ts, end_ts, *series)
-             .pivot(metrics_name_col, values=metric_names)
-             .max(metrics_value_col)
-         )
-
-         return cls(df, start_ts, end_ts, series)
-
-     def make_disjoint(self) -> "IntervalsDF":
-         """
-         Returns a new :class:`IntervalsDF` where metrics of overlapping time intervals
-         are correlated and merged prior to constructing new time interval boundaries (
-         start and end timestamp) so that all intervals are disjoint.
-
-         The merge process assumes that two overlapping intervals cannot simultaneously
-         report two different values for the same metric unless recorded in a data type
-         which supports multiple elements (such as ArrayType, etc.).
-
-         This is often used after :meth:`fromStackedMetrics` to reduce the number of
-         metrics with `null` values and helps when constructing filter predicates to
-         retrieve specific metric values across all instances.
-
-         :return: A new :class:`IntervalsDF` containing disjoint time intervals
-
-         :Example:
-
-         .. code-block::
-
-             df = spark.createDataFrame(
-                 [["2020-08-01 00:00:10", "2020-08-01 00:00:14", "v1", 5, None],
-                  ["2020-08-01 00:00:09", "2020-08-01 00:00:11", "v1", None, 0]],
-                 "start_ts STRING, end_ts STRING, series_1 STRING, metric_1 INT, metric_2 INT",
-             )
-             idf = IntervalsDF(df, "start_ts", "end_ts", ["series_1"])
-             idf.make_disjoint().df.collect()
-             [Row(start_ts='2020-08-01 00:00:09', end_ts='2020-08-01 00:00:10', series_1='v1', metric_1=None, metric_2=0),
-              Row(start_ts='2020-08-01 00:00:10', end_ts='2020-08-01 00:00:11', series_1='v1', metric_1=5, metric_2=0),
-              Row(start_ts='2020-08-01 00:00:11', end_ts='2020-08-01 00:00:14', series_1='v1', metric_1=5, metric_2=None)]
-
-         """
-         # NB: creating local copies of class and instance attributes to be
-         # referenced by UDF because passing complex Python objects, like
-         # classes, is not possible with PyArrow's supported data types
-         # https://arrow.apache.org/docs/python/api/datatypes.html
-         local_start_ts = self.start_ts
-         local_end_ts = self.end_ts
-         local_series_ids = self.series_ids
-
-         disjoint_df = self.df.groupby(self.series_ids).applyInPandas(
-             func=make_disjoint_wrap(
-                 self.start_ts,
-                 self.end_ts,
-                 self.series_ids,
-                 self.metric_columns,
-             ),
-             schema=self.df.schema,
-         )
-
-         return IntervalsDF(
-             disjoint_df,
-             local_start_ts,
-             local_end_ts,
-             local_series_ids,
-         )
-
-     def union(self, other: "IntervalsDF") -> "IntervalsDF":
-         """
-         Returns a new :class:`IntervalsDF` containing the union of rows in this and
-         another :class:`IntervalsDF`.
-
-         This is equivalent to UNION ALL in SQL. To do a SQL-style set union
-         (that does deduplication of elements), use this function followed by
-         distinct().
-
-         Also, as standard in SQL, this function resolves columns by position
-         (not by name).
-
-         Based on `pyspark.sql.DataFrame.union`_.
-
-         :param other: :class:`IntervalsDF` to `union`
-         :type other: :class:`IntervalsDF`
-         :return: A new :class:`IntervalsDF` containing union of rows in this
-             and `other`
-
-         .. _`pyspark.sql.DataFrame.union`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.union.html
-
-         """
-
-         if not isinstance(other, IntervalsDF):
-             raise TypeError
-
-         return IntervalsDF(
-             self.df.union(other.df), self.start_ts, self.end_ts, self.series_ids
-         )
-
-     def unionByName(self, other: "IntervalsDF") -> "IntervalsDF":
-         """
-         Returns a new :class:`IntervalsDF` containing the union of rows in this
-         and another :class:`IntervalsDF`.
-
-         This is different from both UNION ALL and UNION DISTINCT in SQL. To do
-         a SQL-style set union (that does deduplication of elements), use this
-         function followed by distinct().
-
-         Based on `pyspark.sql.DataFrame.unionByName`_; however,
-         `allowMissingColumns` is not supported.
-
-         :param other: :class:`IntervalsDF` to `unionByName`
-         :type other: :class:`IntervalsDF`
-         :return: A new :class:`IntervalsDF` containing union of rows in this
-             and `other`
-
-         .. _`pyspark.sql.DataFrame.unionByName`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.unionByName.html
-
-         """
-
-         if not isinstance(other, IntervalsDF):
-             raise TypeError
-
-         return IntervalsDF(
-             self.df.unionByName(other.df),
-             self.start_ts,
-             self.end_ts,
-             self.series_ids,
-         )
-
-     def toDF(self, stack: bool = False) -> DataFrame:
-         """
-         Returns a new `Spark DataFrame`_ converted from :class:`IntervalsDF`.
-
-         There are two versions of `toDF`. One that will output columns as they exist
-         in :class:`IntervalsDF` and one that will stack metric columns into
-         `metric_name` and `metric_value` columns populated with their respective
-         values. The latter can be thought of as the inverse of
-         :meth:`fromStackedMetrics`.
-
-         Based on `pyspark.sql.DataFrame.toDF`_.
-
-         :param stack: How to handle metric columns in the conversion to a `DataFrame`
-         :type stack: bool, optional
-         :return:
-
-         .. _`Spark DataFrame`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html
-         .. _`pyspark.sql.DataFrame.toDF`: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.toDF.html
-         .. _`STACK`: https://spark.apache.org/docs/latest/api/sql/index.html#stack
-
-         """
-
-         if stack:
-             n_cols = len(self.metric_columns)
-             metric_cols_expr = ",".join(
-                 tuple(f"'{col}', {col}" for col in self.metric_columns)
-             )
-
-             stack_expr = (
-                 f"STACK({n_cols}, {metric_cols_expr}) AS (metric_name, metric_value)"
-             )
-
-             return self.df.select(
-                 *self.interval_boundaries,
-                 *self.series_ids,
-                 f.expr(stack_expr),
-             ).dropna(subset="metric_value")
-
-         else:
-             return self.df
-
-
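# Editor's note: a sketch, not part of the original file, of the SQL expression
# toDF(stack=True) builds above, assuming the hypothetical metric columns
# metric_1 and metric_2:
#
#   STACK(2, 'metric_1', metric_1, 'metric_2', metric_2) AS (metric_name, metric_value)
#
# STACK emits one (metric_name, metric_value) row per metric column for each
# input row; rows whose metric_value is null are then dropped by the trailing
# dropna(subset="metric_value").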
- def identify_interval_overlaps(
-     in_pdf: pd.DataFrame,
-     with_row: pd.Series,
-     interval_start_ts: str,
-     interval_end_ts: str,
- ) -> pd.DataFrame:
-     """
-     return the subset of rows in DataFrame `in_pdf` that overlap with row `with_row`
-     """
-
-     if in_pdf.empty or with_row.empty:
-         # return in_pdf
-         return pd.DataFrame()
-
-     local_in_pdf = in_pdf.copy()
-
-     # https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
-     local_in_pdf["max_start_timestamp"] = [
-         _ if _ >= with_row[interval_start_ts] else with_row[interval_start_ts]
-         for _ in local_in_pdf[interval_start_ts]
-     ]
-
-     local_in_pdf["min_end_timestamp"] = [
-         _ if _ <= with_row[interval_end_ts] else with_row[interval_end_ts]
-         for _ in local_in_pdf[interval_end_ts]
-     ]
-
-     # https://www.baeldung.com/cs/finding-all-overlapping-intervals
-     local_in_pdf = local_in_pdf[
-         local_in_pdf["max_start_timestamp"] < local_in_pdf["min_end_timestamp"]
-     ]
-
-     local_in_pdf = local_in_pdf.drop(
-         columns=["max_start_timestamp", "min_end_timestamp"]
-     )
-
-     # NB: with_row will always be included in the subset because it overlaps
-     # with itself. This step removes it from the subset.
-     remove_with_row_mask = ~(
-         local_in_pdf.fillna("¯\\_(ツ)_/¯") == np.array(with_row.fillna("¯\\_(ツ)_/¯"))
-     ).all(1)
-     local_in_pdf = local_in_pdf[remove_with_row_mask]
-
-     return local_in_pdf
-
-
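# Editor's note: a worked illustration, not part of the original file, of the
# overlap test used above: two intervals overlap exactly when
# max(start_a, start_b) < min(end_a, end_b). For example, given
# a = [00:00:09, 00:00:14) and b = [00:00:11, 00:00:20):
# max(00:00:09, 00:00:11) = 00:00:11 < min(00:00:14, 00:00:20) = 00:00:14,
# so a and b overlap on [00:00:11, 00:00:14).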
- def check_for_nan_values(to_check: Any) -> bool:
-     """
-     return True if there are any NaN values in `to_check`
-     """
-     if isinstance(to_check, pd.Series):
-         return bool(to_check.isna().any())
-     elif isinstance(to_check, pd.DataFrame):
-         return bool(to_check.isna().any().any())
-     elif isinstance(to_check, np.ndarray):
-         return bool(np.isnan(to_check).any())
-     elif isinstance(to_check, (np.generic, float)):
-         return bool(np.isnan(to_check))
-     else:
-         return to_check is None
-
-
- def interval_starts_before(
-     *,
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     other_start_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` starts before `other` starts
-     """
-
-     if other_start_ts is None:
-         other_start_ts = interval_start_ts
-
-     if check_for_nan_values(interval[interval_start_ts]) or check_for_nan_values(
-         other[other_start_ts]
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return interval[interval_start_ts] < other[other_start_ts]
-
-
- def interval_ends_before(
-     *,
-     interval: pd.Series,
-     other: pd.Series,
-     interval_end_ts: str,
-     other_end_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` ends before `other` ends
-     """
-
-     if other_end_ts is None:
-         other_end_ts = interval_end_ts
-
-     if check_for_nan_values(interval[interval_end_ts]) or check_for_nan_values(
-         other[other_end_ts]
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return interval[interval_end_ts] < other[other_end_ts]
-
-
- def interval_is_contained_by(
-     *,
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     interval_end_ts: str,
-     other_start_ts: Optional[str] = None,
-     other_end_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` is contained in `other`
-     """
-
-     if other_start_ts is None:
-         other_start_ts = interval_start_ts
-
-     if other_end_ts is None:
-         other_end_ts = interval_end_ts
-
-     if (
-         check_for_nan_values(interval[interval_start_ts])
-         or check_for_nan_values(interval[interval_end_ts])
-         or check_for_nan_values(other[other_start_ts])
-         or check_for_nan_values(other[other_end_ts])
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return interval_starts_before(
-         interval=other,
-         other=interval,
-         interval_start_ts=other_start_ts,
-         other_start_ts=interval_start_ts,
-     ) and interval_ends_before(
-         interval=interval,
-         other=other,
-         interval_end_ts=interval_end_ts,
-         other_end_ts=other_end_ts,
-     )
-
-
- def intervals_share_start_boundary(
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     other_start_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` and `other` share a start boundary
-     """
-
-     if other_start_ts is None:
-         other_start_ts = interval_start_ts
-
-     if check_for_nan_values(interval[interval_start_ts]) or check_for_nan_values(
-         other[other_start_ts]
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return interval[interval_start_ts] == other[other_start_ts]
-
-
- def intervals_share_end_boundary(
-     interval: pd.Series,
-     other: pd.Series,
-     interval_end_ts: str,
-     other_end_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` and `other` share an end boundary
-     """
-
-     if other_end_ts is None:
-         other_end_ts = interval_end_ts
-
-     if check_for_nan_values(interval[interval_end_ts]) or check_for_nan_values(
-         other[other_end_ts]
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return interval[interval_end_ts] == other[other_end_ts]
-
-
- def intervals_boundaries_are_equivalent(
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     interval_end_ts: str,
-     other_start_ts: Optional[str] = None,
-     other_end_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if the boundaries of `interval` and `other` are equivalent
-     """
-
-     if other_start_ts is None:
-         other_start_ts = interval_start_ts
-
-     if other_end_ts is None:
-         other_end_ts = interval_end_ts
-
-     if (
-         check_for_nan_values(interval[interval_start_ts])
-         or check_for_nan_values(interval[interval_end_ts])
-         or check_for_nan_values(other[other_start_ts])
-         or check_for_nan_values(other[other_end_ts])
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return intervals_share_start_boundary(
-         interval,
-         other,
-         interval_start_ts=interval_start_ts,
-         other_start_ts=other_start_ts,
-     ) and intervals_share_end_boundary(
-         interval,
-         other,
-         interval_end_ts=interval_end_ts,
-         other_end_ts=other_end_ts,
-     )
-
-
- def intervals_have_equivalent_metric_columns(
-     interval_a: pd.Series,
-     interval_b: pd.Series,
-     metric_columns: Iterable[str],
- ) -> bool:
-     """
-     return True if interval_a and interval_b have identical metrics
-     """
-     if isinstance(metric_columns, str):
-         metric_columns = metric_columns.split(",")
-         metric_columns = [s.strip() for s in metric_columns]
-     elif isinstance(metric_columns, Iterable):
-         metric_columns = list(metric_columns)
-     else:
-         raise ValueError(
-             f"metric_columns must be an Iterable or comma separated string"
-             f" of column names, instead got {type(metric_columns)}"
-         )
-
-     interval_a = interval_a.copy().fillna("¯\\_(ツ)_/¯")
-     interval_b = interval_b.copy().fillna("¯\\_(ツ)_/¯")
-     return all(
-         interval_a[metric_col] == interval_b[metric_col]
-         for metric_col in metric_columns
-     )
-
-
- def intervals_do_not_overlap(
-     *,
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     interval_end_ts: str,
-     other_start_ts: Optional[str] = None,
-     other_end_ts: Optional[str] = None,
- ) -> bool:
-     """
-     return True if `interval` and `other` do not overlap
-     """
-     if other_start_ts is None:
-         other_start_ts = interval_start_ts
-
-     if other_end_ts is None:
-         other_end_ts = interval_end_ts
-
-     if (
-         check_for_nan_values(interval[interval_start_ts])
-         or check_for_nan_values(interval[interval_end_ts])
-         or check_for_nan_values(other[other_start_ts])
-         or check_for_nan_values(other[other_end_ts])
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     return (
-         interval[interval_end_ts] < other[other_start_ts]
-         or interval[interval_start_ts] > other[other_end_ts]
-     )
-
-
- def update_interval_boundary(
-     *,
-     interval: pd.Series,
-     boundary_to_update: str,
-     update_value: str,
- ) -> pd.Series:
-     """
-     return a new copy of interval with start or end time updated using update_value
-     """
-     if boundary_to_update not in (interval_keys := interval.keys()):
-         raise KeyError(f"boundary_to_update must be one of {interval_keys}")
-
-     updated_interval = interval.copy()
-     updated_interval[boundary_to_update] = update_value
-
-     return updated_interval
-
-
- def merge_metric_columns_of_intervals(
-     *,
-     main_interval: pd.Series,
-     child_interval: pd.Series,
-     metric_columns: Iterable[str],
-     metric_merge_method: bool = False,
- ) -> pd.Series:
-     """
-     return a copy of `main_interval` with metrics merged from `child_interval`
-     """
-
-     if isinstance(metric_columns, str):
-         metric_columns = metric_columns.split(",")
-         metric_columns = [s.strip() for s in metric_columns]
-     elif isinstance(metric_columns, Iterable):
-         metric_columns = list(metric_columns)
-     else:
-         raise ValueError(
-             f"metric_columns must be an Iterable or comma separated string"
-             f" of column names, instead got {type(metric_columns)}"
-         )
-
-     merged_interval = main_interval.copy()
-
-     if metric_merge_method:
-         for metric_col in metric_columns:
-             if pd.notna(child_interval[metric_col]):
-                 merged_interval[metric_col] = child_interval[metric_col]
-
-     return merged_interval
-
-
- def resolve_overlap(  # TODO: need to implement proper metric merging
-     # -> for now, can just take non-null values from both intervals
-     interval: pd.Series,
-     other: pd.Series,
-     interval_start_ts: str,
-     interval_end_ts: str,
-     series_ids: Iterable[str],
-     metric_columns: Iterable[str],
-     other_start_ts: Optional[str] = None,
-     other_end_ts: Optional[str] = None,
- ) -> list[pd.Series]:
-     """
-     resolve overlaps between the two given intervals,
-     splitting them as necessary into some set of disjoint intervals
-     """
-
-     if other_start_ts is None:
-         try:
-             _ = other[interval_start_ts]
-             other_start_ts = interval_start_ts
-         except KeyError:
-             raise ValueError(
-                 f"`other_start_ts` must be set or equivalent to `interval_start_ts`, got {other_start_ts}"
-             )
-
-     if other_end_ts is None:
-         try:
-             _ = other[interval_end_ts]
-             other_end_ts = interval_end_ts
-         except KeyError:
-             raise ValueError(
-                 f"`other_end_ts` must be set or equivalent to `interval_end_ts`, got {other_end_ts}"
-             )
-
-     if (
-         check_for_nan_values(interval[interval_start_ts])
-         or check_for_nan_values(interval[interval_end_ts])
-         or check_for_nan_values(other[other_start_ts])
-         or check_for_nan_values(other[other_end_ts])
-     ):
-         raise ValueError("interval and other cannot contain NaN values for timestamps")
-
-     interval_index = set(interval.index).difference(
-         (
-             interval_start_ts,
-             interval_end_ts,
-         )
-     )
-     other_index = set(other.index).difference(
-         (
-             other_start_ts,
-             other_end_ts,
-         )
-     )
-
-     if not interval_index == other_index:
-         raise ValueError("Expected indices of pd.Series elements to be equivalent.")
-
-     # NB: rebinding the loop variable would not update series_ids or
-     # metric_columns, so collect the validated lists and unpack them after
-     validated = []
-     for arg in (series_ids, metric_columns):
-         if isinstance(arg, str):
-             validated.append([s.strip() for s in arg.split(",")])
-         elif isinstance(arg, Iterable):
-             validated.append(list(arg))
-         else:
-             raise ValueError(
-                 f"{arg} must be an Iterable or comma separated string"
-                 f" of column names, instead got {type(arg)}"
-             )
-     series_ids, metric_columns = validated
-
-     series_ids = cast(list[str], series_ids)
-     metric_columns = cast(list[str], metric_columns)
-
-     resolved_intervals = list()
-
-     # NB: Checking order of intervals in terms of start time allows
-     # us to remove all cases where b precedes a because the interval
-     # which opens sooner can always be set to a
-     if interval[interval_start_ts] > other[other_start_ts]:
-         interval, other = other, interval
-
-     # intervals_do_not_overlap(interval, other, ...) is True
-     #
-     # Results in 2 disjoint intervals
-     # 1) A.start, A.end, A.metric_columns
-     # 2) B.start, B.end, B.metric_columns
-
-     if intervals_do_not_overlap(
-         interval=interval,
-         other=other,
-         interval_start_ts=interval_start_ts,
-         interval_end_ts=interval_end_ts,
-         other_start_ts=other_start_ts,
-         other_end_ts=other_end_ts,
-     ):
-         return [interval, other]
-
-     # intervals_have_equivalent_metric_columns(interval, other, metric_columns) is True
-     #
-     # Results in 1 disjoint interval
-     # 1) A.start, B.end, A.metric_columns
-
-     if intervals_have_equivalent_metric_columns(interval, other, metric_columns):
-         resolved_series = update_interval_boundary(
-             interval=interval,
-             boundary_to_update=interval_end_ts,
-             update_value=other[other_end_ts],
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         return resolved_intervals
-
-     # interval_is_contained_by(interval=other, other=interval, ...) is True
-     #
-     # Results in 3 disjoint intervals
-     # 1) A.start, B.start, A.metric_columns
-     # 2) B.start, B.end, merge(A.metric_columns, B.metric_columns)
-     # 3) B.end, A.end, A.metric_columns
-
-     if interval_is_contained_by(
-         interval=other,
-         other=interval,
-         interval_start_ts=other_start_ts,
-         interval_end_ts=other_end_ts,
-         other_start_ts=interval_start_ts,
-         other_end_ts=interval_end_ts,
-     ):
-         # 1)
-         resolved_series = update_interval_boundary(
-             interval=interval,
-             boundary_to_update=interval_end_ts,
-             update_value=other[other_start_ts],
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         # 2)
-         resolved_series = merge_metric_columns_of_intervals(
-             main_interval=other,
-             child_interval=interval,
-             metric_columns=metric_columns,
-             metric_merge_method=True,
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         # 3)
-         resolved_series = update_interval_boundary(
-             interval=interval,
-             boundary_to_update=interval_start_ts,
-             update_value=other[other_end_ts],
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         return resolved_intervals
-
-     # A shares a common start with B, a different end boundary
-     # - A.start = B.start & A.end != B.end
-     #
-     # Results in 2 disjoint intervals
-     # - if A.end < B.end
-     # 1) A.start, A.end, merge(A.metric_columns, B.metric_columns)
-     # 2) A.end, B.end, B.metric_columns
-     # - if A.end > B.end
-     # 1) B.start, B.end, merge(A.metric_columns, B.metric_columns)
-     # 2) B.end, A.end, A.metric_columns
-
-     if intervals_share_start_boundary(
-         interval,
-         other,
-         interval_start_ts=interval_start_ts,
-         other_start_ts=other_start_ts,
-     ) and not intervals_share_end_boundary(
-         interval, other, interval_end_ts=interval_end_ts
-     ):
-         if interval_ends_before(
-             interval=interval,
-             other=other,
-             interval_end_ts=interval_end_ts,
-             other_end_ts=other_end_ts,
-         ):
-             # 1)
-             resolved_series = merge_metric_columns_of_intervals(
-                 main_interval=interval,
-                 child_interval=other,
-                 metric_columns=metric_columns,
-                 metric_merge_method=True,
-             )
-
-             resolved_intervals.append(resolved_series)
-
-             # 2)
-             resolved_series = update_interval_boundary(
-                 interval=other,
-                 boundary_to_update=other_start_ts,
-                 update_value=interval[interval_end_ts],
-             )
-
-             resolved_intervals.append(resolved_series)
-
-         else:
-             # 1)
-             resolved_series = merge_metric_columns_of_intervals(
-                 main_interval=other,
-                 child_interval=interval,
-                 metric_columns=metric_columns,
-                 metric_merge_method=True,
-             )
-
-             resolved_intervals.append(resolved_series)
-
-             # 2)
-             resolved_series = update_interval_boundary(
-                 interval=interval,
-                 boundary_to_update=interval_start_ts,
-                 update_value=other[other_end_ts],
-             )
-
-             resolved_intervals.append(resolved_series)
-
-         return resolved_intervals
-
-     # A shares a common end with B, a different start boundary
-     # - A.start != B.start & A.end = B.end
-     #
-     # Results in 2 disjoint intervals
-     # - if A.start < B.start
-     # 1) A.start, B.start, A.metric_columns
-     # 2) B.start, B.end, merge(A.metric_columns, B.metric_columns)
-     # - if A.start > B.start
-     # 1) B.start, A.end, B.metric_columns
-     # 2) A.start, A.end, merge(A.metric_columns, B.metric_columns)
-
-     if not intervals_share_start_boundary(
-         interval,
-         other,
-         interval_start_ts=interval_start_ts,
-         other_start_ts=other_start_ts,
-     ) and intervals_share_end_boundary(
-         interval,
-         other,
-         interval_end_ts=interval_end_ts,
-         other_end_ts=other_end_ts,
-     ):
-         if interval_starts_before(
-             interval=interval,
-             other=other,
-             interval_start_ts=interval_start_ts,
-             other_start_ts=other_start_ts,
-         ):
-             # 1)
-             resolved_series = update_interval_boundary(
-                 interval=interval,
-                 boundary_to_update=interval_end_ts,
-                 update_value=other[other_start_ts],
-             )
-
-             resolved_intervals.append(resolved_series)
-
-             # 2)
-             resolved_series = merge_metric_columns_of_intervals(
-                 main_interval=other,
-                 child_interval=interval,
-                 metric_columns=metric_columns,
-                 metric_merge_method=True,
-             )
-
-             resolved_intervals.append(resolved_series)
-
-             return resolved_intervals
-
-     # A and B share a common start and end boundary, making them equivalent.
-     # - A.start = B.start & A.end = B.end
-     #
-     # Results in 1 disjoint interval
-     # 1) A.start, A.end, merge(A.metric_columns, B.metric_columns)
-
-     if intervals_boundaries_are_equivalent(
-         interval,
-         other,
-         interval_start_ts=interval_start_ts,
-         interval_end_ts=interval_end_ts,
-         other_start_ts=other_start_ts,
-         other_end_ts=other_end_ts,
-     ):
-         resolved_series = merge_metric_columns_of_intervals(
-             main_interval=interval,
-             child_interval=other,
-             metric_columns=metric_columns,
-             metric_merge_method=True,
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         return resolved_intervals
-
-     # Interval A starts first and A partially overlaps B
-     # - A.start < B.start & A.end < B.end
-     #
-     # Results in 3 disjoint intervals
-     # 1) A.start, B.start, A.metric_columns
-     # 2) B.start, A.end, merge(A.metric_columns, B.metric_columns)
-     # 3) A.end, B.end, B.metric_columns
-
-     if interval_starts_before(
-         interval=interval,
-         other=other,
-         interval_start_ts=interval_start_ts,
-         other_start_ts=other_start_ts,
-     ) and interval_ends_before(
-         interval=interval,
-         other=other,
-         interval_end_ts=interval_end_ts,
-         other_end_ts=other_end_ts,
-     ):
-         # 1)
-         resolved_series = update_interval_boundary(
-             interval=interval,
-             boundary_to_update=interval_end_ts,
-             update_value=other[other_start_ts],
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         # 2)
-         updated_series = update_interval_boundary(
-             interval=other,
-             boundary_to_update=other_end_ts,
-             update_value=interval[interval_end_ts],
-         )
-         resolved_series = merge_metric_columns_of_intervals(
-             main_interval=updated_series,
-             child_interval=interval,
-             metric_columns=metric_columns,
-             metric_merge_method=True,
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         # 3)
-         resolved_series = update_interval_boundary(
-             interval=other,
-             boundary_to_update=other_start_ts,
-             update_value=interval[interval_end_ts],
-         )
-
-         resolved_intervals.append(resolved_series)
-
-         return resolved_intervals
-
-     raise NotImplementedError("Interval resolution not implemented")
-
-
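# Editor's note: a worked example, not part of the original file, of the
# "contained" branch of resolve_overlap above, using hypothetical values. Given
#   A = [00:00, 00:10) with metric_1=5, metric_2=None
#   B = [00:03, 00:07) with metric_1=None, metric_2=1
# the function returns three disjoint intervals:
#   [00:00, 00:03) metric_1=5, metric_2=None   (A before B starts)
#   [00:03, 00:07) metric_1=5, metric_2=1      (metrics merged where A and B overlap)
#   [00:07, 00:10) metric_1=5, metric_2=None   (A after B ends)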
- def resolve_all_overlaps(
-     with_row: pd.Series,
-     overlaps: pd.DataFrame,
-     with_row_start_ts: str,
-     with_row_end_ts: str,
-     series_ids: Iterable[str],
-     metric_columns: Iterable[str],
-     overlap_start_ts: Optional[str] = None,
-     overlap_end_ts: Optional[str] = None,
- ) -> pd.DataFrame:
-     """
-     resolve the interval `with_row` against all overlapping intervals in
-     `overlaps`, returning a set of disjoint intervals with the same spans
-     https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
-     """
-
-     if overlap_start_ts is None:
-         try:
-             _ = overlaps[with_row_start_ts]
-             overlap_start_ts = with_row_start_ts
-         except KeyError:
-             raise ValueError(
-                 f"`overlap_start_ts` must be set or equivalent to `with_row_start_ts`, got {overlap_start_ts}"
-             )
-
-     if overlap_end_ts is None:
-         try:
-             _ = overlaps[with_row_end_ts]
-             overlap_end_ts = with_row_end_ts
-         except KeyError:
-             raise ValueError(
-                 f"`overlap_end_ts` must be set or equivalent to `with_row_end_ts`, got {overlap_end_ts}"
-             )
-
-     # NB: rebinding the loop variable would not update series_ids or
-     # metric_columns, so collect the validated lists and unpack them after
-     validated = []
-     for arg in (series_ids, metric_columns):
-         if isinstance(arg, str):
-             validated.append([s.strip() for s in arg.split(",")])
-         elif isinstance(arg, Iterable):
-             validated.append(list(arg))
-         else:
-             raise ValueError(
-                 f"{arg} must be an Iterable or comma separated string"
-                 f" of column names, instead got {type(arg)}"
-             )
-     series_ids, metric_columns = validated
-
-     series_ids = cast(list[str], series_ids)
-     metric_columns = cast(list[str], metric_columns)
-
-     first_row = overlaps.iloc[0]
-     initial_intervals = resolve_overlap(
-         with_row,
-         first_row,
-         with_row_start_ts,
-         with_row_end_ts,
-         series_ids,
-         metric_columns,
-         overlap_start_ts,
-         overlap_end_ts,
-     )
-     local_disjoint_df = pd.DataFrame(initial_intervals)
-
-     # NB: using `itertools.islice` to build a generator that skips the first
-     # row of overlaps
-     for _, row in islice(overlaps.iterrows(), 1, None):
-         resolved_intervals = resolve_overlap(
-             with_row,
-             row,
-             with_row_start_ts,
-             with_row_end_ts,
-             series_ids,
-             metric_columns,
-             overlap_start_ts,
-             overlap_end_ts,
-         )
-         for interval in resolved_intervals:
-             local_disjoint_df = add_as_disjoint(
-                 interval,
-                 local_disjoint_df,
-                 (overlap_start_ts, overlap_end_ts),
-                 series_ids,
-                 metric_columns,
-             )
-
-     return local_disjoint_df
-
-
- def add_as_disjoint(
-     interval: pd.Series,
-     disjoint_set: Optional[pd.DataFrame],
-     interval_boundaries: Iterable[str],
-     series_ids: Iterable[str],
-     metric_columns: Iterable[str],
- ) -> pd.DataFrame:
-     """
-     returns a disjoint set consisting of the given interval, made disjoint with those already in `disjoint_set`
-     """
-
-     if isinstance(interval_boundaries, str):
-         _ = interval_boundaries.split(",")
-         interval_boundaries = [s.strip() for s in _]
-     elif isinstance(interval_boundaries, Iterable):
-         interval_boundaries = list(interval_boundaries)
-     else:
-         raise ValueError(
-             f"interval_boundaries must be an Iterable or comma separated string"
-             f" of column names, instead got {type(interval_boundaries)}"
-         )
-
-     if len(interval_boundaries) != 2:
-         raise ValueError(
-             f"interval_boundaries must be an Iterable of length 2, instead got {len(interval_boundaries)}"
-         )
-
-     start_ts, end_ts = interval_boundaries
-
-     # NB: rebinding the loop variable would not update series_ids or
-     # metric_columns, so collect the validated lists and unpack them after
-     validated = []
-     for arg in (series_ids, metric_columns):
-         if isinstance(arg, str):
-             validated.append([s.strip() for s in arg.split(",")])
-         elif isinstance(arg, Iterable):
-             validated.append(list(arg))
-         else:
-             raise ValueError(
-                 f"{arg} must be an Iterable or comma separated string"
-                 f" of column names, instead got {type(arg)}"
-             )
-     series_ids, metric_columns = validated
-
-     series_ids = cast(list[str], series_ids)
-     metric_columns = cast(list[str], metric_columns)
-
-     if disjoint_set is None:
-         return pd.DataFrame([interval])
-
-     if disjoint_set.empty:
-         return pd.DataFrame([interval])
-
-     overlapping_subset_df = identify_interval_overlaps(
-         in_pdf=disjoint_set,
-         with_row=interval,
-         interval_start_ts=start_ts,
-         interval_end_ts=end_ts,
-     )
-
-     # if there are no overlaps, add the interval to disjoint_set
-     if overlapping_subset_df.empty:
-         element_wise_comparison = (
-             disjoint_set.fillna("¯\\_(ツ)_/¯") == interval.fillna("¯\\_(ツ)_/¯").values
-         )
-         row_wise_comparison = element_wise_comparison.all(axis=1)
-         # NB: because of the nested iterations, we need to check that the
-         # record hasn't already been added to `global_disjoint_df` by another loop
-         if row_wise_comparison.any():
-             return disjoint_set
-         else:
-             return pd.concat((disjoint_set, pd.DataFrame([interval])))
-
-     # identify all intervals which do not overlap with the given interval to
-     # concatenate them to the disjoint set after resolving overlaps
-     non_overlapping_subset_df = disjoint_set[
-         ~disjoint_set.set_index(interval_boundaries).index.isin(
-             overlapping_subset_df.set_index(interval_boundaries).index
-         )
-     ]
-
-     # Avoid a call to `resolve_all_overlaps` if there is only one to resolve
-     multiple_to_resolve = len(overlapping_subset_df.index) > 1
-
-     # If every record overlaps, no need to handle non-overlaps
-     only_overlaps_present = len(disjoint_set.index) == len(overlapping_subset_df.index)
-
-     # Resolve the interval against all the existing, overlapping intervals
-     # `multiple_to_resolve` is used to avoid unnecessary calls to `resolve_all_overlaps`
-     # `only_overlaps_present` is used to avoid unnecessary calls to `pd.concat`
-     if not multiple_to_resolve and only_overlaps_present:
-         return pd.DataFrame(
-             resolve_overlap(
-                 interval,
-                 overlapping_subset_df.iloc[0],
-                 start_ts,
-                 end_ts,
-                 series_ids,
-                 metric_columns,
-             )
-         )
-
-     if multiple_to_resolve and only_overlaps_present:
-         return resolve_all_overlaps(
-             interval,
-             overlapping_subset_df,
-             start_ts,
-             end_ts,
-             series_ids,
-             metric_columns,
-         )
-
-     if not multiple_to_resolve and not only_overlaps_present:
-         return pd.concat(
-             (
-                 pd.DataFrame(
-                     resolve_overlap(
-                         interval,
-                         overlapping_subset_df.iloc[0],
-                         start_ts,
-                         end_ts,
-                         series_ids,
-                         metric_columns,
-                     )
-                 ),
-                 non_overlapping_subset_df,
-             ),
-         )
-
-     if multiple_to_resolve and not only_overlaps_present:
-         return pd.concat(
-             (
-                 resolve_all_overlaps(
-                     interval,
-                     overlapping_subset_df,
-                     start_ts,
-                     end_ts,
-                     series_ids,
-                     metric_columns,
-                 ),
-                 non_overlapping_subset_df,
-             ),
-         )
-
-     # if we get here, something went wrong
-     raise NotImplementedError
-
-
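# Editor's note: a summary, not part of the original file, of the dispatch at
# the end of add_as_disjoint, keyed on how many existing rows overlap the new
# interval and whether non-overlapping rows must be carried over:
#
#   one overlap,   all rows overlap  -> resolve_overlap only
#   many overlaps, all rows overlap  -> resolve_all_overlaps only
#   one overlap,   some disjoint     -> resolve_overlap, then concat the non-overlapping rows
#   many overlaps, some disjoint     -> resolve_all_overlaps, then concat the non-overlapping rows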
- def make_disjoint_wrap(
-     start_ts: str,
-     end_ts: str,
-     series_ids: Iterable[str],
-     metric_columns: Iterable[str],
- ) -> Callable[[pd.DataFrame], pd.DataFrame]:
-     def make_disjoint_inner(
-         pdf: pd.DataFrame,
-     ) -> pd.DataFrame:
-         """
-         function will process all intervals in the input, and break down overlapping intervals into a fully disjoint set
-         https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-and-then-filling-it
-         https://stackoverflow.com/questions/55478191/list-of-series-to-dataframe
-         https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.append.html
-         """
-
-         global_disjoint_df = pd.DataFrame(columns=pdf.columns)
-
-         sorted_pdf = pdf.sort_values([start_ts, end_ts])
-
-         for _, row in sorted_pdf.iterrows():
-             global_disjoint_df = add_as_disjoint(
-                 row,
-                 global_disjoint_df,
-                 (start_ts, end_ts),
-                 series_ids,
-                 metric_columns,
-             )
-
-         return global_disjoint_df
-
-     return make_disjoint_inner
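# Editor's note: a sketch, not part of the original file, of how this factory
# is consumed by IntervalsDF.make_disjoint above; the column names are the
# hypothetical ones used in the docstring examples:
#
#   func = make_disjoint_wrap("start_ts", "end_ts", ["series_1"], ["metric_1"])
#   disjoint_df = df.groupby(["series_1"]).applyInPandas(func, schema=df.schema)
#
# Each per-series pandas DataFrame is sorted by (start_ts, end_ts) and folded
# row by row into a disjoint set via add_as_disjoint.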