ts-data-generator 0.0.1a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 MANOJ MANIVANNAN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.3
2
+ Name: ts-data-generator
3
+ Version: 0.0.1a1
4
+ Summary: A Python library for generating synthetic time series data
5
+ License: MIT
6
+ Keywords: synthetic data,data generator,python,time series
7
+ Author: Manoj Manivannan
8
+ Author-email: manojm18@live.in
9
+ Requires-Python: >=3.8
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Provides-Extra: dev
14
+ Requires-Dist: black ; extra == "dev"
15
+ Requires-Dist: flake8 ; extra == "dev"
16
+ Requires-Dist: matplotlib
17
+ Requires-Dist: pandas
18
+ Requires-Dist: pydantic
19
+ Requires-Dist: pytest
20
+ Requires-Dist: python-dotenv
21
+ Description-Content-Type: text/markdown
22
+
23
+ <!-- html title in the middle -->
24
+ <p style="text-align: center;">
25
+ <h1 align="center">Synthetic Time Series Data Generator</h1>
26
+ <h3 align="center">A Python library for generating synthetic time series data</h3>
27
+ </p>
28
+ <p align="center">
29
+ <img src="notebooks/image.png" alt="MarineGEO circle logo" style="height: 1000px; width:800px;"/>
30
+ </p>
31
+
32
+
33
+ <!-- insert image from notebooks directory -->
34
+
35
+
36
+
37
+ ## Installation
38
+
39
+ ### Repo
40
+ After cloning this repo and creating a virtual environment, run the following command:
41
+ ```bash
42
+ pip install --editable .
43
+ ```
44
+ ### PyPI
45
+ Coming soon
46
+
47
+
48
+ ## Usage
49
+
50
+ ```python
51
+ d = DataGen()
52
+ d.start_datetime = "2019-01-01"
53
+ d.end_datetime = "2019-01-03"
54
+ d.granularity = Granularity.FIVE_MIN
55
+ d.add_dimension("product", random_choice(["A", "B", "C", "D"]))
56
+
57
+ metric1_trend = SinusoidalTrend(name="sine", amplitude=10, freq=24, phase=0, noise_level=10)
58
+
59
+ d.add_metric(name="temperature", trends=[metric1_trend])
60
+
61
+ metric2_trend = SinusoidalTrend(name="sine", amplitude=1, freq=12, phase=0, noise_level=2)
62
+ metric3_trend = LinearTrend(name="linear", limit=100, offset=10, noise_level=1)
63
+
64
+ d.add_metric(name="humidity", trends=[metric2_trend,metric3_trend])
65
+ d.generate_data()
66
+ df = d.data
67
+
68
+ # Use utility functions
69
+ processed_df = some_function(df)
70
+ ```
71
+
72
+ #### Release method
73
+ 1. `git tag <x.x.x>`
74
+ 2. `git push origin <x.x.x>`
@@ -0,0 +1,52 @@
1
+ <!-- html title in the middle -->
2
+ <p style="text-align: center;">
3
+ <h1 align="center">Synthetic Time Series Data Generator</h1>
4
+ <h3 align="center">A Python library for generating synthetic time series data</h3>
5
+ </p>
6
+ <p align="center">
7
+ <img src="notebooks/image.png" alt="MarineGEO circle logo" style="height: 1000px; width:800px;"/>
8
+ </p>
9
+
10
+
11
+ <!-- insert image from notebooks directory -->
12
+
13
+
14
+
15
+ ## Installation
16
+
17
+ ### Repo
18
+ After cloning this repo and creating a virtual environment, run the following command:
19
+ ```bash
20
+ pip install --editable .
21
+ ```
22
+ ### PyPI
23
+ Coming soon
24
+
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ d = DataGen()
30
+ d.start_datetime = "2019-01-01"
31
+ d.end_datetime = "2019-01-03"
32
+ d.granularity = Granularity.FIVE_MIN
33
+ d.add_dimension("product", random_choice(["A", "B", "C", "D"]))
34
+
35
+ metric1_trend = SinusoidalTrend(name="sine", amplitude=10, freq=24, phase=0, noise_level=10)
36
+
37
+ d.add_metric(name="temperature", trends=[metric1_trend])
38
+
39
+ metric2_trend = SinusoidalTrend(name="sine", amplitude=1, freq=12, phase=0, noise_level=2)
40
+ metric3_trend = LinearTrend(name="linear", limit=100, offset=10, noise_level=1)
41
+
42
+ d.add_metric(name="humidity", trends=[metric2_trend,metric3_trend])
43
+ d.generate_data()
44
+ df = d.data
45
+
46
+ # Use utility functions
47
+ processed_df = some_function(df)
48
+ ```
49
+
50
+ #### Release method
51
+ 1. `git tag <x.x.x>`
52
+ 2. `git push origin <x.x.x>`
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [tool.hatch.build.targets.wheel]
6
+ packages = ["src/ts_data_generator"]
7
+
8
+ [project]
9
+ name = "ts-data-generator"
10
+ version = "0.0.1a1"
11
+ description = "A Python library for generating synthetic time series data"
12
+ requires-python = ">=3.8"
13
+ dependencies = [
14
+ "pytest",
15
+ "python-dotenv",
16
+ "pandas",
17
+ "pydantic",
18
+ "matplotlib"
19
+ ]
20
+ readme = "README.md"
21
+ authors = [
22
+ { name = "Manoj Manivannan", email = "manojm18@live.in" }
23
+ ]
24
+ license = { text = "MIT" }
25
+ classifiers = [
26
+ "Programming Language :: Python :: 3",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Operating System :: OS Independent",
29
+ ]
30
+ keywords = ["synthetic data", "data generator", "python", "time series"]
31
+
32
+
33
+ [tool.pytest.ini_options]
34
+ addopts = "-ra -q"
35
+ testpaths = [
36
+ "tests",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ dev = [
41
+ "black",
42
+ "flake8",
43
+ ]
44
+
@@ -0,0 +1,8 @@
1
+ """
2
+ Data Generator Library - A tool for generating synthetic data
3
+ """
4
+
5
+ from .data_gen import DataGen
6
+
7
# Keep in sync with the ``version`` field in pyproject.toml; the published
# distribution metadata for this release is 0.0.1a1.
__version__ = "0.0.1a1"
__all__ = ["DataGen"]
@@ -0,0 +1,387 @@
1
+ """
2
+ Core DataGen class implementation
3
+ """
4
+
5
+ from typing import Optional, Union, Dict, Set, List, Generator
6
+ from .schema.models import Metrics, Dimensions, Granularity
7
+ from .utils.trends import Trends
8
+ import pandas as pd
9
+ from datetime import datetime
10
+
11
+
12
class DataGen:
    """Assemble a synthetic time series DataFrame from dimensions and metrics.

    Each dimension contributes one column that is filled either from a
    generator object (one ``next()`` call per timestamp) or from a constant
    int/str/float. Each metric contributes one column produced by
    ``Metrics.generate``. Rows are indexed by a ``pd.date_range`` built from
    ``start_datetime``, ``end_datetime`` and ``granularity``.
    """

    # Dimension values of these types are repeated as-is instead of being
    # advanced with ``next()``.
    _CONSTANT_TYPES = (int, str, float)

    def __init__(
        self,
        dimensions: Optional[List[Dimensions]] = None,
        metrics: Optional[List[Metrics]] = None,
        start_datetime: Optional[str] = None,
        end_datetime: Optional[str] = None,
        granularity: Granularity = Granularity.FIVE_MIN,
    ):
        """Create a DataGen, validating any dates and granularity supplied.

        Args:
            dimensions: Pre-built dimensions; an empty list is used when omitted.
            metrics: Pre-built metrics; an empty list is used when omitted.
            start_datetime: Start of the range as an ISO-format string.
            end_datetime: End of the range as an ISO-format string.
            granularity: A ``Granularity`` member or its string value.

        Raises:
            ValueError: If a supplied date is not ISO formatted or the
                granularity is not a valid ``Granularity``.
        """
        self._dimensions = dimensions or []
        self._metrics = metrics or []
        # Go through the property setters so constructor arguments receive the
        # same validation as later assignments.
        self.start_datetime = start_datetime
        self.end_datetime = end_datetime
        self.granularity = granularity

    def __repr__(self):
        return f"""DataGen Class
        dimensions = {[d.to_json() for d in self._dimensions]},
        metrics = {[m.to_json() for m in self._metrics]},
        start_datetime = {self.start_datetime},
        end_datetime = {self.end_datetime},
        granularity = {self.granularity})
        """

    @staticmethod
    def _require_iso(value: Optional[str]) -> None:
        """Raise ValueError unless ``value`` is None or parses with ``datetime.fromisoformat``."""
        if value is None:
            return
        try:
            datetime.fromisoformat(value)
        except ValueError as err:
            raise ValueError("Dates must be in ISO format (YYYY-MM-DD)") from err

    @property
    def start_datetime(self):
        """Start of the generated range, as the string that was assigned."""
        return self._start_datetime

    @start_datetime.setter
    def start_datetime(self, value: str):
        """Set start_datetime after checking that it is ISO formatted.

        Args:
            value (str): Start date in ISO format (YYYY-MM-DD), or None.

        Raises:
            ValueError: If ``value`` cannot be parsed as an ISO date.
        """
        self._require_iso(value)
        self._start_datetime = value

    @property
    def end_datetime(self):
        """End of the generated range, as the string that was assigned."""
        return self._end_datetime

    @end_datetime.setter
    def end_datetime(self, value: str):
        """Set end_datetime after checking that it is ISO formatted.

        Args:
            value (str): End date in ISO format (YYYY-MM-DD), or None.

        Raises:
            ValueError: If ``value`` cannot be parsed as an ISO date.
        """
        self._require_iso(value)
        self._end_datetime = value

    @property
    def granularity(self):
        """Granularity as a plain value: enum members are unwrapped to ``.value``."""
        if isinstance(self._granularity, Granularity):
            return self._granularity.value
        return self._granularity

    @granularity.setter
    def granularity(self, value: Granularity):
        """Set granularity after checking that it is a valid ``Granularity``.

        Args:
            value: A ``Granularity`` member or one of its values
                ("5min", "H", "D"), or None.

        Raises:
            ValueError: If ``value`` is not a valid ``Granularity``.
        """
        if value is not None:
            try:
                Granularity(value)
            except ValueError as err:
                raise ValueError("Granularity must be 5min, H or D") from err
        self._granularity = value

    @property
    def dimensions(self):
        """Mapping of dimension name to ``Dimensions`` object."""
        return {d.name: d for d in self._dimensions}

    @property
    def metrics(self):
        """Mapping of metric name to ``Metrics`` object."""
        return {m.name: m for m in self._metrics}

    @property
    def trends(self):
        """Mapping of metric name to a ``{trend name: trend}`` mapping."""
        return {m.name: {t.name: t for t in m.trends} for m in self._metrics}

    def _validate_metrics(self) -> None:
        """Check that at least one metric has been added.

        Raises:
            ValueError: If no metrics are set.
        """
        if not self.metrics:
            raise ValueError("metrics must be set")

    def _validate_dimensions(self) -> None:
        """Check that dimensions exist and hold a usable value source.

        A dimension's ``function`` may be a generator object or a constant
        int/str/float, matching what ``Dimensions`` accepts and what
        ``generate_data`` knows how to expand.

        Raises:
            ValueError: If no dimensions are set or one has an unsupported
                ``function``.
        """
        if not self.dimensions:
            raise ValueError("dimensions must be set")

        allowed = self._CONSTANT_TYPES + (Generator,)
        for d in self._dimensions:
            if not isinstance(d.function, allowed):
                raise ValueError(
                    f"{d.name} dimension function must be a generator object "
                    "or int or str or float"
                )

    def _validate_dates(self) -> None:
        """Check that both dates are set and correctly ordered.

        Raises:
            ValueError: If either date is missing or start_datetime is after
                end_datetime.
        """
        if not self.start_datetime:
            raise ValueError("start_datetime must be set")

        if not self.end_datetime:
            raise ValueError("end_datetime must be set")

        start = datetime.fromisoformat(self.start_datetime)
        end = datetime.fromisoformat(self.end_datetime)

        if start > end:
            raise ValueError("start_datetime cannot be after end_datetime")

    def add_dimension(self, name: str, function) -> None:
        """
        Add a new dimension to the collection.

        Args:
            name (str): The unique name of the dimension.
            function (int | str | float | Generator): A constant value, or a
                generator object that yields one value per timestamp.

        Raises:
            ValueError: If a dimension with the same name already exists.

        Example:
            >>> def sample_generator():
            ...     while True:
            ...         yield "sample_value"
            ...
            >>> my_object.add_dimension(name="category", function=sample_generator())
        """
        dimension = Dimensions(name=name, function=function)
        # ``Dimensions`` compares equal by name, so this detects duplicates.
        if dimension in self._dimensions:
            raise ValueError(f"Dimension with name {name} already exists")
        self._dimensions.append(dimension)

    def update_dimension(self, name: str, function) -> None:
        """
        Replace the value source of an existing dimension.

        Args:
            name (str): The name of the dimension to update.
            function (int | str | float | Generator): The new constant or
                generator object. If None, the dimension is left unchanged.

        Raises:
            ValueError: If no dimension with that name exists.
            ValueError: If ``function`` is not a generator object, int, float
                or str.

        Example:
            ```python
            def new_generator():
                while True:
                    yield "new_value"

            data_gen.update_dimension(name="category", function=new_generator())
            ```
        """
        if name not in self.dimensions:
            raise ValueError(f"Dimension with name '{name}' does not exist.")

        if function is None:
            return

        if not isinstance(function, self._CONSTANT_TYPES + (Generator,)):
            raise ValueError(
                "Provided function must be a generator object or int or float or string."
            )
        self.dimensions[name].function = function

    def add_metric(
        self,
        name: str,
        trends: Union[Set[Trends], List[Trends]],
    ) -> None:
        """
        Add a metric built from one or more trends.

        Args:
            name (str): The unique name of the metric; it becomes the column
                name in the generated data.
            trends: Collection of ``Trends`` objects passed to ``Metrics``.

        Raises:
            ValueError: If a metric with the same name already exists.

        Example:
            ```python
            data_gen.add_metric(
                name="temperature",
                trends=[SinusoidalTrend(name="sine", amplitude=10, freq=24)],
            )
            ```
        """
        metric = Metrics(name=name, trends=trends)
        # Compare on the metric's final name: ``Metrics`` substitutes a
        # generated name when ``name`` is "default".
        if any(existing.name == metric.name for existing in self._metrics):
            raise ValueError(f"Metric with name '{metric.name}' already exists")
        self._metrics.append(metric)

    def update_metric(
        self,
        name: str,
        function_value: Optional[Generator] = None,
        frequency_in_hour: Optional[float] = None,
        offset_in_minutes: Optional[float] = None,
        scale: Optional[float] = None,
        trends: Optional[Union[Set[Trends], List[Trends]]] = None,
    ) -> None:
        """
        Update an existing metric.

        Metrics are defined by their trends, so the supported update is to
        pass ``trends``. The remaining parameters belong to an older
        ``function_type`` based metric model; they are only honoured when the
        metric object carries a ``_function_type`` attribute, which the
        current ``Metrics`` class does not set.

        Args:
            name (str): The name of the metric to update.
            function_value: Legacy; new value for "generator"/"constant" metrics.
            frequency_in_hour: Legacy; required for "sine"/"cosine" metrics.
            offset_in_minutes: Legacy; required for "sine"/"cosine" metrics.
            scale: Legacy; required for "sine"/"cosine" metrics.
            trends: Replacement collection of ``Trends`` objects. If None,
                the metric's trends are left unchanged.

        Raises:
            ValueError: If no metric with that name exists.
            ValueError: If legacy parameters are supplied for a metric that
                has no ``function_type``, or required legacy parameters are
                missing.

        Example:
            ```python
            data_gen.update_metric(
                name="temperature",
                trends=[LinearTrend(name="linear", limit=10)],
            )
            ```
        """
        if name not in self.metrics:
            raise ValueError(f"Metric with name '{name}' does not exist.")

        metric = self.metrics[name]
        # Read defensively: the attribute is absent on trend-based metrics and
        # accessing it directly raised AttributeError on every call.
        function_type = getattr(metric, "_function_type", None)

        legacy_args = (function_value, frequency_in_hour, offset_in_minutes, scale)
        if function_type is None and any(arg is not None for arg in legacy_args):
            raise ValueError(
                "function_value, frequency_in_hour, offset_in_minutes and scale "
                "do not apply to trend-based metrics; pass trends instead."
            )

        if function_type == "generator" and function_value is not None:
            metric._function_value = function_value
        elif function_type in {"sine", "cosine"}:
            if frequency_in_hour is None or offset_in_minutes is None or scale is None:
                raise ValueError(
                    "frequency_in_hour, offset_in_minutes, and scale are required for sine or cosine."
                )
            metric._frequency_in_hour = frequency_in_hour
            metric._offset_in_minutes = offset_in_minutes
            metric._scale = scale
        elif function_type == "constant":
            if function_value is None:
                raise ValueError("function_value is required for constant.")
            metric._function_value = function_value

        if trends is not None:
            metric._trends = trends

    def _dimension_values(self, dimension: Dimensions, count: int) -> list:
        """Return ``count`` values for a dimension: a repeated constant or successive ``next()`` results."""
        source = dimension.function
        if isinstance(source, self._CONSTANT_TYPES):
            return [source] * count
        return [next(source) for _ in range(count)]

    def generate_data(self) -> pd.DataFrame:
        """Generate the dataset and store it on ``self.data``.

        Validates dates, dimensions and metrics, builds the timestamp index,
        then concatenates the dimension columns and metric columns side by
        side. The intermediate frames are kept on ``self.dimension_data`` and
        ``self.metric_data``.

        Returns:
            pd.DataFrame: The combined frame (also available as ``self.data``).

        Raises:
            ValueError: If dates, dimensions or metrics fail validation.
        """
        self._validate_dates()
        self._validate_dimensions()
        self._validate_metrics()

        self._timestamps = pd.date_range(
            start=self.start_datetime,
            end=self.end_datetime,
            freq=self.granularity,
        )
        row_count = len(self._timestamps)

        # Start from an empty frame indexed by the timestamps and append one
        # column per metric.
        metric_frames = [pd.DataFrame(index=self._timestamps)]
        metric_frames.extend(
            metric.generate(self._timestamps) for metric in self.metrics.values()
        )
        self.metric_data = pd.concat(metric_frames, axis=1)

        self.dimension_data = pd.DataFrame(
            {
                column_name: self._dimension_values(dimension, row_count)
                for column_name, dimension in self.dimensions.items()
            },
            index=self._timestamps,
        )

        self.data = pd.concat([self.dimension_data, self.metric_data], axis=1)
        return self.data
387
+
@@ -0,0 +1,172 @@
1
+ from pydantic import BaseModel
2
+ from abc import ABC, abstractmethod
3
+ from typing import Any, Callable, TypeVar, Generator, Literal, Optional, Union, Set
4
+ from enum import Enum
5
+ from ..utils.functions import auto_generate_name
6
+ from ..utils.trends import Trends
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ T = TypeVar("T")
11
+
12
+
13
class Granularity(Enum):
    """Supported spacing between generated timestamps.

    The values are the frequency strings handed to ``pd.date_range``.
    """

    FIVE_MIN = "5min"  # one row every five minutes
    HOURLY = "H"  # one row per hour
    DAILY = "D"  # one row per day
17
+
18
+
19
class Metrics(ABC):
    """A named metric whose values are the element-wise sum of its trends."""

    def __init__(
        self,
        name: str = "default",
        trends: "Optional[Set[Trends]]" = None,
    ):
        """
        Initialize a Metrics object.

        Args:
            name (str): Name of the metric. The literal "default" is replaced
                by a name from ``auto_generate_name(category="metric")``.
            trends: Collection of trend objects whose ``generate`` outputs are
                summed. A fresh empty list is used when omitted.
        """
        self._name = (
            auto_generate_name(category="metric") if name == "default" else name
        )
        # A new list per instance: a mutable default in the signature would be
        # shared by every metric created without explicit trends.
        self._trends = [] if trends is None else trends

    @property
    def name(self) -> str:
        """Get the name of the metric."""
        return self._name

    @property
    def trends(self) -> "Set[Trends]":
        """Get the trends of the metric."""
        return self._trends

    def generate(self, timestamps) -> pd.DataFrame:
        """Generate data for this metric.

        Args:
            timestamps: Index values for the result, e.g. from ``pd.date_range``.

        Returns:
            pd.DataFrame: One column named after the metric, indexed by
            ``timestamps``, holding the sum of every trend's values (all
            zeros when the metric has no trends).
        """
        data = np.zeros(len(timestamps))

        for t in self._trends:
            data += t.generate(timestamps)

        self._data = pd.DataFrame(data, columns=[self._name], index=timestamps)
        return self._data

    def __repr__(self):
        return str(self.to_json())

    def to_json(self):
        """Return a dict with the metric name and the names of its trends."""
        return {
            "name": self._name,
            "trends": [t._name for t in self._trends],
        }
109
+
110
+
111
class Dimensions(ABC):
    """A named column source: a constant int/str/float or a generator object.

    Two dimensions compare equal, and hash alike, when their names match.
    """

    def __init__(self, name: str, function: Union[int, str, float, Generator]):
        """Store the dimension name and its value source.

        Args:
            name: Name of the dimension
            function: Constant value or generator object supplying the values.
                It is stored as given; only the ``function`` setter validates.
        """
        self._name = name
        self._function = function

    @property
    def name(self) -> str:
        """Name of the dimension."""
        return self._name

    @property
    def function(self) -> Union[int, str, float, Generator]:
        """Current value source of the dimension."""
        return self._function

    @function.setter
    def function(self, value: Union[int, str, float, Generator]) -> None:
        """Replace the value source.

        Args:
            value: A generator object, or a constant int, str or float.

        Raises:
            ValueError: If ``value`` is none of the accepted types.
        """
        accepted = (int, str, float, Generator)
        if isinstance(value, accepted):
            self._function = value
            return
        raise ValueError(
            "function must be a generator object or int or str or float"
        )

    def _create_generator(self, timestamps) -> "Generator[T, None, None]":
        """Placeholder for building a value generator; currently does nothing.

        Args:
            timestamps: List of timestamps from pd.date_range
        """
        return None

    def __eq__(self, other: object) -> bool:
        """Dimensions are equal when their names are equal."""
        if isinstance(other, Dimensions):
            return self._name == other.name
        return NotImplemented

    def __hash__(self) -> int:
        """Hash on the name so equal dimensions hash alike."""
        return hash(self._name)

    def to_json(self):
        """Return a dict with the name and the ``repr`` of the value source."""
        return {"name": self.name, "function": self.function.__repr__()}
@@ -0,0 +1,77 @@
1
+ # create several out of the box generator functions to be used in the DataGen class
2
+
3
+ import random
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
def constant(value):
    """
    Generator that yields the same value forever.

    Args:
        value: The value to yield on every iteration.

    Yields:
        ``value``, unchanged, without end.
    """
    while True:
        yield value
17
+
18
def random_choice(iterable):
    """
    Generator that yields a randomly chosen element on every iteration.

    Args:
        iterable: The collection to pick from; it is handed to
            ``random.choice`` on each iteration.

    Yields:
        One element of ``iterable`` per iteration, without end.
    """
    while True:
        picked = random.choice(iterable)
        yield picked
28
+
29
+
30
def random_int(start, end):
    """
    Generator that yields random integers from ``start`` to ``end``, inclusive.

    Args:
        start (int): Lowest value that can be produced.
        end (int): Highest value that can be produced.

    Yields:
        One ``random.randint(start, end)`` result per iteration, without end.
    """
    while True:
        drawn = random.randint(start, end)
        yield drawn
41
+
42
+
43
def random_float(start, end):
    """
    Generator that yields random floats drawn with ``random.uniform``.

    Args:
        start (float): One end of the range.
        end (float): The other end of the range.

    Yields:
        One ``random.uniform(start, end)`` result per iteration, without end.
    """
    while True:
        drawn = random.uniform(start, end)
        yield drawn
54
+
55
+
56
def ordered_choice(iterable):
    """
    Generator that yields the elements of ``iterable`` in order, repeating
    from the start once the end is reached.

    The elements are copied into a list when the generator is first advanced,
    so any iterable (not only sequences) is accepted.

    Args:
        iterable (iterable): The elements to cycle through.

    Yields:
        The elements in their original order, over and over, without end.

    Raises:
        IndexError: On the first ``next()`` call if ``iterable`` is empty.
    """
    items = list(iterable)
    if not items:
        # Without this guard the loop below would spin forever yielding nothing.
        raise IndexError("Cannot choose from an empty sequence")
    while True:
        yield from items
66
+
67
+
68
def auto_generate_name(category):
    """
    Build a name of the form ``"<category>_<n>"`` with a random ``n``.

    ``n`` is drawn from 1 to 100 inclusive, so repeated calls can return the
    same name; uniqueness is not guaranteed.

    Args:
        category (str): Prefix for the name, e.g. 'metric' or 'dimension'.

    Returns:
        str: The generated name.
    """
    suffix = random.randint(1, 100)
    return f"{category}_{suffix}"
77
+
@@ -0,0 +1,249 @@
1
+ import numpy as np
2
+ from abc import ABC, abstractmethod
3
+ from typing import Any, Callable, TypeVar, Generator, Literal, Optional, Union
4
+ import pandas as pd
5
+
6
class Trends(ABC):
    """Abstract base class for a named trend component.

    Subclasses implement ``generate`` to turn an index of timestamps into one
    numeric value per timestamp.
    """

    def __init__(
        self,
        name: str = "default",
    ):
        """
        Initialize a Trends object.

        Args:
            name (str): Name of the trend.
        """
        self._name = name

    @property
    def name(self) -> str:
        """Name of the trend."""
        return self._name

    @abstractmethod
    def generate(
        self,
        timestamps: pd.DatetimeIndex,
    ) -> np.ndarray:
        """
        Generate the trend values for the given timestamps.

        Args:
            timestamps (pd.DatetimeIndex): Timestamps to produce values for.

        Returns:
            np.ndarray: One value per timestamp.
        """
39
+
40
+
41
class SinusoidalTrend(Trends):
    """Sine wave with optional Gaussian noise."""

    def __init__(
        self,
        name: str = "default",
        amplitude: float = 1,
        freq: float = 1,
        phase: float = 0,
        noise_level: float = 0,
    ):
        """
        Initialize a SinusoidalTrend object.

        Args:
            name (str): Name of the trend.
            amplitude (float): Amplitude of the sinusoidal wave.
            freq (float): Despite the name, ``generate`` multiplies elapsed
                days by ``1 / freq``, so this is the length of one full cycle
                in days. Must be non-zero.
            phase (float): Phase offset of the sinusoidal wave in hours.
            noise_level (float): Standard deviation of the noise.

        Raises:
            ValueError: If ``freq`` is zero.
        """
        super().__init__(name)
        if freq == 0:
            # ``generate`` divides by ``freq``; fail here with a clear message
            # instead of a ZeroDivisionError later.
            raise ValueError("freq must be non-zero")
        self._amplitude = amplitude
        self._freq = freq
        self._phase = phase
        self._noise_level = noise_level

    @property
    def amplitude(self) -> float:
        return self._amplitude

    @property
    def freq(self) -> float:
        return self._freq

    @property
    def phase(self) -> float:
        return self._phase

    @property
    def noise_level(self) -> float:
        return self._noise_level

    def generate(self, timestamps: pd.DatetimeIndex) -> np.ndarray:
        """
        Generate a sinusoidal wave with added noise.

        Args:
            timestamps (pd.DatetimeIndex): Array of timestamps.

        Returns:
            np.ndarray: Sinusoidal wave with noise, one value per timestamp
            (empty when ``timestamps`` is empty).
        """
        if len(timestamps) == 0:
            # ``timestamps[0]`` below would raise IndexError on an empty index.
            return np.zeros(0)

        # Elapsed time since the first timestamp, in fractional days.
        time_in_days = (timestamps - timestamps[0]).total_seconds() / (24 * 3600)

        # The phase is given in hours; convert it to fractional days.
        phase_in_days = self._phase / 24.0

        base_wave = self._amplitude * np.sin(
            2 * np.pi * (1 / self._freq) * (time_in_days + phase_in_days)
        )

        noise = np.random.normal(0, self._noise_level, len(timestamps))
        return base_wave + noise
106
+
107
+
108
+
109
class LinearTrend(Trends):
    """Straight-line trend with optional Gaussian noise."""

    def __init__(
        self,
        name: str = "default",
        offset: float = 0.0,
        noise_level: float = 0.0,
        limit: float = 2.0,
    ):
        """
        Initialize a LinearTrend object.

        Args:
            name (str): Name of the trend.
            offset (float): Intercept (b) of the linear trend.
            noise_level (float): Standard deviation of the noise.
            limit (float): Value between 1 and 100 from which ``generate``
                derives the slope.

        Raises:
            ValueError: If ``limit`` is outside the range 1 to 100.
        """
        super().__init__(name)

        self._offset = offset
        self._noise_level = noise_level
        if limit < 1 or limit > 100:
            raise ValueError("Limit must be within the range of 1 and 100")
        self._limit = limit

    @property
    def limit(self) -> float:
        return self._limit

    @property
    def offset(self) -> float:
        return self._offset

    @property
    def noise_level(self) -> float:
        return self._noise_level

    def generate(self, timestamps) -> np.ndarray:
        """
        Generate a linear trend with optional noise.

        Elapsed time is measured in the unit of the index frequency (minutes
        for "5min", hours for "H", days for "D").

        Args:
            timestamps (pd.DatetimeIndex): Array of timestamps whose ``freq``
                is one of the supported granularities.

        Returns:
            np.ndarray: Generated linear trend values, one per timestamp
            (empty when ``timestamps`` is empty).

        Raises:
            ValueError: If the index frequency is not 5min, H or D.
        """
        if len(timestamps) == 0:
            # ``timestamps[0]`` and the division by the length below both
            # fail on an empty index.
            return np.zeros(0)

        time_deltas = timestamps - timestamps[0]

        if timestamps.freq == "5min":  # 5-minute granularity
            time_numeric = time_deltas.total_seconds() / 60.0  # minutes
        elif timestamps.freq == "H":  # Hourly granularity
            time_numeric = time_deltas.total_seconds() / 3600.0  # hours
        elif timestamps.freq == "D":  # Daily granularity
            time_numeric = time_deltas.days  # whole days
        else:
            raise ValueError("Unsupported granularity. Use 5min, H, or D.")

        # NOTE(review): the slope is radians(sin(limit / n)), which does not
        # make the series end at ``limit``; the formula is kept as written --
        # confirm the intended meaning of ``limit``.
        self._coefficient = np.radians(np.sin(self._limit / len(time_numeric)))

        base_trend = self._coefficient * time_numeric + self._offset

        noise = np.random.normal(0, self._noise_level, len(timestamps))
        return base_trend + noise
179
+
180
class WeekendTrend(Trends):
    """Step trend that is non-zero only on Saturdays and Sundays."""

    def __init__(
        self,
        name: str = "default",
        weekend_effect: float = 1.0,
        direction: Literal["up", "down"] = "up",
        noise_level: float = 0.0,
        limit: float = 10.0,
    ):
        """
        Initialize a WeekendTrend object.

        Args:
            name (str): Name of the trend.
            weekend_effect (float): Size of the weekend adjustment.
            direction (Literal["up", "down"]): "up" adds the effect; any
                other value subtracts it.
            noise_level (float): Standard deviation of the noise.
            limit (float): The adjustment is clipped to ``[-limit, limit]``
                before noise is added.
        """
        super().__init__(name)
        self._limit = limit
        self._noise_level = noise_level
        self._direction = direction
        self._weekend_effect = weekend_effect

    @property
    def weekend_effect(self) -> float:
        return self._weekend_effect

    @property
    def direction(self) -> str:
        return self._direction

    @property
    def noise_level(self) -> float:
        return self._noise_level

    @property
    def limit(self) -> float:
        return self._limit

    def generate(self, timestamps: pd.DatetimeIndex) -> np.ndarray:
        """
        Generate a weekend-specific trend.

        Args:
            timestamps (pd.DatetimeIndex): Array of timestamps.

        Returns:
            np.ndarray: Zero on weekdays and the clipped, signed effect on
            weekends, plus noise.
        """
        count = len(timestamps)

        if self._direction == "up":
            signed_effect = self._weekend_effect
        else:
            signed_effect = -self._weekend_effect

        # ``weekday`` is 5 for Saturday and 6 for Sunday.
        weekend_mask = timestamps.weekday >= 5

        values = np.zeros(count)
        values[weekend_mask] = signed_effect
        values = np.clip(values, -self._limit, self._limit)

        # Noise is added after clipping, so results may exceed ``limit``.
        values += np.random.normal(0, self._noise_level, count)
        return values
@@ -0,0 +1,16 @@
1
+ """
2
+ Utility functions for data generation
3
+ """
4
+ import pandas as pd
5
+
6
def some_function(data: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder utility that hands back a copy of its input.

    Args:
        data: Input DataFrame

    Returns:
        pd.DataFrame: A copy of ``data``; the input itself is not modified.
    """
    duplicate = data.copy()
    return duplicate