rand-engine 0.4.4__tar.gz → 0.4.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rand_engine-0.4.4 → rand_engine-0.4.7}/PKG-INFO +1 -1
- {rand_engine-0.4.4 → rand_engine-0.4.7}/pyproject.toml +1 -1
- rand_engine-0.4.7/rand_engine/core.py +95 -0
- {rand_engine-0.4.4/rand_engine/main → rand_engine-0.4.7/rand_engine}/file_writer.py +6 -4
- {rand_engine-0.4.4 → rand_engine-0.4.7}/rand_engine/main/cdc_generator.py +5 -5
- {rand_engine-0.4.4 → rand_engine-0.4.7}/rand_engine/main/fs_utils.py +1 -1
- rand_engine-0.4.7/rand_engine/main.py +102 -0
- rand_engine-0.4.7/rand_engine/rand_generator.py +46 -0
- rand_engine-0.4.7/rand_engine/spark/spark_core.py +40 -0
- rand_engine-0.4.4/rand_engine/core/distinct_utils.py → rand_engine-0.4.7/rand_engine/utils/distincts.py +8 -1
- rand_engine-0.4.4/rand_engine/main/stream_handle.py → rand_engine-0.4.7/rand_engine/utils/stream_handler.py +1 -1
- rand_engine-0.4.7/rand_engine/utils/update.py +33 -0
- rand_engine-0.4.4/rand_engine/core/datetime_core.py +0 -45
- rand_engine-0.4.4/rand_engine/core/distinct_core.py +0 -65
- rand_engine-0.4.4/rand_engine/core/general_utils.py +0 -14
- rand_engine-0.4.4/rand_engine/core/numeric_core.py +0 -62
- rand_engine-0.4.4/rand_engine/main/data_generator.py +0 -88
- {rand_engine-0.4.4 → rand_engine-0.4.7}/README.md +0 -0
- {rand_engine-0.4.4 → rand_engine-0.4.7}/rand_engine/__init__.py +0 -0
- {rand_engine-0.4.4/rand_engine/main → rand_engine-0.4.7/rand_engine/interfaces}/i_random_spec.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rand-engine
|
|
3
|
-
Version: 0.4.4
|
|
3
|
+
Version: 0.4.7
|
|
4
4
|
Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
|
|
5
5
|
Author: marcoaureliomenezes
|
|
6
6
|
Author-email: marcoaurelioreislima@gmail.com
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "rand-engine"
|
|
3
|
-
version = "0.4.4"
|
|
3
|
+
version = "0.4.7"
|
|
4
4
|
description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
|
|
5
5
|
authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
|
|
6
6
|
repository = "https://github.com/marcoaureliomenezes/rand_engine"
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import List, Any
|
|
3
|
+
import numpy as np
|
|
4
|
+
from datetime import datetime as dt
|
|
5
|
+
from functools import reduce
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Core:
    """Collection of NumPy-backed generators for random column data.

    Every generator is a classmethod whose first argument is the number of
    values to produce. All randomness comes from the global ``np.random``
    state, so seeding ``np.random`` makes the output repeatable (except for
    the UUID strategies, which use the ``uuid`` module).
    """

    @classmethod
    def gen_distincts(cls, size: int, distinct: List[Any]) -> np.ndarray:
        """Draw ``size`` values from ``distinct``; all options must share one type.

        :param size: int: Number of elements to be generated.
        :param distinct: List[Any]: Non-empty list of same-typed candidate values.
        :return: np.ndarray: Array of sampled values.
        :raises AssertionError: If ``distinct`` is empty or mixes types.
        """
        assert len({type(x) for x in distinct}) == 1, \
            "All values in 'distinct' must share a single type."
        return np.random.choice(distinct, size)

    @classmethod
    def gen_distincts_untyped(cls, size: int, distinct: List[Any]) -> List[Any]:
        """Draw ``size`` values from ``distinct`` without any type restriction.

        Values are picked by random index, so the original Python objects are
        returned untouched (no NumPy dtype coercion).

        :param size: int: Number of elements to be generated.
        :param distinct: List[Any]: Candidate values, possibly of mixed types.
        :return: List[Any]: List of sampled values.
        """
        return [distinct[i] for i in np.random.randint(0, len(distinct), size)]

    @classmethod
    def gen_complex_distincts(cls, size: int, pattern="x.x.x-x", replacement="x", templates=None):
        """Build strings by filling the placeholders of ``pattern`` with generated values.

        The pattern is scanned character by character: each character equal to
        ``replacement`` is filled by the next template, any other character is
        repeated literally. The per-position arrays are then concatenated
        element-wise as strings.

        :param size: int: Number of elements to be generated.
        :param pattern: str: Template such as ``"x.x.x-x"``.
        :param replacement: str: Placeholder character inside ``pattern``. The scan
            compares single characters, so a multi-character placeholder never matches.
        :param templates: List[Dict]: One ``{"method": callable, "parms": dict}`` entry
            per placeholder; each method is called as ``method(size, **parms)``.
        :return: np.ndarray: Array of composed strings.
        :raises AssertionError: If the number of placeholders and templates differ.
        """
        templates = [] if templates is None else templates
        assert pattern.count(replacement) == len(templates), \
            "The number of placeholders in 'pattern' must match the number of templates."
        pieces, next_template = [], 0
        for char in pattern:
            if char == replacement:
                template = templates[next_template]
                pieces.append(template["method"](size, **template["parms"]))
                next_template += 1
            else:
                pieces.append(np.full(size, char))
        # np.char.add concatenates string arrays on every NumPy version, and
        # np.asarray lets templates return plain lists as well as arrays.
        return reduce(
            lambda left, right: np.char.add(
                np.asarray(left).astype(str), np.asarray(right).astype(str)
            ),
            pieces,
        )

    @classmethod
    def gen_ints(cls, size: int, min: int, max: int) -> np.ndarray:
        """Generate random integers between ``min`` and ``max``, both inclusive.

        :param size: int: Number of elements to be generated.
        :param min: int: Smallest value that can be generated.
        :param max: int: Largest value that can be generated.
        :return: np.ndarray: Array of random integers.
        """
        return np.random.randint(min, max + 1, size)

    @classmethod
    def gen_ints_zfilled(cls, size: int, length: int) -> np.ndarray:
        """Generate random non-negative integers as zero-padded strings.

        :param size: int: Number of elements to be generated.
        :param length: int: Number of digits of every generated string.
        :return: np.ndarray: Array of strings, each ``length`` characters long.
        """
        str_arr = np.random.randint(0, 10**length, size).astype('str')
        return np.char.zfill(str_arr, length)

    @classmethod
    def gen_floats(cls, size: int, min: int, max: int, round: int = 2) -> np.ndarray:
        """Generate random floats with a fixed number of decimal places.

        The integer part is drawn from ``[min, max)`` (``max`` is exclusive here,
        unlike ``gen_ints``) and a random fraction with ``round`` digits is added.

        :param size: int: Number of elements to be generated.
        :param min: int: Lower bound of the integer part (inclusive).
        :param max: int: Upper bound of the integer part (exclusive).
        :param round: int: Number of decimal places. With 0 the integer array is returned.
        :return: np.ndarray: Array of random numbers.
        """
        whole_part = np.random.randint(min, max, size)
        # The fraction is drawn even when unused so that the sequence of random
        # draws does not depend on ``round``.
        fraction = np.random.randint(0, 10 ** round, size)
        if round > 0:
            return whole_part + (fraction / 10 ** round)
        return whole_part

    @classmethod
    def gen_floats_normal(cls, size: int, mean: int, std: int, round: int = 2) -> np.ndarray:
        """Generate normally distributed floats rounded to ``round`` decimals.

        :param size: int: Number of elements to be generated.
        :param mean: int: Mean of the distribution.
        :param std: int: Standard deviation of the distribution.
        :param round: int: Number of decimal places to keep.
        :return: np.ndarray: Array of random floats.
        """
        return np.round(np.random.normal(mean, std, size), round)

    @classmethod
    def gen_unix_timestamps(cls, size: int, start: str, end: str, format: str) -> np.ndarray:
        """Generate random unix timestamps (whole seconds) between two dates.

        :param size: int: Number of elements to be generated.
        :param start: str: Start date; values before 1970-01-01 are clamped to it.
        :param end: str: End date (exclusive upper bound).
        :param format: str: ``strptime`` format used to parse ``start`` and ``end``.
        :return: np.ndarray: Array of integer timestamps.
        """
        dt_start, dt_end = dt.strptime(start, format), dt.strptime(end, format)
        epoch = dt(1970, 1, 1)
        if dt_start < epoch:
            dt_start = epoch
        # timestamp() yields floats; convert explicitly instead of relying on
        # NumPy's implicit handling of float bounds.
        lower, upper = int(dt_start.timestamp()), int(dt_end.timestamp())
        return np.random.randint(lower, upper, size)

    @classmethod
    def gen_unique_identifiers(cls, size: int, strategy="zint", length=12) -> np.ndarray:
        """Generate identifier strings using the chosen strategy.

        :param size: int: Number of elements to be generated.
        :param strategy: str: ``"uuid4"``, ``"uuid1"`` or ``"zint"`` (zero-padded random
            integers; uniqueness is not enforced for this strategy).
        :param length: int: Number of digits, only used by the ``"zint"`` strategy.
        :return: np.ndarray: Array of identifier strings.
        :raises ValueError: If ``strategy`` is not one of the supported names.
        """
        import uuid
        if strategy == "uuid4":
            return np.array([str(uuid.uuid4()) for _ in range(size)])
        elif strategy == "uuid1":
            return np.array([str(uuid.uuid1()) for _ in range(size)])
        elif strategy == "zint":
            return cls.gen_ints_zfilled(size, length)
        else:
            # The message lists the strategies that are actually implemented.
            raise ValueError("Strategy not recognized. Use 'uuid4', 'uuid1' or 'zint'.")
|
|
78
|
+
# @classmethod
|
|
79
|
+
# def gen_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
|
|
80
|
+
# """
|
|
81
|
+
# This method generates an array of random timestamps.
|
|
82
|
+
# :param size: int: Number of elements to be generated.
|
|
83
|
+
# :param start: str: Start date of the generated timestamps.
|
|
84
|
+
# :param end: str: End date of the generated timestamps.
|
|
85
|
+
# :param format: str: Format of the input dates.
|
|
86
|
+
# :return: np.ndarray: Array of random timestamps."""
|
|
87
|
+
# date_array = self.gen_unix_timestamps(size, start, end, format).astype('datetime64[s]')
|
|
88
|
+
# return date_array
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# @classmethod
|
|
92
|
+
# def gen_datetimes(self, size: int, start: str, end: str, format_in: str, format_out: str):
|
|
93
|
+
# timestamp_array = self.gen_unix_timestamps(size, start, end, format_in)
|
|
94
|
+
# vectorized_func = np.vectorize(lambda x: dt.fromtimestamp(x).strftime(format_out))
|
|
95
|
+
# return vectorized_func(timestamp_array)
|
|
@@ -86,7 +86,8 @@ class FileWriter:
|
|
|
86
86
|
:return: Callable: Function to write the Pandas DataFrame to a csv file.
|
|
87
87
|
"""
|
|
88
88
|
if self.write_options.get("compression"):
|
|
89
|
-
|
|
89
|
+
# Add compression extension to the end of the filename
|
|
90
|
+
full_path = f"{full_path}.{self.write_options['compression']}"
|
|
90
91
|
writer = lambda: dataframe().to_csv(full_path, index=False, **self.write_options)
|
|
91
92
|
return writer
|
|
92
93
|
|
|
@@ -98,7 +99,8 @@ class FileWriter:
|
|
|
98
99
|
:return: Callable: Function to write the Pandas DataFrame to a json file.
|
|
99
100
|
"""
|
|
100
101
|
if self.write_options.get("compression"):
|
|
101
|
-
|
|
102
|
+
# Add compression extension to the end of the filename
|
|
103
|
+
full_path = f"{full_path}.{self.write_options['compression']}"
|
|
102
104
|
def writer():
|
|
103
105
|
dataframe().to_json(full_path, orient='records', lines=True)
|
|
104
106
|
return writer
|
|
@@ -132,13 +134,13 @@ class FileWriter:
|
|
|
132
134
|
:param path: str: Path of the file to be written.
|
|
133
135
|
:param size_in_mb: int: Size in MB of the file to be written.
|
|
134
136
|
"""
|
|
135
|
-
self.__handle_fs(path, flag=
|
|
137
|
+
self.__handle_fs(path, flag=True)
|
|
136
138
|
counter = 0
|
|
137
139
|
while True:
|
|
138
140
|
full_path = f"{path}/part-{str(counter).zfill(6)}.{self.write_format}"
|
|
139
141
|
dataframe = self.microbatch_def()
|
|
140
142
|
self.dict_format[self.write_format](dataframe, full_path)()
|
|
141
143
|
size_bytes = self.__get_dir_size(path)
|
|
142
|
-
if counter % 100 == 0: print(f"Size: {size_bytes/2**20:.2f} MB")
|
|
144
|
+
#if counter % 100 == 0: print(f"Size: {size_bytes/2**20:.2f} MB")
|
|
143
145
|
if self.__get_dir_size(path) >= size_in_mb*2**20: break
|
|
144
146
|
counter += 1
|
|
@@ -8,7 +8,7 @@ import pandas as pd
|
|
|
8
8
|
from pandas import DataFrame as PandasDF
|
|
9
9
|
|
|
10
10
|
from rand_engine.main.i_random_spec import IRandomSpec
|
|
11
|
-
from rand_engine.main
|
|
11
|
+
from rand_engine.main import RandGenerator
|
|
12
12
|
from rand_engine.main.fs_utils import FSUtils, DBFSUtils
|
|
13
13
|
|
|
14
14
|
from pyspark.sql.functions import coalesce
|
|
@@ -34,10 +34,10 @@ class FilesGenerator:
|
|
|
34
34
|
def _get_file_path(self) -> str:
|
|
35
35
|
return f"{self.base_path}/{self.file_name}_{str(uuid4())[:8]}.{self.ext}"
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
|
|
38
38
|
def generate_sample(self, size: int=100) -> PandasDF:
|
|
39
39
|
return (
|
|
40
|
-
|
|
40
|
+
RandGenerator(self.footprint.metadata())
|
|
41
41
|
.generate_pandas_df(size, transformer=self.footprint.transformer())
|
|
42
42
|
.get_df()
|
|
43
43
|
)
|
|
@@ -55,7 +55,7 @@ class FilesGenerator:
|
|
|
55
55
|
def write_file(self, size: int=100, const_cols={}):
|
|
56
56
|
file_path = self._get_file_path()
|
|
57
57
|
_ = (
|
|
58
|
-
|
|
58
|
+
RandGenerator(self.footprint.metadata()) \
|
|
59
59
|
.generate_pandas_df(size, transformer=self.footprint.transformer(**const_cols))
|
|
60
60
|
.write() \
|
|
61
61
|
.mode("overwrite") \
|
|
@@ -108,7 +108,7 @@ class CDCGenerator(FilesGenerator):
|
|
|
108
108
|
metadata = self.footprint.metadata()
|
|
109
109
|
size = df_pks_to_change.shape[0]
|
|
110
110
|
transformer = self.footprint.transformer_cdc_update(null_rate=null_rate, **const_cols)
|
|
111
|
-
df_data =
|
|
111
|
+
df_data = RandGenerator(metadata).generate_pandas_df(size, transformer).get_df()
|
|
112
112
|
for coluna in self.pk_cols: df_data[coluna] = df_pks_to_change[coluna]
|
|
113
113
|
if null_rate != 1:
|
|
114
114
|
cols_to_check = [col for col in df_data.columns if col not in self.pk_cols + list(const_cols.keys())]
|
|
@@ -131,7 +131,7 @@ class DBFSUtils(FSUtils):
|
|
|
131
131
|
|
|
132
132
|
def rm(self, path: str, recursive: bool = False) -> None:
|
|
133
133
|
try:
|
|
134
|
-
result = self.dbutils.fs.rm(path,
|
|
134
|
+
result = self.dbutils.fs.rm(path, recursive)
|
|
135
135
|
if not result:
|
|
136
136
|
raise Exception(f"Failed to delete file {path}")
|
|
137
137
|
except Exception as e:
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import List, Dict, Optional, Generator, Callable, Any
|
|
6
|
+
from rand_engine.rand_generator import RandGenerator
|
|
7
|
+
from rand_engine.file_writer import FileWriter
|
|
8
|
+
from rand_engine.utils.stream_handler import StreamHandler
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RandEngine:
    """Fluent entry point that turns a random spec into pandas/Spark data.

    The engine never stores data: ``actual_dataframe`` holds a zero-argument
    callable that generates a fresh DataFrame every time it is invoked.
    """

    def __init__(self, random_spec, seed: bool = False):
        """
        :param random_spec: Column specification handed to ``RandGenerator``.
        :param seed: bool: When true the global NumPy RNG is seeded with 42,
            otherwise it is re-seeded from fresh entropy.
        """
        np.random.seed(42 if seed else None)
        self.actual_dataframe: Optional[Callable[[], Any]] = None
        self.data_generator = RandGenerator(random_spec)
        self._mode = "pandas"
        self._size = 1000
        self._transformers: List[Optional[Callable]] = []

    def _lazy_pandas(self, size: int) -> Callable[[], pd.DataFrame]:
        """Return a callable that builds a new pandas DataFrame with ``size`` rows."""
        def build_pandas():
            generator = self.data_generator
            df_pandas = generator.generate_first_level(size=size)
            df_pandas = generator.handle_splitable(df_pandas)
            df_pandas = generator.apply_embedded_transformers(df_pandas)
            # Transformers are read at build time, so a later .transformers()
            # call still affects an already prepared builder.
            return generator.apply_global_transformers(df_pandas, self._transformers)
        return build_pandas

    def generate_pandas_df(self, size: int) -> "RandEngine":
        """
        Prepare the lazy pandas builder and store it in ``actual_dataframe``.
        No data is generated until that callable (or ``get_df``) is invoked.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, to allow call chaining.
        """
        self.actual_dataframe = self._lazy_pandas(size)
        return self

    def transformers(self, transformers: List[Optional[Callable]]):
        """
        Set the DataFrame-level transformers applied after generation.
        :param transformers: List[Optional[Callable]]: Functions receiving and returning a DataFrame.
        :return: RandEngine: The engine itself, to allow call chaining.
        """
        self._transformers = transformers
        return self

    def generate_spark_df(self, spark, size: int) -> Any:
        """
        Prepare the lazy Spark builder and store it in ``actual_dataframe``.
        :param spark: SparkSession: Object exposing ``createDataFrame``.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, to allow call chaining.
        """
        # The pandas builder is kept local so that invoking the Spark builder
        # does not overwrite ``actual_dataframe`` with the pandas one.
        build_pandas = self._lazy_pandas(size)

        def build_spark():
            return spark.createDataFrame(build_pandas())

        self.actual_dataframe = build_spark
        return self

    def mode(self, mode: str):
        """
        Choose the output flavour used by ``get_df``.
        :param mode: str: Either 'pandas' or 'spark'.
        :return: RandEngine: The engine itself, to allow call chaining.
        """
        assert mode in ["pandas", "spark"], "Mode not recognized. Use 'pandas' or 'spark'."
        self._mode = mode
        return self

    def size(self, size: int):
        """
        Set the number of rows used by ``get_df`` and ``stream_dict``.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, to allow call chaining.
        """
        self._size = size
        return self

    def get_df(self, spark=None):
        """
        Generate and return a DataFrame according to the configured mode and size.
        :param spark: SparkSession: Required when the mode is 'spark'.
        :return: A pandas DataFrame, or whatever ``spark.createDataFrame`` returns.
        :raises ValueError: If the mode is 'spark' and no session was given.
        """
        if self._mode == "pandas":
            self.generate_pandas_df(size=self._size)
        elif self._mode == "spark":
            if spark is None:
                raise ValueError("A SparkSession must be provided when mode is 'spark'.")
            self.generate_spark_df(spark=spark, size=self._size)
        assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
        return self.actual_dataframe()

    def stream_dict(self, min_throughput: int=1, max_throughput: int = 10) -> Generator:
        """
        This method creates a generator of records to be used in a streaming context.
        Each record is a dict of one generated row plus a 'timestamp_created' key.
        :param min_throughput: int: Minimum throughput to be generated.
        :param max_throughput: int: Maximum throughput to be generated.
        :return: Generator: Endless generator of records.
        """
        self.generate_pandas_df(size=self._size)
        assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
        while True:
            df_data_microbatch = self.actual_dataframe()
            df_data_parsed = StreamHandler.convert_dt_to_str(df_data_microbatch)
            for record in df_data_parsed.to_dict('records'):
                record["timestamp_created"] = round(time.time(), 3)
                yield record
                # Pacing between records is delegated to StreamHandler.
                StreamHandler.sleep_to_contro_throughput(min_throughput, max_throughput)

    def write(self, size):
        """
        Create a FileWriter fed by pandas micro-batches of ``size`` rows.
        :param size: int: Number of rows of every micro-batch.
        :return: FileWriter: Writer bound to the lazy pandas builder.
        """
        self.generate_pandas_df(size=size)
        # Capture the builder now so the writer keeps producing pandas batches
        # even if ``actual_dataframe`` is replaced later.
        lazy_dataframe = self.actual_dataframe
        return FileWriter(lambda: lazy_dataframe)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
if __name__ == '__main__':
|
|
101
|
+
|
|
102
|
+
pass
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from typing import List, Optional, Callable
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RandGenerator:
    """Builds pandas DataFrames from a random spec.

    The spec is a mapping ``column name -> options`` where the options read
    by this class are: ``method`` (callable receiving the row count first),
    ``args`` or ``kwargs`` (extra arguments for ``method``), ``transformers``
    (element-wise functions), and ``splitable`` / ``sep`` / ``cols`` (split one
    generated string column into several).
    """

    def __init__(self, random_spec):
        """
        :param random_spec: dict: Column specification described in the class docstring.
        """
        self.random_spec = random_spec

    def generate_first_level(self, size: int):
        """
        Generate one column per spec entry and assemble them into a DataFrame.
        When an entry has ``args`` they are passed positionally and ``kwargs`` is
        ignored; otherwise ``kwargs`` (default: none) are passed by keyword.
        :param size: int: Number of rows to be generated.
        :return: pd.DataFrame: DataFrame with the raw generated columns.
        :raises Exception: If a column generator fails; the message names the
            column and the original error is attached as the cause.
        """
        dict_data = {}
        for name, spec in self.random_spec.items():
            try:
                if "args" in spec:
                    dict_data[name] = spec["method"](size, *spec["args"])
                else:
                    dict_data[name] = spec["method"](size, **spec.get("kwargs", {}))
            except Exception as e:
                raise Exception(f"Error generating data for column '{name}': {e}") from e
        return pd.DataFrame(dict_data)

    def apply_embedded_transformers(self, df):
        """
        Apply, in order, the element-wise ``transformers`` declared on each column.
        :param df: pd.DataFrame: DataFrame to transform (modified in place).
        :return: pd.DataFrame: The same DataFrame.
        """
        for col, spec in self.random_spec.items():
            for transformer in spec.get("transformers") or ():
                df[col] = df[col].apply(transformer)
        return df

    def apply_global_transformers(self, df, transformers: List[Optional[Callable]]):
        """
        Pipe the DataFrame through each DataFrame-level transformer in order.
        :param df: pd.DataFrame: DataFrame to transform.
        :param transformers: List[Optional[Callable]]: Functions receiving and
            returning a DataFrame; ``None`` entries are skipped.
        :return: pd.DataFrame: Result of the last transformer (or ``df`` itself).
        """
        for transformer in transformers or ():
            if transformer is not None:
                df = transformer(df)
        return df

    def handle_splitable(self, df):
        """
        Split every column flagged ``splitable`` into the columns listed in ``cols``
        using ``sep`` (default ';'), then drop the source column.
        :param df: pd.DataFrame: DataFrame to transform (modified in place).
        :return: pd.DataFrame: The same DataFrame.
        :raises ValueError: If a splitable column does not declare ``cols``.
        """
        for key, value in self.random_spec.items():
            if not value.get("splitable"):
                continue
            cols = value.get("cols")
            if not cols:
                raise ValueError(f"Column '{key}' is splitable but declares no 'cols'.")
            sep = value.get("sep", ";")
            df[cols] = df[key].str.split(sep, expand=True)
            df.drop(columns=[key], inplace=True)
        return df
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
import pyspark.sql.functions as F
|
|
3
|
+
from pyspark.sql.functions import randn, rand, randstr
|
|
4
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
class RandSpark:
    """Wrapper around a Spark DataFrame that adds random-column helpers.

    Every ``withColumn*`` helper returns a new ``RandSpark`` so calls can be
    chained; any attribute not defined here is forwarded to the wrapped
    DataFrame.
    """

    def __init__(self, spark, df: "DataFrame"):
        """
        :param spark: SparkSession used to create auxiliary DataFrames.
        :param df: DataFrame: Spark DataFrame to be wrapped.
        """
        self.spark = spark
        self._df = df

    def _wrap(self, df):
        """Wrap ``df`` in a new RandSpark that shares this instance's session."""
        return RandSpark(self.spark, df)

    def withColumnRandInt(self, col_name="rand_int", min_size=0, max_size=10):
        """
        Add an integer column computed as ``rand() * (max_size - min_size) + min_size``
        cast to int.
        :param col_name: str: Name of the new column.
        :param min_size: Lower bound of the scaled random value.
        :param max_size: Upper bound of the scaled random value.
        :return: RandSpark: Wrapper around the extended DataFrame.
        """
        scaled = F.rand() * (max_size - min_size) + min_size
        return self._wrap(self._df.withColumn(col_name, scaled.cast("int")))

    def withColumnRandFloat(self, col_name="rand_float", min_size=0.0, max_size=10.0, decimals=2):
        """
        Add a float column computed as ``rand() * (max_size - min_size) + min_size``
        rounded to ``decimals`` places.
        :param col_name: str: Name of the new column.
        :param min_size: Lower bound of the scaled random value.
        :param max_size: Upper bound of the scaled random value.
        :param decimals: int: Number of decimal places kept.
        :return: RandSpark: Wrapper around the extended DataFrame.
        """
        scaled = F.rand() * (max_size - min_size) + min_size
        return self._wrap(self._df.withColumn(col_name, F.round(scaled, decimals)))

    def withColumnRandChoice(self, col_name="rand_choice", distincts=None):
        """
        Add a column whose value is picked at random from ``distincts``.
        A random index column is generated and joined against a small broadcast
        lookup table holding the candidate values.
        :param col_name: str: Name of the new column.
        :param distincts: list: Candidate values for the new column.
        :return: RandSpark: Wrapper around the original columns plus ``col_name``.
        :raises ValueError: If ``distincts`` is empty.
        """
        distincts = [] if distincts is None else list(distincts)
        if not distincts:
            raise ValueError("'distincts' must contain at least one value.")
        df_columns = self._df.columns
        aux_col = f"{col_name}_aux"

        # Lookup table: one row per candidate value, keyed by its position.
        df_pd = pd.DataFrame(distincts, columns=[col_name])
        df_pd[aux_col] = range(len(distincts))
        df_lookup = self.spark.createDataFrame(df_pd)

        # Random position scaled by the number of candidates.
        df_indexed = self._df.withColumn(aux_col, (F.rand() * len(distincts)).cast("int"))
        df_joined = (
            df_indexed.alias("a")
            .join(F.broadcast(df_lookup).alias("b"), on=aux_col, how="left")
            .select(*df_columns, f"b.{col_name}")
        )
        return self._wrap(df_joined)

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped DataFrame."""
        # __getattr__ only runs when normal lookup fails; if ``_df`` itself is
        # missing (instance not initialised yet) delegating would recurse forever.
        if name == "_df":
            raise AttributeError(name)
        return getattr(self._df, name)
|
|
@@ -5,7 +5,7 @@ import itertools
|
|
|
5
5
|
class DistinctUtils:
|
|
6
6
|
|
|
7
7
|
@classmethod
|
|
8
|
-
def handle_distincts_lvl_1(self, distinct_prop, precision):
|
|
8
|
+
def handle_distincts_lvl_1(self, distinct_prop, precision=1):
|
|
9
9
|
"""
|
|
10
10
|
This method generates a list of distinct values based on a dictionary of distinct values and their respective frequencies.
|
|
11
11
|
:param distinct_prop: dict: Dictionary containing the distinct values and their respective frequencies.
|
|
@@ -22,6 +22,13 @@ class DistinctUtils:
|
|
|
22
22
|
data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
|
|
23
23
|
return data_flatted
|
|
24
24
|
|
|
25
|
+
@classmethod
|
|
26
|
+
def handle_distincts_lvl_22(self, distincts):
|
|
27
|
+
"""
|
|
28
|
+
This method generates a list of distinct values based on a dictionary of distinct values and their respective frequencies.
|
|
29
|
+
:param distincts: dict: Dictionary containing the distinct values and their respective frequencies."""
|
|
30
|
+
data_flatted = [(j, i) for j in distincts for i in distincts[j]]
|
|
31
|
+
return data_flatted
|
|
25
32
|
|
|
26
33
|
@classmethod
|
|
27
34
|
def handle_distincts_lvl_3(self, distincts, sep=";"):
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from random import randint
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Changer:
    """Randomly perturbs selected columns of a pandas DataFrame.

    Numeric columns get a random signed offset added; object columns get
    their values rotated by a random number of positions.
    """

    def __init__(self, cols_to_change):
        """
        :param cols_to_change: Collection of column names that may be modified.
        """
        self.cols_to_change = cols_to_change

    def __transformer_numeric(self, df):
        """
        Add a random signed offset to each selected int/float column.
        The offset magnitude is drawn from ``[min, max)`` of the column itself.
        Columns whose minimum is not below their maximum (constant, empty or
        all-NaN) are left untouched because no offset range exists for them.
        :param df: pd.DataFrame: DataFrame to transform (modified in place).
        :return: pd.DataFrame: The same DataFrame.
        """
        numeric_types = ['int64', 'int32', 'float32', 'float64']
        n_rows = len(df)
        for col in df.columns:
            if col not in self.cols_to_change or df[col].dtype not in numeric_types:
                continue
            lowest, highest = df[col].min(), df[col].max()
            if not lowest < highest:  # also false when either bound is NaN
                continue
            low, high = int(lowest), int(highest)
            if low >= high:  # float range narrower than one integer step
                continue
            # NOTE(review): this resets the global NumPy seed, which discards any
            # seed set earlier by the caller — confirm that this is intended.
            np.random.seed(None)
            magnitude = np.random.randint(low, high, size=n_rows)
            sign = np.random.choice([-1, 1], size=n_rows)
            # Work on local arrays instead of scratch columns so that existing
            # columns named '<col>_random' / '<col>_random_signal' are preserved.
            df[col] = df[col] + magnitude * sign
        return df

    def __transformer_object(self, df):
        """
        Rotate the values of each selected object column by a random shift.
        :param df: pd.DataFrame: DataFrame to transform (modified in place).
        :return: pd.DataFrame: The same DataFrame.
        """
        n_rows = len(df)
        if n_rows == 0:
            # randint(1, 0) would raise; nothing to rotate anyway.
            return df
        for col in df.columns:
            if col in self.cols_to_change and df[col].dtype == 'object':
                df[col] = np.roll(df[col], randint(1, n_rows))
        return df

    def updater(self, df):
        """
        Apply the numeric and then the object perturbation to ``df``.
        :param df: pd.DataFrame: DataFrame to transform (modified in place).
        :return: pd.DataFrame: The transformed DataFrame.
        """
        for transform in (self.__transformer_numeric, self.__transformer_object):
            df = transform(df)
        return df
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import random
|
|
3
|
-
from datetime import datetime as dt, timedelta
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class DatetimeCore:
|
|
7
|
-
|
|
8
|
-
@classmethod
|
|
9
|
-
def gen_unix_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
|
|
10
|
-
"""
|
|
11
|
-
This method generates an array of random unix timestamps.
|
|
12
|
-
:param size: int: Number of elements to be generated.
|
|
13
|
-
:param start: str: Start date of the generated timestamps.
|
|
14
|
-
:param end: str: End date of the generated timestamps.
|
|
15
|
-
:param format: str: Format of the input dates."""
|
|
16
|
-
dt_start, dt_end = dt.strptime(start, format), dt.strptime(end, format)
|
|
17
|
-
if dt_start < dt(1970, 1, 1): dt_start = dt(1970, 1, 1)
|
|
18
|
-
timestamp_start, timestamp_end = dt_start.timestamp(), dt_end.timestamp()
|
|
19
|
-
int_array = np.random.randint(timestamp_start, timestamp_end, size)
|
|
20
|
-
return int_array
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@classmethod
|
|
24
|
-
def gen_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
|
|
25
|
-
"""
|
|
26
|
-
This method generates an array of random timestamps.
|
|
27
|
-
:param size: int: Number of elements to be generated.
|
|
28
|
-
:param start: str: Start date of the generated timestamps.
|
|
29
|
-
:param end: str: End date of the generated timestamps.
|
|
30
|
-
:param format: str: Format of the input dates.
|
|
31
|
-
:return: np.ndarray: Array of random timestamps."""
|
|
32
|
-
|
|
33
|
-
date_array = self.gen_unix_timestamps(size, start, end, format).astype('datetime64[s]')
|
|
34
|
-
return date_array
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@classmethod
|
|
38
|
-
def gen_datetimes(self, size: int, start: str, end: str, format_in: str, format_out: str):
|
|
39
|
-
timestamp_array = self.gen_unix_timestamps(size, start, end, format_in)
|
|
40
|
-
return [dt.fromtimestamp(i).strftime(format_out) for i in timestamp_array]
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if __name__ == '__main__':
|
|
45
|
-
pass
|
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
from functools import reduce
|
|
2
|
-
from typing import List, Any, Iterator
|
|
3
|
-
import numpy as np
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class DistinctCore:
|
|
7
|
-
|
|
8
|
-
@classmethod
|
|
9
|
-
def gen_distincts_typed(self, size: int, distinct: List[Any]) -> np.ndarray:
|
|
10
|
-
"""
|
|
11
|
-
This method generates an array of random distinct values.
|
|
12
|
-
:param size: int: Number of elements to be generated.
|
|
13
|
-
:param distinct: List[Any]: List of distinct values to be generated.
|
|
14
|
-
:return: np.ndarray: Array of random distinct values.
|
|
15
|
-
"""
|
|
16
|
-
assert len(list(set([type(x) for x in distinct]))) == 1
|
|
17
|
-
return np.random.choice(distinct, size)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@classmethod
|
|
21
|
-
def gen_distincts_untyped(self, size: int, distinct: List[Any]) -> List[Any]:
|
|
22
|
-
"""
|
|
23
|
-
This method generates an array of random distinct values.
|
|
24
|
-
:param size: int: Number of elements to be generated.
|
|
25
|
-
:param distinct: List[Any]: List of distinct values to be generated.
|
|
26
|
-
:return: Iterator: Iterator of random distinct values.
|
|
27
|
-
"""
|
|
28
|
-
return list(map(lambda x: distinct[x], np.random.randint(0, len(distinct), size)))
|
|
29
|
-
|
|
30
|
-
@classmethod
|
|
31
|
-
def gen_complex_distincts(self, size: int, pattern="x.x.x-x", replacement="x", templates=[]):
|
|
32
|
-
"""
|
|
33
|
-
This method generates an array of random distinct values.
|
|
34
|
-
:param size: int: Number of elements to be generated.
|
|
35
|
-
:param pattern: str: Pattern to be replaced.
|
|
36
|
-
:param replacement: str: Replacement of the pattern.
|
|
37
|
-
:param templates: List[Dict]: List of dictionaries containing the method and parameters to be used in the replacement.
|
|
38
|
-
:return: np.ndarray: Array of random distinct values.
|
|
39
|
-
"""
|
|
40
|
-
assert pattern.count(replacement) == len(templates)
|
|
41
|
-
list_of_lists, counter = [], 0
|
|
42
|
-
for replacer_cursor in range(len(pattern)):
|
|
43
|
-
if pattern[replacer_cursor] == replacement:
|
|
44
|
-
list_of_lists.append(templates[counter]["method"](size, **templates[counter]["parms"]))
|
|
45
|
-
counter += 1
|
|
46
|
-
else:
|
|
47
|
-
list_of_lists.append(np.array([pattern[replacer_cursor] for i in range(size)]))
|
|
48
|
-
return reduce(lambda a, b: a.astype('str') + b.astype('str'), list_of_lists)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
if __name__ == '__main__':
|
|
53
|
-
pass
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
# def replace_duplicate(array_input, replace):
|
|
58
|
-
# result = list(set(array_input))
|
|
59
|
-
# result.extend([replace for i in range(len(array_input)-len(list(set(array_input))))])
|
|
60
|
-
# random.shuffle(result)
|
|
61
|
-
# return result
|
|
62
|
-
|
|
63
|
-
# def handle_string_format(array_input, **kwargs):
|
|
64
|
-
# return replace_duplicate(array_input, np.nan) \
|
|
65
|
-
# if kwargs.get("rm_dupl") else array_input
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
|
|
3
|
-
def expand_array(size=10, base_array=[]):
|
|
4
|
-
return [base_array[int(i % len(base_array))] for i in range(size)]
|
|
5
|
-
|
|
6
|
-
def reduce_array(size=10, base_array=[]):
|
|
7
|
-
int_array = [int(i) for i in np.linspace(0, size-1, len(base_array))]
|
|
8
|
-
reduced = [int_array.index(i) for i in range(size)]
|
|
9
|
-
result = [base_array[i] for i in reduced]
|
|
10
|
-
return result
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def spaced_array(interval, num_part=2):
|
|
14
|
-
return list(np.linspace(interval[0], interval[1], num_part))
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class NumericCore:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@classmethod
|
|
8
|
-
def gen_ints(self, size: int, min: int, max: int) -> np.ndarray:
|
|
9
|
-
"""
|
|
10
|
-
This method generates an array of random integers.
|
|
11
|
-
:param size: int: Number of elements to be generated.
|
|
12
|
-
:param min: int: Minimum value of the generated numbers.
|
|
13
|
-
:param max: int: Maximum value of the generated numbers.
|
|
14
|
-
:return: np.ndarray: Array of random integers.
|
|
15
|
-
"""
|
|
16
|
-
return np.random.randint(min, max + 1, size)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@classmethod
|
|
20
|
-
def gen_ints_zfilled(self, size: int, length: int) -> np.ndarray:
|
|
21
|
-
"""
|
|
22
|
-
This method generates an array of random integers with a fixed length.
|
|
23
|
-
:param size: int: Number of elements to be generated.
|
|
24
|
-
:param length: int: Length of the generated numbers.
|
|
25
|
-
:return: np.ndarray: Array of random integers.
|
|
26
|
-
"""
|
|
27
|
-
str_arr = np.random.randint(0, 10**length, size).astype('str')
|
|
28
|
-
return np.char.zfill(str_arr, length)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@classmethod
|
|
32
|
-
def gen_floats(self, size: int, min: int, max: int, round: int = 2):
|
|
33
|
-
"""
|
|
34
|
-
This method generates an array of random floats.
|
|
35
|
-
:param size: int: Number of elements to be generated.
|
|
36
|
-
:param min: int: Minimum value of the generated numbers.
|
|
37
|
-
:param max: int: Maximum value of the generated numbers.
|
|
38
|
-
:param round: int: Number of decimal places to round the generated numbers. Default is 2.
|
|
39
|
-
:return: np.ndarray: Array of random floats.
|
|
40
|
-
"""
|
|
41
|
-
sig_part = np.random.randint(min, max, size)
|
|
42
|
-
decimal = np.random.randint(0, 10 ** round, size)
|
|
43
|
-
return sig_part + (decimal / 10 ** round) if round > 0 else sig_part
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
@classmethod
|
|
47
|
-
def gen_floats_normal(self, size: int, mean: int, std: int, round: int = 2):
|
|
48
|
-
"""
|
|
49
|
-
This method generates an array of random floats with a normal distribution.
|
|
50
|
-
:param size: int: Number of elements to be generated.
|
|
51
|
-
:param mean: int: Mean of the distribution.
|
|
52
|
-
:param std: int: Standard deviation of the distribution.
|
|
53
|
-
:param round: int: Number of decimal places to round the generated numbers. Default is 2.
|
|
54
|
-
:return: np.ndarray: Array of random floats.
|
|
55
|
-
"""
|
|
56
|
-
return np.round(np.random.normal(mean, std, size), round)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
if __name__ == '__main__':
|
|
62
|
-
pass
|
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import time
|
|
3
|
-
import pandas as pd
|
|
4
|
-
from typing import List, Dict, Optional, Generator, Callable, Any
|
|
5
|
-
from rand_engine.main.file_writer import FileWriter
|
|
6
|
-
from rand_engine.main.stream_handle import StreamHandle
|
|
7
|
-
|
|
8
|
-
class DataGenerator:
|
|
9
|
-
|
|
10
|
-
def __init__(self, random_spec):
|
|
11
|
-
self.random_spec = random_spec
|
|
12
|
-
self.actual_dataframe = None
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def handle_splitable(self, metadata, df):
|
|
16
|
-
for key, value in metadata.items():
|
|
17
|
-
if value.get("splitable"):
|
|
18
|
-
sep = value.get("sep", ";")
|
|
19
|
-
cols = value.get("cols")
|
|
20
|
-
df[cols] = df[key].str.split(sep, expand=True)
|
|
21
|
-
df.drop(columns=[key], inplace=True)
|
|
22
|
-
return df
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def generate_pandas_df(self, size: int, transformer: Optional[Callable]=None) -> pd.DataFrame:
|
|
26
|
-
"""
|
|
27
|
-
This method generates a pandas DataFrame based on random data specified in the metadata parameter.
|
|
28
|
-
:param size: int: Number of rows to be generated.
|
|
29
|
-
:param transformer: Optional[Callable]: Function to transform the generated data.
|
|
30
|
-
:return: pd.DataFrame: DataFrame with the generated data.
|
|
31
|
-
"""
|
|
32
|
-
assert type(self.random_spec) is dict, "You need to pass a random_spec parameter to generate the random data."
|
|
33
|
-
def first_level():
|
|
34
|
-
dict_data = {key: value["method"](size, **value["parms"]) for key, value in self.random_spec.items()}
|
|
35
|
-
df_pandas = pd.DataFrame(dict_data)
|
|
36
|
-
df_pandas = self.handle_splitable(self.random_spec, df_pandas)
|
|
37
|
-
if transformer: return transformer(df_pandas)
|
|
38
|
-
return df_pandas
|
|
39
|
-
self.actual_dataframe = first_level
|
|
40
|
-
return self
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def generate_spark_df(self, spark, size: int, transformer: Optional[Callable]=None) -> Any:
|
|
44
|
-
"""
|
|
45
|
-
This method generates a Spark DataFrame based on random data specified in the random_spec parameter.
|
|
46
|
-
:param spark: SparkSession: SparkSession object.
|
|
47
|
-
:param size: int: Number of rows to be generated.
|
|
48
|
-
:param transformer: Optional[Callable]: Function to transform the generated data."""
|
|
49
|
-
def second_level():
|
|
50
|
-
self.generate_pandas_df(size=size, transformer=transformer)
|
|
51
|
-
df_spark = spark.createDataFrame(self.actual_dataframe())
|
|
52
|
-
return df_spark
|
|
53
|
-
self.actual_dataframe = second_level
|
|
54
|
-
return self
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def get_df(self):
|
|
58
|
-
assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
|
|
59
|
-
return self.actual_dataframe()
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def stream_dict(self, min_throughput: int=1, max_throughput: int = 10) -> Generator:
|
|
63
|
-
"""
|
|
64
|
-
This method creates a generator of records to be used in a streaming context.
|
|
65
|
-
:param min_throughput: int: Minimum throughput to be generated.
|
|
66
|
-
:param max_throughput: int: Maximum throughput to be generated.
|
|
67
|
-
:return: Generator: Generator of records.
|
|
68
|
-
"""
|
|
69
|
-
assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
|
|
70
|
-
while True:
|
|
71
|
-
df_data_microbatch = self.actual_dataframe()
|
|
72
|
-
df_data_parsed = StreamHandle.convert_dt_to_str(df_data_microbatch)
|
|
73
|
-
list_of_records = df_data_parsed.to_dict('records')
|
|
74
|
-
for record in list_of_records:
|
|
75
|
-
record["timestamp_created"] = round(time.time(), 3)
|
|
76
|
-
yield record
|
|
77
|
-
StreamHandle.sleep_to_contro_throughput(min_throughput, max_throughput)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def write(self):
|
|
81
|
-
microbatch_def = lambda: self.actual_dataframe
|
|
82
|
-
return FileWriter(microbatch_def)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
if __name__ == '__main__':
|
|
87
|
-
|
|
88
|
-
pass
|
|
File without changes
|
|
File without changes
|
{rand_engine-0.4.4/rand_engine/main → rand_engine-0.4.7/rand_engine/interfaces}/i_random_spec.py
RENAMED
|
File without changes
|