rand-engine 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rand_engine-0.2.0 → rand_engine-0.3.0}/PKG-INFO +1 -1
- {rand_engine-0.2.0 → rand_engine-0.3.0}/pyproject.toml +1 -2
- rand_engine-0.3.0/rand_engine/core/distinct_utils.py +54 -0
- rand_engine-0.3.0/rand_engine/main/data_generator.py +74 -0
- rand_engine-0.2.0/rand_engine/core/distinct_utils.py +0 -23
- rand_engine-0.2.0/rand_engine/main/dataframe_builder.py +0 -64
- {rand_engine-0.2.0 → rand_engine-0.3.0}/README.md +0 -0
- {rand_engine-0.2.0 → rand_engine-0.3.0}/rand_engine/__init__.py +0 -0
- {rand_engine-0.2.0 → rand_engine-0.3.0}/rand_engine/core/datetime_core.py +0 -0
- {rand_engine-0.2.0 → rand_engine-0.3.0}/rand_engine/core/distinct_core.py +0 -0
- {rand_engine-0.2.0 → rand_engine-0.3.0}/rand_engine/core/general_utils.py +0 -0
- {rand_engine-0.2.0 → rand_engine-0.3.0}/rand_engine/core/numeric_core.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: rand-engine
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
|
|
5
5
|
Home-page: https://github.com/marcoaureliomenezes/rand_engine
|
|
6
6
|
Author: marcoaureliomenezes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "rand-engine"
|
|
3
|
-
version = "0.2.0"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
|
|
5
5
|
authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
|
|
6
6
|
repository = "https://github.com/marcoaureliomenezes/rand_engine"
|
|
@@ -12,7 +12,6 @@ numpy = "^2.1.1"
|
|
|
12
12
|
pandas = "^2.2.2"
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
[tool.poetry.group.test.dependencies]
|
|
17
16
|
pytest = "^8.3.3"
|
|
18
17
|
faker = "^28.4.1"
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
|
|
2
|
+
import itertools
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class DistinctUtils:
|
|
6
|
+
|
|
7
|
+
@classmethod
|
|
8
|
+
def handle_distincts_lvl_1(self, distinct_prop, precision):
|
|
9
|
+
return [ key for key, value in distinct_prop.items() for i in range(value * precision )]
|
|
10
|
+
|
|
11
|
+
@classmethod
|
|
12
|
+
def handle_distincts_lvl_2(self, distincts, sep=";"):
|
|
13
|
+
data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
|
|
14
|
+
return data_flatted
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@classmethod
|
|
18
|
+
def handle_distincts_lvl_3(self, distincts, sep=";"):
|
|
19
|
+
parm_paired_distincts = {k: list(map(lambda x: f"{x[0]}@!{x[1]}", v)) for k, v in distincts.items()}
|
|
20
|
+
data_flatted = self.handle_distincts_lvl_2(parm_paired_distincts, sep)
|
|
21
|
+
result = []
|
|
22
|
+
for i in data_flatted:
|
|
23
|
+
value, size = i.split("@!")
|
|
24
|
+
result.extend([value for _ in range(int(size))])
|
|
25
|
+
return result
|
|
26
|
+
|
|
27
|
+
@classmethod
|
|
28
|
+
def handle_distincts_lvl_4(self, distincts, sep=";"):
|
|
29
|
+
combinations = [list(itertools.product([k], *v)) for k, v in distincts.items()]
|
|
30
|
+
result = [sep.join(i) for i in list(itertools.chain(*combinations))]
|
|
31
|
+
return result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == '__main__':
|
|
36
|
+
distincts = {"OPC": ["C_OPC","V_OPC"], "SWP": ["C_SWP", "V_SWP"]}
|
|
37
|
+
|
|
38
|
+
distinct_1 = {"OPC": [["C_OPC","V_OPC"], ["PF", "PJ"], ["IN"]], "SWP": [["C_SWP", "V_SWP"], ["AF", "ME"], ["NULL"]]}
|
|
39
|
+
distinct_2 = {"OPC": [{"C_OPC": ["PF", "PJ"]}, {"V_OPC": ["NA"]}], "SWP": [{"C_SWP": ["AP"]}, {"V_SWP": ["MA", "ME"]}]}
|
|
40
|
+
#print(DistinctUtils.handle_distincts_lvl_5(distinct_2)[]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def rec(structure):
|
|
44
|
+
if isinstance(structure, list):
|
|
45
|
+
return [rec(i) for i in structure]
|
|
46
|
+
if isinstance(structure, dict):
|
|
47
|
+
return [[[k], rec(v)] for k, v in structure.items()]
|
|
48
|
+
return structure
|
|
49
|
+
|
|
50
|
+
import numpy as np
|
|
51
|
+
|
|
52
|
+
result = rec(distinct_2)
|
|
53
|
+
|
|
54
|
+
combinations = np.array(list(itertools.product(*result)))
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import List, Dict, Optional, Generator, Callable
|
|
3
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
import random
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataGenerator:
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def handle_splitable(self, metadata, df):
|
|
14
|
+
for key, value in metadata.items():
|
|
15
|
+
if value.get("splitable"):
|
|
16
|
+
sep = value.get("sep", ";")
|
|
17
|
+
cols = value.get("cols")
|
|
18
|
+
df[cols] = df[key].str.split(sep, expand=True)
|
|
19
|
+
df.drop(columns=[key], inplace=True)
|
|
20
|
+
return df
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def __convert_dt_to_str(self, dataframe: pd.DataFrame) -> pd.DataFrame:
|
|
24
|
+
df_result = dataframe.copy()
|
|
25
|
+
for column in df_result.columns:
|
|
26
|
+
if 'datetime64' in str(df_result[column].dtype):
|
|
27
|
+
df_result[column] = df_result[column].astype(str)
|
|
28
|
+
return df_result
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __sleep_to_contro_throughput(self, min_throughput: int, max_throughput: int):
|
|
32
|
+
sleep_time = 1 / random.uniform(min_throughput, max_throughput)
|
|
33
|
+
time.sleep(sleep_time)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def create_pandas_df(self, size: int, metadata: dict, transformer: Optional[Callable]=None) -> pd.DataFrame:
|
|
37
|
+
dict_data = {key: value["method"](size, **value["parms"]) for key, value in metadata.items()}
|
|
38
|
+
df_data = pd.DataFrame(dict_data)
|
|
39
|
+
df_data_final = self.handle_splitable(metadata, df_data)
|
|
40
|
+
if transformer: df_data_final = transformer(df_data_final)
|
|
41
|
+
return df_data_final
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_spark_df(self, spark: SparkSession, size: int, metadata: dict, transformer: Optional[Callable]=None) -> DataFrame:
|
|
45
|
+
df_data = self.create_pandas_df(size=size, metadata=metadata)
|
|
46
|
+
df_final = spark.createDataFrame(df_data)
|
|
47
|
+
return df_final
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def create_streaming_records(self, microbatch_size:int, metadata: dict, transformer: Optional[Callable]=None, min_throughput: int=1, max_throughput: int = 10) -> Generator:
|
|
51
|
+
while True:
|
|
52
|
+
df_data_microbatch = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
|
|
53
|
+
df_data_parsed = self.__convert_dt_to_str(df_data_microbatch)
|
|
54
|
+
list_of_records = df_data_parsed.to_dict('records')
|
|
55
|
+
for record in list_of_records:
|
|
56
|
+
record["timestamp_created"] = round(time.time(), 3)
|
|
57
|
+
yield record
|
|
58
|
+
self.__sleep_to_contro_throughput(min_throughput, max_throughput)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def create_csv_file(self, microbatch_size: int, size_in_mb: int, metadata: dict, path: str, transformer: Optional[Callable]=None) -> None:
|
|
62
|
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
63
|
+
while True:
|
|
64
|
+
df = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
|
|
65
|
+
df.to_csv(path, mode='a', header=False, index=False)
|
|
66
|
+
if os.path.getsize(path) > size_in_mb * 1024 * 1024: break
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
if __name__ == '__main__':
|
|
74
|
+
pass
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
class DistinctUtils:
|
|
4
|
-
|
|
5
|
-
@classmethod
|
|
6
|
-
def handle_distincts_lvl_1(self, distinct_prop, precision):
|
|
7
|
-
return [ key for key, value in distinct_prop.items() for i in range(value * precision )]
|
|
8
|
-
|
|
9
|
-
@classmethod
|
|
10
|
-
def handle_distincts_lvl_2(self, distincts, sep=";"):
|
|
11
|
-
data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
|
|
12
|
-
return data_flatted
|
|
13
|
-
|
|
14
|
-
@classmethod
|
|
15
|
-
def handle_distincts_lvl_3(self, distincts, sep=";"):
|
|
16
|
-
parm_paired_distincts = {k: list(map(lambda x: f"{x[0]}@!{x[1]}", v)) for k, v in distincts.items()}
|
|
17
|
-
data_flatted = self.handle_distincts_lvl_2(parm_paired_distincts, sep)
|
|
18
|
-
result = []
|
|
19
|
-
for i in data_flatted:
|
|
20
|
-
value, size = i.split("@!")
|
|
21
|
-
result.extend([value for _ in range(int(size))])
|
|
22
|
-
return result
|
|
23
|
-
|
|
@@ -1,64 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class BulkRandEngine:
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def handle_splitable(self, metadata, df):
|
|
10
|
-
for key, value in metadata.items():
|
|
11
|
-
if value.get("splitable"):
|
|
12
|
-
sep = value.get("sep", ";")
|
|
13
|
-
cols = value.get("cols")
|
|
14
|
-
df[cols] = df[key].str.split(sep, expand=True)
|
|
15
|
-
df.drop(columns=[key], inplace=True)
|
|
16
|
-
return df
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def create_pandas_df(self, size, metadata):
|
|
21
|
-
df_pandas = pd.DataFrame({key: value["method"](size, **value["parms"]) for key, value in metadata.items()})
|
|
22
|
-
df_pandas = self.handle_splitable(metadata, df_pandas)
|
|
23
|
-
return df_pandas
|
|
24
|
-
|
|
25
|
-
@classmethod
|
|
26
|
-
def convert_datetimes_to_string(self, pandas_df):
|
|
27
|
-
for column in pandas_df.columns:
|
|
28
|
-
if pandas_df[column].dtype == 'datetime64[ns]':
|
|
29
|
-
pandas_df[column] = pandas_df[column].astype(str)
|
|
30
|
-
return pandas_df
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@classmethod
|
|
34
|
-
def create_streaming_df(self, pandas_df):
|
|
35
|
-
pandas_df = self.convert_datetimes_to_string(pandas_df)
|
|
36
|
-
list_of_dicts = pandas_df.to_dict('records')
|
|
37
|
-
for record in list_of_dicts:
|
|
38
|
-
yield json.dumps(record)
|
|
39
|
-
|
|
40
|
-
@classmethod
|
|
41
|
-
def create_streaming_series(self, pandas_series):
|
|
42
|
-
for record in pandas_series:
|
|
43
|
-
yield record
|
|
44
|
-
|
|
45
|
-
@classmethod
|
|
46
|
-
def create_file(path, word, limit_size):
|
|
47
|
-
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
48
|
-
with open(path, 'w') as file:
|
|
49
|
-
while file.tell() < limit_size:
|
|
50
|
-
file.write(word + '\n')
|
|
51
|
-
return True
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def microbatch_file_with_streaming(self, path, metadata, df_transformer, microbatch_size, total_size):
|
|
55
|
-
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
56
|
-
while True:
|
|
57
|
-
df = self.create_pandas_df(size=microbatch_size, metadata=metadata)
|
|
58
|
-
df = df_transformer(df)
|
|
59
|
-
df.to_csv(path, mode='a', header=False, index=False)
|
|
60
|
-
if os.path.getsize(path) > total_size:
|
|
61
|
-
break
|
|
62
|
-
|
|
63
|
-
if __name__ == '__main__':
|
|
64
|
-
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|