rand-engine 0.3.3__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rand_engine-0.3.3 → rand_engine-0.3.7}/PKG-INFO +6 -3
- {rand_engine-0.3.3 → rand_engine-0.3.7}/README.md +3 -0
- {rand_engine-0.3.3 → rand_engine-0.3.7}/pyproject.toml +3 -2
- {rand_engine-0.3.3 → rand_engine-0.3.7}/rand_engine/core/datetime_core.py +18 -13
- {rand_engine-0.3.3 → rand_engine-0.3.7}/rand_engine/core/distinct_core.py +23 -1
- {rand_engine-0.3.3 → rand_engine-0.3.7}/rand_engine/core/distinct_utils.py +9 -4
- rand_engine-0.3.7/rand_engine/core/numeric_core.py +62 -0
- rand_engine-0.3.7/rand_engine/main/data_generator.py +88 -0
- rand_engine-0.3.7/rand_engine/main/file_writer.py +131 -0
- rand_engine-0.3.7/rand_engine/main/stream_handle.py +18 -0
- rand_engine-0.3.3/rand_engine/core/numeric_core.py +0 -33
- rand_engine-0.3.3/rand_engine/main/data_generator.py +0 -72
- {rand_engine-0.3.3 → rand_engine-0.3.7}/rand_engine/__init__.py +0 -0
- {rand_engine-0.3.3 → rand_engine-0.3.7}/rand_engine/core/general_utils.py +0 -0
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: rand-engine
|
|
3
|
-
Version: 0.3.3
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
|
|
5
|
-
Home-page: https://github.com/marcoaureliomenezes/rand_engine
|
|
6
5
|
Author: marcoaureliomenezes
|
|
7
6
|
Author-email: marcoaurelioreislima@gmail.com
|
|
8
7
|
Requires-Python: >=3.10,<4.0
|
|
@@ -11,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
11
10
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: fastavro (>=1.10.0,<2.0.0)
|
|
14
|
+
Requires-Dist: fastparquet (>=2024.11.0,<2025.0.0)
|
|
14
15
|
Requires-Dist: numpy (>=2.1.1,<3.0.0)
|
|
15
16
|
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
16
17
|
Project-URL: Repository, https://github.com/marcoaureliomenezes/rand_engine
|
|
@@ -42,4 +43,6 @@ CoreDistinct().randint(0, 100, 10)
|
|
|
42
43
|
|
|
43
44
|
```
|
|
44
45
|
|
|
46
|
+
git tag 0.0.1
|
|
45
47
|
|
|
48
|
+
git push origin --tags
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "rand-engine"
|
|
3
|
-
version = "0.3.3"
|
|
3
|
+
version = "0.3.7"
|
|
4
4
|
description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
|
|
5
5
|
authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
|
|
6
6
|
repository = "https://github.com/marcoaureliomenezes/rand_engine"
|
|
@@ -10,7 +10,8 @@ readme = "README.md"
|
|
|
10
10
|
python = "^3.10"
|
|
11
11
|
numpy = "^2.1.1"
|
|
12
12
|
pandas = "^2.2.2"
|
|
13
|
-
|
|
13
|
+
fastavro = "^1.10.0"
|
|
14
|
+
fastparquet = "^2024.11.0"
|
|
14
15
|
|
|
15
16
|
[tool.poetry.group.test.dependencies]
|
|
16
17
|
pytest = "^8.3.3"
|
|
@@ -6,7 +6,13 @@ from datetime import datetime as dt, timedelta
|
|
|
6
6
|
class DatetimeCore:
|
|
7
7
|
|
|
8
8
|
@classmethod
|
|
9
|
-
def gen_unix_timestamps(self, size: int, start: str, end: str, format: str):
|
|
9
|
+
def gen_unix_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
|
|
10
|
+
"""
|
|
11
|
+
This method generates an array of random unix timestamps.
|
|
12
|
+
:param size: int: Number of elements to be generated.
|
|
13
|
+
:param start: str: Start date of the generated timestamps.
|
|
14
|
+
:param end: str: End date of the generated timestamps.
|
|
15
|
+
:param format: str: Format of the input dates."""
|
|
10
16
|
dt_start, dt_end = dt.strptime(start, format), dt.strptime(end, format)
|
|
11
17
|
if dt_start < dt(1970, 1, 1): dt_start = dt(1970, 1, 1)
|
|
12
18
|
timestamp_start, timestamp_end = dt_start.timestamp(), dt_end.timestamp()
|
|
@@ -15,7 +21,15 @@ class DatetimeCore:
|
|
|
15
21
|
|
|
16
22
|
|
|
17
23
|
@classmethod
|
|
18
|
-
def gen_timestamps(self, size: int, start: str, end: str, format: str):
|
|
24
|
+
def gen_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
|
|
25
|
+
"""
|
|
26
|
+
This method generates an array of random timestamps.
|
|
27
|
+
:param size: int: Number of elements to be generated.
|
|
28
|
+
:param start: str: Start date of the generated timestamps.
|
|
29
|
+
:param end: str: End date of the generated timestamps.
|
|
30
|
+
:param format: str: Format of the input dates.
|
|
31
|
+
:return: np.ndarray: Array of random timestamps."""
|
|
32
|
+
|
|
19
33
|
date_array = self.gen_unix_timestamps(size, start, end, format).astype('datetime64[s]')
|
|
20
34
|
return date_array
|
|
21
35
|
|
|
@@ -25,16 +39,7 @@ class DatetimeCore:
|
|
|
25
39
|
timestamp_array = self.gen_unix_timestamps(size, start, end, format_in)
|
|
26
40
|
return [dt.fromtimestamp(i).strftime(format_out) for i in timestamp_array]
|
|
27
41
|
|
|
28
|
-
if __name__ == '__main__':
|
|
29
|
-
|
|
30
|
-
pass
|
|
31
42
|
|
|
32
43
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# def handle_format(format):
|
|
36
|
-
# return format[randint(0, len(format))] if format == list else \
|
|
37
|
-
# format if format == str else "%d-%m-%Y"
|
|
38
|
-
# def get_interval(start, end, date_format):
|
|
39
|
-
# return datetime.timestamp(datetime.strptime(start, date_format)), \
|
|
40
|
-
# datetime.timestamp(datetime.strptime(end, date_format))
|
|
44
|
+
if __name__ == '__main__':
|
|
45
|
+
pass
|
|
@@ -7,15 +7,36 @@ class DistinctCore:
|
|
|
7
7
|
|
|
8
8
|
@classmethod
|
|
9
9
|
def gen_distincts_typed(self, size: int, distinct: List[Any]) -> np.ndarray:
|
|
10
|
+
"""
|
|
11
|
+
This method generates an array of random distinct values.
|
|
12
|
+
:param size: int: Number of elements to be generated.
|
|
13
|
+
:param distinct: List[Any]: List of distinct values to be generated.
|
|
14
|
+
:return: np.ndarray: Array of random distinct values.
|
|
15
|
+
"""
|
|
10
16
|
assert len(list(set([type(x) for x in distinct]))) == 1
|
|
11
17
|
return np.random.choice(distinct, size)
|
|
12
18
|
|
|
19
|
+
|
|
13
20
|
@classmethod
|
|
14
|
-
def gen_distincts_untyped(self, size: int, distinct: List[Any]) ->
|
|
21
|
+
def gen_distincts_untyped(self, size: int, distinct: List[Any]) -> List[Any]:
|
|
22
|
+
"""
|
|
23
|
+
This method generates an array of random distinct values.
|
|
24
|
+
:param size: int: Number of elements to be generated.
|
|
25
|
+
:param distinct: List[Any]: List of distinct values to be generated.
|
|
26
|
+
:return: Iterator: Iterator of random distinct values.
|
|
27
|
+
"""
|
|
15
28
|
return list(map(lambda x: distinct[x], np.random.randint(0, len(distinct), size)))
|
|
16
29
|
|
|
17
30
|
@classmethod
|
|
18
31
|
def gen_complex_distincts(self, size: int, pattern="x.x.x-x", replacement="x", templates=[]):
|
|
32
|
+
"""
|
|
33
|
+
This method generates an array of random distinct values.
|
|
34
|
+
:param size: int: Number of elements to be generated.
|
|
35
|
+
:param pattern: str: Pattern to be replaced.
|
|
36
|
+
:param replacement: str: Replacement of the pattern.
|
|
37
|
+
:param templates: List[Dict]: List of dictionaries containing the method and parameters to be used in the replacement.
|
|
38
|
+
:return: np.ndarray: Array of random distinct values.
|
|
39
|
+
"""
|
|
19
40
|
assert pattern.count(replacement) == len(templates)
|
|
20
41
|
list_of_lists, counter = [], 0
|
|
21
42
|
for replacer_cursor in range(len(pattern)):
|
|
@@ -27,6 +48,7 @@ class DistinctCore:
|
|
|
27
48
|
return reduce(lambda a, b: a.astype('str') + b.astype('str'), list_of_lists)
|
|
28
49
|
|
|
29
50
|
|
|
51
|
+
|
|
30
52
|
if __name__ == '__main__':
|
|
31
53
|
pass
|
|
32
54
|
|
|
@@ -6,10 +6,19 @@ class DistinctUtils:
|
|
|
6
6
|
|
|
7
7
|
@classmethod
|
|
8
8
|
def handle_distincts_lvl_1(self, distinct_prop, precision):
|
|
9
|
+
"""
|
|
10
|
+
This method generates a list of distinct values based on a dictionary of distinct values and their respective frequencies.
|
|
11
|
+
:param distinct_prop: dict: Dictionary containing the distinct values and their respective frequencies.
|
|
12
|
+
:param precision: int: Precision of the distinct values.
|
|
13
|
+
:return: List: List of distinct values.
|
|
14
|
+
"""
|
|
9
15
|
return [ key for key, value in distinct_prop.items() for i in range(value * precision )]
|
|
10
16
|
|
|
11
17
|
@classmethod
|
|
12
18
|
def handle_distincts_lvl_2(self, distincts, sep=";"):
|
|
19
|
+
"""
|
|
20
|
+
This method generates a list of distinct values based on a dictionary of distinct values and their respective frequencies.
|
|
21
|
+
:param distincts: dict: Dictionary containing the distinct values and their respective frequencies."""
|
|
13
22
|
data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
|
|
14
23
|
return data_flatted
|
|
15
24
|
|
|
@@ -39,16 +48,12 @@ if __name__ == '__main__':
|
|
|
39
48
|
distinct_2 = {"OPC": [{"C_OPC": ["PF", "PJ"]}, {"V_OPC": ["NA"]}], "SWP": [{"C_SWP": ["AP"]}, {"V_SWP": ["MA", "ME"]}]}
|
|
40
49
|
#print(DistinctUtils.handle_distincts_lvl_5(distinct_2)[]
|
|
41
50
|
|
|
42
|
-
|
|
43
51
|
def rec(structure):
|
|
44
52
|
if isinstance(structure, list):
|
|
45
53
|
return [rec(i) for i in structure]
|
|
46
54
|
if isinstance(structure, dict):
|
|
47
55
|
return [[[k], rec(v)] for k, v in structure.items()]
|
|
48
56
|
return structure
|
|
49
|
-
|
|
50
57
|
import numpy as np
|
|
51
|
-
|
|
52
58
|
result = rec(distinct_2)
|
|
53
|
-
|
|
54
59
|
combinations = np.array(list(itertools.product(*result)))
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NumericCore:
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@classmethod
|
|
8
|
+
def gen_ints(self, size: int, min: int, max: int) -> np.ndarray:
|
|
9
|
+
"""
|
|
10
|
+
This method generates an array of random integers.
|
|
11
|
+
:param size: int: Number of elements to be generated.
|
|
12
|
+
:param min: int: Minimum value of the generated numbers.
|
|
13
|
+
:param max: int: Maximum value of the generated numbers.
|
|
14
|
+
:return: np.ndarray: Array of random integers.
|
|
15
|
+
"""
|
|
16
|
+
return np.random.randint(min, max + 1, size)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@classmethod
|
|
20
|
+
def gen_ints_zfilled(self, size: int, length: int) -> np.ndarray:
|
|
21
|
+
"""
|
|
22
|
+
This method generates an array of random integers with a fixed length.
|
|
23
|
+
:param size: int: Number of elements to be generated.
|
|
24
|
+
:param length: int: Length of the generated numbers.
|
|
25
|
+
:return: np.ndarray: Array of random integers.
|
|
26
|
+
"""
|
|
27
|
+
str_arr = np.random.randint(0, 10**length, size).astype('str')
|
|
28
|
+
return np.char.zfill(str_arr, length)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@classmethod
|
|
32
|
+
def gen_floats(self, size: int, min: int, max: int, round: int = 2):
|
|
33
|
+
"""
|
|
34
|
+
This method generates an array of random floats.
|
|
35
|
+
:param size: int: Number of elements to be generated.
|
|
36
|
+
:param min: int: Minimum value of the generated numbers.
|
|
37
|
+
:param max: int: Maximum value of the generated numbers.
|
|
38
|
+
:param round: int: Number of decimal places to round the generated numbers. Default is 2.
|
|
39
|
+
:return: np.ndarray: Array of random floats.
|
|
40
|
+
"""
|
|
41
|
+
sig_part = np.random.randint(min, max, size)
|
|
42
|
+
decimal = np.random.randint(0, 10 ** round, size)
|
|
43
|
+
return sig_part + (decimal / 10 ** round) if round > 0 else sig_part
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@classmethod
|
|
47
|
+
def gen_floats_normal(self, size: int, mean: int, std: int, round: int = 2):
|
|
48
|
+
"""
|
|
49
|
+
This method generates an array of random floats with a normal distribution.
|
|
50
|
+
:param size: int: Number of elements to be generated.
|
|
51
|
+
:param mean: int: Mean of the distribution.
|
|
52
|
+
:param std: int: Standard deviation of the distribution.
|
|
53
|
+
:param round: int: Number of decimal places to round the generated numbers. Default is 2.
|
|
54
|
+
:return: np.ndarray: Array of random floats.
|
|
55
|
+
"""
|
|
56
|
+
return np.round(np.random.normal(mean, std, size), round)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if __name__ == '__main__':
|
|
62
|
+
pass
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import List, Dict, Optional, Generator, Callable, Any
|
|
5
|
+
from rand_engine.main.file_writer import FileWriter
|
|
6
|
+
from rand_engine.main.stream_handle import StreamHandle
|
|
7
|
+
|
|
8
|
+
class DataGenerator:
|
|
9
|
+
|
|
10
|
+
def __init__(self, random_spec):
|
|
11
|
+
self.random_spec = random_spec
|
|
12
|
+
self.actual_dataframe = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def handle_splitable(self, metadata, df):
|
|
16
|
+
for key, value in metadata.items():
|
|
17
|
+
if value.get("splitable"):
|
|
18
|
+
sep = value.get("sep", ";")
|
|
19
|
+
cols = value.get("cols")
|
|
20
|
+
df[cols] = df[key].str.split(sep, expand=True)
|
|
21
|
+
df.drop(columns=[key], inplace=True)
|
|
22
|
+
return df
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def generate_pandas_df(self, size: int, transformer: Optional[Callable]=None) -> pd.DataFrame:
|
|
26
|
+
"""
|
|
27
|
+
This method generates a pandas DataFrame based on random data specified in the metadata parameter.
|
|
28
|
+
:param size: int: Number of rows to be generated.
|
|
29
|
+
:param transformer: Optional[Callable]: Function to transform the generated data.
|
|
30
|
+
:return: pd.DataFrame: DataFrame with the generated data.
|
|
31
|
+
"""
|
|
32
|
+
assert type(self.random_spec) is dict, "You need to pass a random_spec parameter to generate the random data."
|
|
33
|
+
def first_level():
|
|
34
|
+
dict_data = {key: value["method"](size, **value["parms"]) for key, value in self.random_spec.items()}
|
|
35
|
+
df_pandas = pd.DataFrame(dict_data)
|
|
36
|
+
df_pandas = self.handle_splitable(self.random_spec, df_pandas)
|
|
37
|
+
if transformer: df_pandas = transformer(df_pandas)
|
|
38
|
+
return df_pandas
|
|
39
|
+
self.actual_dataframe = first_level
|
|
40
|
+
return self
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def generate_spark_df(self, spark, size: int, transformer: Optional[Callable]=None) -> Any:
|
|
44
|
+
"""
|
|
45
|
+
This method generates a Spark DataFrame based on random data specified in the random_spec parameter.
|
|
46
|
+
:param spark: SparkSession: SparkSession object.
|
|
47
|
+
:param size: int: Number of rows to be generated.
|
|
48
|
+
:param transformer: Optional[Callable]: Function to transform the generated data."""
|
|
49
|
+
def second_level():
|
|
50
|
+
self.generate_pandas_df(size=size, transformer=transformer)
|
|
51
|
+
df_spark = spark.createDataFrame(self.actual_dataframe())
|
|
52
|
+
return df_spark
|
|
53
|
+
self.actual_dataframe = second_level
|
|
54
|
+
return self
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_df(self):
|
|
58
|
+
assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
|
|
59
|
+
return self.actual_dataframe()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def stream_dict(self, min_throughput: int=1, max_throughput: int = 10) -> Generator:
|
|
63
|
+
"""
|
|
64
|
+
This method creates a generator of records to be used in a streaming context.
|
|
65
|
+
:param min_throughput: int: Minimum throughput to be generated.
|
|
66
|
+
:param max_throughput: int: Maximum throughput to be generated.
|
|
67
|
+
:return: Generator: Generator of records.
|
|
68
|
+
"""
|
|
69
|
+
assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
|
|
70
|
+
while True:
|
|
71
|
+
df_data_microbatch = self.actual_dataframe()
|
|
72
|
+
df_data_parsed = StreamHandle.convert_dt_to_str(df_data_microbatch)
|
|
73
|
+
list_of_records = df_data_parsed.to_dict('records')
|
|
74
|
+
for record in list_of_records:
|
|
75
|
+
record["timestamp_created"] = round(time.time(), 3)
|
|
76
|
+
yield record
|
|
77
|
+
StreamHandle.sleep_to_contro_throughput(min_throughput, max_throughput)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def write(self):
|
|
81
|
+
microbatch_def = lambda: self.actual_dataframe
|
|
82
|
+
return FileWriter(microbatch_def)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
if __name__ == '__main__':
|
|
87
|
+
|
|
88
|
+
pass
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Callable
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FileWriter:
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def __init__(self, microbatch_def):
|
|
9
|
+
self.microbatch_def = microbatch_def
|
|
10
|
+
self.write_format = "csv"
|
|
11
|
+
self.write_mode = "overwrite"
|
|
12
|
+
self.write_options = {}
|
|
13
|
+
self.dict_format = {
|
|
14
|
+
"csv": self.to_csv,
|
|
15
|
+
"parquet": self.to_parquet
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
def __handle_fs(self, path, flag=True) -> None:
|
|
19
|
+
"""
|
|
20
|
+
This method handles the file system operations.
|
|
21
|
+
:param path: str: Path of the file to be written.
|
|
22
|
+
"""
|
|
23
|
+
if self.write_mode == "overwrite":
|
|
24
|
+
try:
|
|
25
|
+
if os.path.exists(path):
|
|
26
|
+
for file in os.listdir(path):
|
|
27
|
+
os.remove(os.path.join(path, file))
|
|
28
|
+
except Exception as e: pass
|
|
29
|
+
if flag == True: to_create = os.path.dirname(path)
|
|
30
|
+
else: to_create = path
|
|
31
|
+
os.makedirs(to_create, exist_ok=True)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def __get_dir_size(self, folder_path: str) -> int:
|
|
35
|
+
"""
|
|
36
|
+
This method calculates the size in bytes of a directory.
|
|
37
|
+
:param folder_path: str: Path of the directory.
|
|
38
|
+
:return: int: Size of the directory in bytes.
|
|
39
|
+
"""
|
|
40
|
+
total_size = 0
|
|
41
|
+
for dirpath, dirnames, filenames in os.walk(folder_path):
|
|
42
|
+
for filename in filenames:
|
|
43
|
+
file_path = os.path.join(dirpath, filename)
|
|
44
|
+
if not os.path.islink(file_path):
|
|
45
|
+
total_size += os.path.getsize(file_path)
|
|
46
|
+
return total_size
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def mode(self, write_mode: str) -> Callable:
|
|
50
|
+
"""
|
|
51
|
+
This method sets the write mode of the file.
|
|
52
|
+
:param write_mode: str: Write mode of the file. Default is overwrite.
|
|
53
|
+
:return: FileWriter: Instance of the fileWriter class for method chaining.
|
|
54
|
+
"""
|
|
55
|
+
self.write_mode = write_mode
|
|
56
|
+
return self
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def format(self, format):
|
|
60
|
+
"""
|
|
61
|
+
This method sets the write format of the file.
|
|
62
|
+
:param format: str: Write format of the file. Default is csv. Supported formats are csv and parquet.
|
|
63
|
+
:return: FileWriter: Instance of the fileWriter class for method chaining.
|
|
64
|
+
"""
|
|
65
|
+
self.write_format = format
|
|
66
|
+
return self
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def option(self, key, value):
|
|
70
|
+
"""
|
|
71
|
+
This method sets the write options of the file.
|
|
72
|
+
:param key: str: Key of the write option.
|
|
73
|
+
:param value: Any: Value of the write option.
|
|
74
|
+
:return: FileWriter: Instance of the fileWriter class for method chaining
|
|
75
|
+
"""
|
|
76
|
+
self.write_options[key] = value
|
|
77
|
+
return self
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def to_csv(self, dataframe, full_path) -> Callable:
|
|
81
|
+
"""
|
|
82
|
+
This method writes a pandas DataFrame to a csv file.
|
|
83
|
+
:param dataframe: pd.DataFrame: DataFrame to be written.
|
|
84
|
+
:param full_path: str: Full path of the file to be written.
|
|
85
|
+
:return: Callable: Function to write the Pandas DataFrame to a csv file.
|
|
86
|
+
"""
|
|
87
|
+
if self.write_options.get("compression"):
|
|
88
|
+
full_path= full_path.replace("csv", f"csv.{self.write_options['compression']}")
|
|
89
|
+
writer = lambda: dataframe().to_csv(full_path, index=False, **self.write_options)
|
|
90
|
+
return writer
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def to_parquet(self, dataframe, full_path):
|
|
94
|
+
"""
|
|
95
|
+
This method writes a pandas DataFrame to a parquet file.
|
|
96
|
+
:param dataframe: pd.DataFrame: DataFrame to be written.
|
|
97
|
+
:param full_path: str: Full path of the file to be written.
|
|
98
|
+
:return: Callable: Function to write the Pandas DataFrame to a parquet file.
|
|
99
|
+
"""
|
|
100
|
+
if self.write_options.get("compression"):
|
|
101
|
+
full_path= full_path.replace(".parquet", f".{self.write_options['compression']}.parquet")
|
|
102
|
+
writer = lambda: dataframe().to_parquet(full_path, index=False, engine='pyarrow', **self.write_options)
|
|
103
|
+
return writer
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load(self, path: str) -> None:
|
|
107
|
+
"""
|
|
108
|
+
This method writes a pandas DataFrame to a file.
|
|
109
|
+
:param path: str: Path of the file to be written.
|
|
110
|
+
"""
|
|
111
|
+
self.__handle_fs(path)
|
|
112
|
+
dataframe = self.microbatch_def()
|
|
113
|
+
self.dict_format[self.write_format](dataframe, path)()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def incr_load(self, path, size_in_mb=4):
|
|
117
|
+
"""
|
|
118
|
+
This method writes a pandas DataFrame to a file in incremental mode.
|
|
119
|
+
:param path: str: Path of the file to be written.
|
|
120
|
+
:param size_in_mb: int: Size in MB of the file to be written.
|
|
121
|
+
"""
|
|
122
|
+
self.__handle_fs(path, flag=False)
|
|
123
|
+
counter = 0
|
|
124
|
+
while True:
|
|
125
|
+
full_path = f"{path}/part-{str(counter).zfill(6)}.{self.write_format}"
|
|
126
|
+
dataframe = self.microbatch_def()
|
|
127
|
+
self.dict_format[self.write_format](dataframe, full_path)()
|
|
128
|
+
size_bytes = self.__get_dir_size(path)
|
|
129
|
+
if counter % 100 == 0: print(f"Size: {size_bytes/2**20:.2f} MB")
|
|
130
|
+
if self.__get_dir_size(path) >= size_in_mb*2**20: break
|
|
131
|
+
counter += 1
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import time
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
class StreamHandle:
|
|
6
|
+
|
|
7
|
+
@staticmethod
|
|
8
|
+
def convert_dt_to_str(dataframe: pd.DataFrame) -> pd.DataFrame:
|
|
9
|
+
df_result = dataframe.copy()
|
|
10
|
+
for column in df_result.columns:
|
|
11
|
+
if 'datetime64' in str(df_result[column].dtype):
|
|
12
|
+
df_result[column] = df_result[column].astype(str)
|
|
13
|
+
return df_result
|
|
14
|
+
|
|
15
|
+
@staticmethod
|
|
16
|
+
def sleep_to_contro_throughput(min_throughput: int, max_throughput: int):
|
|
17
|
+
sleep_time = 1 / random.uniform(min_throughput, max_throughput)
|
|
18
|
+
time.sleep(sleep_time)
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class NumericCore:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@classmethod
|
|
8
|
-
def gen_ints(self, size: int, min: int, max: int):
|
|
9
|
-
return np.random.randint(min, max + 1, size)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@classmethod
|
|
13
|
-
def gen_ints_zfilled(self, size: int, length: int) -> np.ndarray:
|
|
14
|
-
str_arr = np.random.randint(0, 10**length, size).astype('str')
|
|
15
|
-
return np.char.zfill(str_arr, length)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@classmethod
|
|
19
|
-
def gen_floats(self, size: int, min: int, max: int, round: int = 2):
|
|
20
|
-
sig_part = np.random.randint(min, max, size)
|
|
21
|
-
decimal = np.random.randint(0, 10 ** round, size)
|
|
22
|
-
return sig_part + (decimal / 10 ** round) if round > 0 else sig_part
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@classmethod
|
|
26
|
-
def gen_floats_normal(self, size: int, mean: int, std: int, round: int = 2):
|
|
27
|
-
return np.round(np.random.normal(mean, std, size), round)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
if __name__ == '__main__':
|
|
33
|
-
pass
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import time
|
|
3
|
-
import random
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from typing import List, Dict, Optional, Generator, Callable, Any
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class DataGenerator:
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def handle_splitable(self, metadata, df):
|
|
12
|
-
for key, value in metadata.items():
|
|
13
|
-
if value.get("splitable"):
|
|
14
|
-
sep = value.get("sep", ";")
|
|
15
|
-
cols = value.get("cols")
|
|
16
|
-
df[cols] = df[key].str.split(sep, expand=True)
|
|
17
|
-
df.drop(columns=[key], inplace=True)
|
|
18
|
-
return df
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def __convert_dt_to_str(self, dataframe: pd.DataFrame) -> pd.DataFrame:
|
|
22
|
-
df_result = dataframe.copy()
|
|
23
|
-
for column in df_result.columns:
|
|
24
|
-
if 'datetime64' in str(df_result[column].dtype):
|
|
25
|
-
df_result[column] = df_result[column].astype(str)
|
|
26
|
-
return df_result
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def __sleep_to_contro_throughput(self, min_throughput: int, max_throughput: int):
|
|
30
|
-
sleep_time = 1 / random.uniform(min_throughput, max_throughput)
|
|
31
|
-
time.sleep(sleep_time)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def create_pandas_df(self, size: int, metadata: dict, transformer: Optional[Callable]=None) -> pd.DataFrame:
|
|
35
|
-
dict_data = {key: value["method"](size, **value["parms"]) for key, value in metadata.items()}
|
|
36
|
-
df_data = pd.DataFrame(dict_data)
|
|
37
|
-
df_data_final = self.handle_splitable(metadata, df_data)
|
|
38
|
-
if transformer: df_data_final = transformer(df_data_final)
|
|
39
|
-
return df_data_final
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def create_spark_df(self, spark, size: int, metadata: dict, transformer: Optional[Callable]=None) -> Any:
|
|
43
|
-
df_data = self.create_pandas_df(size=size, metadata=metadata, transformer=transformer)
|
|
44
|
-
df_final = spark.createDataFrame(df_data)
|
|
45
|
-
return df_final
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def create_streaming_records(self, microbatch_size:int, metadata: dict, transformer: Optional[Callable]=None, min_throughput: int=1, max_throughput: int = 10) -> Generator:
|
|
49
|
-
while True:
|
|
50
|
-
df_data_microbatch = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
|
|
51
|
-
df_data_parsed = self.__convert_dt_to_str(df_data_microbatch)
|
|
52
|
-
list_of_records = df_data_parsed.to_dict('records')
|
|
53
|
-
for record in list_of_records:
|
|
54
|
-
record["timestamp_created"] = round(time.time(), 3)
|
|
55
|
-
yield record
|
|
56
|
-
self.__sleep_to_contro_throughput(min_throughput, max_throughput)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def create_csv_file(self, microbatch_size: int, size_in_mb: int, metadata: dict, path: str, transformer: Optional[Callable]=None) -> None:
|
|
60
|
-
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
61
|
-
while True:
|
|
62
|
-
df = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
|
|
63
|
-
df.to_csv(path, mode='a', header=False, index=False)
|
|
64
|
-
if os.path.getsize(path) > size_in_mb * 1024 * 1024: break
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
if __name__ == '__main__':
|
|
72
|
-
pass
|
|
File without changes
|
|
File without changes
|