rand-engine 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: rand-engine
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
5
5
  Home-page: https://github.com/marcoaureliomenezes/rand_engine
6
6
  Author: marcoaureliomenezes
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "rand-engine"
3
- version = "0.2.0"
3
+ version = "0.3.1"
4
4
  description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
5
5
  authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
6
6
  repository = "https://github.com/marcoaureliomenezes/rand_engine"
@@ -12,7 +12,6 @@ numpy = "^2.1.1"
12
12
  pandas = "^2.2.2"
13
13
 
14
14
 
15
-
16
15
  [tool.poetry.group.test.dependencies]
17
16
  pytest = "^8.3.3"
18
17
  faker = "^28.4.1"
@@ -0,0 +1,54 @@
1
+
2
+ import itertools
3
+
4
+
5
+ class DistinctUtils:
6
+
7
+ @classmethod
8
+ def handle_distincts_lvl_1(self, distinct_prop, precision):
9
+ return [ key for key, value in distinct_prop.items() for i in range(value * precision )]
10
+
11
+ @classmethod
12
+ def handle_distincts_lvl_2(self, distincts, sep=";"):
13
+ data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
14
+ return data_flatted
15
+
16
+
17
+ @classmethod
18
+ def handle_distincts_lvl_3(self, distincts, sep=";"):
19
+ parm_paired_distincts = {k: list(map(lambda x: f"{x[0]}@!{x[1]}", v)) for k, v in distincts.items()}
20
+ data_flatted = self.handle_distincts_lvl_2(parm_paired_distincts, sep)
21
+ result = []
22
+ for i in data_flatted:
23
+ value, size = i.split("@!")
24
+ result.extend([value for _ in range(int(size))])
25
+ return result
26
+
27
+ @classmethod
28
+ def handle_distincts_lvl_4(self, distincts, sep=";"):
29
+ combinations = [list(itertools.product([k], *v)) for k, v in distincts.items()]
30
+ result = [sep.join(i) for i in list(itertools.chain(*combinations))]
31
+ return result
32
+
33
+
34
+
35
+ if __name__ == '__main__':
36
+ distincts = {"OPC": ["C_OPC","V_OPC"], "SWP": ["C_SWP", "V_SWP"]}
37
+
38
+ distinct_1 = {"OPC": [["C_OPC","V_OPC"], ["PF", "PJ"], ["IN"]], "SWP": [["C_SWP", "V_SWP"], ["AF", "ME"], ["NULL"]]}
39
+ distinct_2 = {"OPC": [{"C_OPC": ["PF", "PJ"]}, {"V_OPC": ["NA"]}], "SWP": [{"C_SWP": ["AP"]}, {"V_SWP": ["MA", "ME"]}]}
40
+ #print(DistinctUtils.handle_distincts_lvl_5(distinct_2)[]
41
+
42
+
43
+ def rec(structure):
44
+ if isinstance(structure, list):
45
+ return [rec(i) for i in structure]
46
+ if isinstance(structure, dict):
47
+ return [[[k], rec(v)] for k, v in structure.items()]
48
+ return structure
49
+
50
+ import numpy as np
51
+
52
+ result = rec(distinct_2)
53
+
54
+ combinations = np.array(list(itertools.product(*result)))
@@ -0,0 +1,73 @@
1
+ import pandas as pd
2
+ from typing import List, Dict, Optional, Generator, Callable, Any
3
+ from pyspark.sql import SparkSession, DataFrame
4
+ import os
5
+ import time
6
+ import random
7
+
8
+
9
+ class DataGenerator:
10
+
11
+
12
+ def handle_splitable(self, metadata, df):
13
+ for key, value in metadata.items():
14
+ if value.get("splitable"):
15
+ sep = value.get("sep", ";")
16
+ cols = value.get("cols")
17
+ df[cols] = df[key].str.split(sep, expand=True)
18
+ df.drop(columns=[key], inplace=True)
19
+ return df
20
+
21
+
22
+ def __convert_dt_to_str(self, dataframe: pd.DataFrame) -> pd.DataFrame:
23
+ df_result = dataframe.copy()
24
+ for column in df_result.columns:
25
+ if 'datetime64' in str(df_result[column].dtype):
26
+ df_result[column] = df_result[column].astype(str)
27
+ return df_result
28
+
29
+
30
+ def __sleep_to_contro_throughput(self, min_throughput: int, max_throughput: int):
31
+ sleep_time = 1 / random.uniform(min_throughput, max_throughput)
32
+ time.sleep(sleep_time)
33
+
34
+
35
+ def create_pandas_df(self, size: int, metadata: dict, transformer: Optional[Callable]=None) -> pd.DataFrame:
36
+ dict_data = {key: value["method"](size, **value["parms"]) for key, value in metadata.items()}
37
+ df_data = pd.DataFrame(dict_data)
38
+ df_data_final = self.handle_splitable(metadata, df_data)
39
+ if transformer: df_data_final = transformer(df_data_final)
40
+ return df_data_final
41
+
42
+
43
+ def create_spark_df(self, spark, size: int, metadata: dict, transformer: Optional[Callable]=None) -> Any:
44
+ df_data = self.create_pandas_df(size=size, metadata=metadata, transformer=transformer)
45
+ df_final = spark.createDataFrame(df_data)
46
+ return df_final
47
+
48
+
49
+ def create_streaming_records(self, microbatch_size:int, metadata: dict, transformer: Optional[Callable]=None, min_throughput: int=1, max_throughput: int = 10) -> Generator[Dict]:
50
+ while True:
51
+ df_data_microbatch = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
52
+ df_data_parsed = self.__convert_dt_to_str(df_data_microbatch)
53
+ list_of_records = df_data_parsed.to_dict('records')
54
+ for record in list_of_records:
55
+ record["timestamp_created"] = round(time.time(), 3)
56
+ yield record
57
+ self.__sleep_to_contro_throughput(min_throughput, max_throughput)
58
+
59
+
60
+ def create_csv_file(self, microbatch_size: int, size_in_mb: int, metadata: dict, path: str, transformer: Optional[Callable]=None) -> None:
61
+ os.makedirs(os.path.dirname(path), exist_ok=True)
62
+ while True:
63
+ df = self.create_pandas_df(size=microbatch_size, metadata=metadata, transformer=transformer)
64
+ df.to_csv(path, mode='a', header=False, index=False)
65
+ if os.path.getsize(path) > size_in_mb * 1024 * 1024: break
66
+
67
+
68
+
69
+
70
+
71
+
72
+ if __name__ == '__main__':
73
+ pass
@@ -1,23 +0,0 @@
1
-
2
-
3
- class DistinctUtils:
4
-
5
- @classmethod
6
- def handle_distincts_lvl_1(self, distinct_prop, precision):
7
- return [ key for key, value in distinct_prop.items() for i in range(value * precision )]
8
-
9
- @classmethod
10
- def handle_distincts_lvl_2(self, distincts, sep=";"):
11
- data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
12
- return data_flatted
13
-
14
- @classmethod
15
- def handle_distincts_lvl_3(self, distincts, sep=";"):
16
- parm_paired_distincts = {k: list(map(lambda x: f"{x[0]}@!{x[1]}", v)) for k, v in distincts.items()}
17
- data_flatted = self.handle_distincts_lvl_2(parm_paired_distincts, sep)
18
- result = []
19
- for i in data_flatted:
20
- value, size = i.split("@!")
21
- result.extend([value for _ in range(int(size))])
22
- return result
23
-
@@ -1,64 +0,0 @@
1
- import pandas as pd
2
- import json
3
- import os
4
-
5
-
6
- class BulkRandEngine:
7
-
8
-
9
- def handle_splitable(self, metadata, df):
10
- for key, value in metadata.items():
11
- if value.get("splitable"):
12
- sep = value.get("sep", ";")
13
- cols = value.get("cols")
14
- df[cols] = df[key].str.split(sep, expand=True)
15
- df.drop(columns=[key], inplace=True)
16
- return df
17
-
18
-
19
-
20
- def create_pandas_df(self, size, metadata):
21
- df_pandas = pd.DataFrame({key: value["method"](size, **value["parms"]) for key, value in metadata.items()})
22
- df_pandas = self.handle_splitable(metadata, df_pandas)
23
- return df_pandas
24
-
25
- @classmethod
26
- def convert_datetimes_to_string(self, pandas_df):
27
- for column in pandas_df.columns:
28
- if pandas_df[column].dtype == 'datetime64[ns]':
29
- pandas_df[column] = pandas_df[column].astype(str)
30
- return pandas_df
31
-
32
-
33
- @classmethod
34
- def create_streaming_df(self, pandas_df):
35
- pandas_df = self.convert_datetimes_to_string(pandas_df)
36
- list_of_dicts = pandas_df.to_dict('records')
37
- for record in list_of_dicts:
38
- yield json.dumps(record)
39
-
40
- @classmethod
41
- def create_streaming_series(self, pandas_series):
42
- for record in pandas_series:
43
- yield record
44
-
45
- @classmethod
46
- def create_file(path, word, limit_size):
47
- os.makedirs(os.path.dirname(path), exist_ok=True)
48
- with open(path, 'w') as file:
49
- while file.tell() < limit_size:
50
- file.write(word + '\n')
51
- return True
52
-
53
-
54
- def microbatch_file_with_streaming(self, path, metadata, df_transformer, microbatch_size, total_size):
55
- os.makedirs(os.path.dirname(path), exist_ok=True)
56
- while True:
57
- df = self.create_pandas_df(size=microbatch_size, metadata=metadata)
58
- df = df_transformer(df)
59
- df.to_csv(path, mode='a', header=False, index=False)
60
- if os.path.getsize(path) > total_size:
61
- break
62
-
63
- if __name__ == '__main__':
64
- pass
File without changes