rand-engine 0.4.4__tar.gz → 0.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rand-engine
3
- Version: 0.4.4
3
+ Version: 0.4.7
4
4
  Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
5
5
  Author: marcoaureliomenezes
6
6
  Author-email: marcoaurelioreislima@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "rand-engine"
3
- version = "0.4.4"
3
+ version = "0.4.7"
4
4
  description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
5
5
  authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
6
6
  repository = "https://github.com/marcoaureliomenezes/rand_engine"
@@ -0,0 +1,95 @@
1
+
2
+ from typing import List, Any
3
+ import numpy as np
4
+ from datetime import datetime as dt
5
+ from functools import reduce
6
+
7
+
8
+
9
class Core:
    """
    Collection of vectorised random-data generators built on NumPy.

    Every generator is a classmethod whose first argument is ``size`` (the number of
    values to produce). All draws come from the global ``np.random`` state.
    """

    @classmethod
    def gen_distincts(cls, size: int, distinct: List[Any]) -> np.ndarray:
        """
        This method samples values (with replacement) from a list whose elements share one type.
        :param size: int: Number of elements to be generated.
        :param distinct: List[Any]: Candidate values; all must have the same Python type.
        :return: np.ndarray: Array with the sampled values.
        :raises AssertionError: If ``distinct`` does not contain exactly one element type.
        """
        assert len({type(x) for x in distinct}) == 1, \
            "All elements of 'distinct' must share a single type."
        return np.random.choice(distinct, size)

    @classmethod
    def gen_distincts_untyped(cls, size: int, distinct: List[Any]) -> List[Any]:
        """
        This method samples values (with replacement) from a list that may mix types.
        :param size: int: Number of elements to be generated.
        :param distinct: List[Any]: Candidate values.
        :return: List[Any]: List with the sampled values, returned as the original objects.
        """
        # Draw positions rather than values so the elements are never coerced by NumPy.
        positions = np.random.randint(0, len(distinct), size)
        return [distinct[position] for position in positions]

    @classmethod
    def gen_complex_distincts(cls, size: int, pattern="x.x.x-x", replacement="x", templates=None):
        """
        This method builds strings by filling the placeholders of a pattern with generated values.
        :param size: int: Number of elements to be generated.
        :param pattern: str: Pattern scanned character by character.
        :param replacement: str: Character marking a placeholder (compared against single characters).
        :param templates: List[Dict]: One dict per placeholder, with keys "method" (callable
            invoked as ``method(size, **parms)``) and "parms" (its keyword arguments).
        :return: np.ndarray: Array with the concatenated pieces.
        :raises AssertionError: If the number of placeholders differs from the number of templates.
        """
        templates = [] if templates is None else templates
        assert pattern.count(replacement) == len(templates), \
            "The number of placeholders in 'pattern' must match the number of templates."
        pieces, pending = [], iter(templates)
        for char in pattern:
            if char == replacement:
                template = next(pending)
                pieces.append(template["method"](size, **template["parms"]))
            else:
                pieces.append(np.full(size, char))
        if not pieces:
            # An empty pattern has nothing to concatenate; reduce() would raise on it.
            return np.full(size, "")
        # np.char.add concatenates element-wise on every NumPy version, whereas the
        # `+` operator on string arrays is only defined from NumPy 2 onwards.
        return reduce(lambda a, b: np.char.add(a.astype('str'), b.astype('str')), pieces)

    @classmethod
    def gen_ints(cls, size: int, min: int, max: int) -> np.ndarray:
        """
        This method generates an array of random integers.
        :param size: int: Number of elements to be generated.
        :param min: int: Minimum value (inclusive).
        :param max: int: Maximum value (inclusive).
        :return: np.ndarray: Array of random integers.
        """
        return np.random.randint(min, max + 1, size)

    @classmethod
    def gen_ints_zfilled(cls, size: int, length: int) -> np.ndarray:
        """
        This method generates random non-negative integers as zero-padded strings.
        :param size: int: Number of elements to be generated.
        :param length: int: Number of characters of every generated string.
        :return: np.ndarray: Array of strings of ``length`` digits.
        """
        # int64 is requested explicitly so 10**length does not depend on the platform's
        # default integer width.
        digits = np.random.randint(0, 10 ** length, size, dtype=np.int64).astype('str')
        return np.char.zfill(digits, length)

    @classmethod
    def gen_floats(cls, size: int, min: int, max: int, round: int = 2) -> np.ndarray:
        """
        This method generates random numbers made of an integer part plus a random decimal part.
        :param size: int: Number of elements to be generated.
        :param min: int: Minimum integer part (inclusive).
        :param max: int: Upper bound of the integer part (exclusive).
        :param round: int: Number of decimal places. With 0 the integer parts are returned as is.
        :return: np.ndarray: Array of random numbers.
        """
        whole = np.random.randint(min, max, size)
        # Drawn even when unused so the sequence of random draws does not depend on `round`.
        fraction = np.random.randint(0, 10 ** round, size)
        if round > 0:
            return whole + (fraction / 10 ** round)
        return whole

    @classmethod
    def gen_floats_normal(cls, size: int, mean: int, std: int, round: int = 2) -> np.ndarray:
        """
        This method generates random floats following a normal distribution.
        :param size: int: Number of elements to be generated.
        :param mean: int: Mean of the distribution.
        :param std: int: Standard deviation of the distribution.
        :param round: int: Number of decimal places kept.
        :return: np.ndarray: Array of random floats.
        """
        return np.round(np.random.normal(mean, std, size), round)

    @classmethod
    def gen_unix_timestamps(cls, size: int, start: str, end: str, format: str) -> np.ndarray:
        """
        This method generates an array of random unix timestamps (whole seconds).
        :param size: int: Number of elements to be generated.
        :param start: str: Start date (inclusive); dates before 1970-01-01 are clamped to it.
        :param end: str: End date (exclusive).
        :param format: str: ``strptime`` format of both input dates.
        :return: np.ndarray: Array of integer timestamps.
        """
        epoch = dt(1970, 1, 1)
        dt_start, dt_end = dt.strptime(start, format), dt.strptime(end, format)
        if dt_start < epoch:
            dt_start = epoch
        # Bounds are truncated to whole seconds and int64 is requested so the result
        # does not depend on the platform's default integer width.
        lower, upper = int(dt_start.timestamp()), int(dt_end.timestamp())
        return np.random.randint(lower, upper, size, dtype=np.int64)

    @classmethod
    def gen_unique_identifiers(cls, size: int, strategy="zint", length=12) -> np.ndarray:
        """
        This method generates an array of identifiers.
        :param size: int: Number of elements to be generated.
        :param strategy: str: "uuid4", "uuid1" or "zint" (zero-padded random integers).
        :param length: int: Number of digits, used only by the "zint" strategy.
        :return: np.ndarray: Array of identifiers as strings.
        :raises ValueError: If ``strategy`` is not one of the supported values.
        """
        import uuid
        if strategy == "uuid4":
            return np.array([str(uuid.uuid4()) for _ in range(size)])
        if strategy == "uuid1":
            return np.array([str(uuid.uuid1()) for _ in range(size)])
        if strategy == "zint":
            return cls.gen_ints_zfilled(size, length)
        raise ValueError("Method not recognized. Use 'uuid4', 'uuid1' or 'zint'.")
78
+ # @classmethod
79
+ # def gen_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
80
+ # """
81
+ # This method generates an array of random timestamps.
82
+ # :param size: int: Number of elements to be generated.
83
+ # :param start: str: Start date of the generated timestamps.
84
+ # :param end: str: End date of the generated timestamps.
85
+ # :param format: str: Format of the input dates.
86
+ # :return: np.ndarray: Array of random timestamps."""
87
+ # date_array = self.gen_unix_timestamps(size, start, end, format).astype('datetime64[s]')
88
+ # return date_array
89
+
90
+
91
+ # @classmethod
92
+ # def gen_datetimes(self, size: int, start: str, end: str, format_in: str, format_out: str):
93
+ # timestamp_array = self.gen_unix_timestamps(size, start, end, format_in)
94
+ # vectorized_func = np.vectorize(lambda x: dt.fromtimestamp(x).strftime(format_out))
95
+ # return vectorized_func(timestamp_array)
@@ -86,7 +86,8 @@ class FileWriter:
86
86
  :return: Callable: Function to write the Pandas DataFrame to a csv file.
87
87
  """
88
88
  if self.write_options.get("compression"):
89
- full_path= full_path.replace("csv", f"csv.{self.write_options['compression']}")
89
+ # Add compression extension to the end of the filename
90
+ full_path = f"{full_path}.{self.write_options['compression']}"
90
91
  writer = lambda: dataframe().to_csv(full_path, index=False, **self.write_options)
91
92
  return writer
92
93
 
@@ -98,7 +99,8 @@ class FileWriter:
98
99
  :return: Callable: Function to write the Pandas DataFrame to a json file.
99
100
  """
100
101
  if self.write_options.get("compression"):
101
- full_path= full_path.replace("json", f"json.{self.write_options['compression']}")
102
+ # Add compression extension to the end of the filename
103
+ full_path = f"{full_path}.{self.write_options['compression']}"
102
104
  def writer():
103
105
  dataframe().to_json(full_path, orient='records', lines=True)
104
106
  return writer
@@ -132,13 +134,13 @@ class FileWriter:
132
134
  :param path: str: Path of the file to be written.
133
135
  :param size_in_mb: int: Size in MB of the file to be written.
134
136
  """
135
- self.__handle_fs(path, flag=False)
137
+ self.__handle_fs(path, flag=True)
136
138
  counter = 0
137
139
  while True:
138
140
  full_path = f"{path}/part-{str(counter).zfill(6)}.{self.write_format}"
139
141
  dataframe = self.microbatch_def()
140
142
  self.dict_format[self.write_format](dataframe, full_path)()
141
143
  size_bytes = self.__get_dir_size(path)
142
- if counter % 100 == 0: print(f"Size: {size_bytes/2**20:.2f} MB")
144
+ #if counter % 100 == 0: print(f"Size: {size_bytes/2**20:.2f} MB")
143
145
  if self.__get_dir_size(path) >= size_in_mb*2**20: break
144
146
  counter += 1
@@ -8,7 +8,7 @@ import pandas as pd
8
8
  from pandas import DataFrame as PandasDF
9
9
 
10
10
  from rand_engine.main.i_random_spec import IRandomSpec
11
- from rand_engine.main.data_generator import DataGenerator
11
+ from rand_engine.main import RandGenerator
12
12
  from rand_engine.main.fs_utils import FSUtils, DBFSUtils
13
13
 
14
14
  from pyspark.sql.functions import coalesce
@@ -34,10 +34,10 @@ class FilesGenerator:
34
34
  def _get_file_path(self) -> str:
35
35
  return f"{self.base_path}/{self.file_name}_{str(uuid4())[:8]}.{self.ext}"
36
36
 
37
-
37
+
38
38
  def generate_sample(self, size: int=100) -> PandasDF:
39
39
  return (
40
- DataGenerator(self.footprint.metadata())
40
+ RandGenerator(self.footprint.metadata())
41
41
  .generate_pandas_df(size, transformer=self.footprint.transformer())
42
42
  .get_df()
43
43
  )
@@ -55,7 +55,7 @@ class FilesGenerator:
55
55
  def write_file(self, size: int=100, const_cols={}):
56
56
  file_path = self._get_file_path()
57
57
  _ = (
58
- DataGenerator(self.footprint.metadata()) \
58
+ RandGenerator(self.footprint.metadata()) \
59
59
  .generate_pandas_df(size, transformer=self.footprint.transformer(**const_cols))
60
60
  .write() \
61
61
  .mode("overwrite") \
@@ -108,7 +108,7 @@ class CDCGenerator(FilesGenerator):
108
108
  metadata = self.footprint.metadata()
109
109
  size = df_pks_to_change.shape[0]
110
110
  transformer = self.footprint.transformer_cdc_update(null_rate=null_rate, **const_cols)
111
- df_data = DataGenerator(metadata).generate_pandas_df(size, transformer).get_df()
111
+ df_data = RandGenerator(metadata).generate_pandas_df(size, transformer).get_df()
112
112
  for coluna in self.pk_cols: df_data[coluna] = df_pks_to_change[coluna]
113
113
  if null_rate != 1:
114
114
  cols_to_check = [col for col in df_data.columns if col not in self.pk_cols + list(const_cols.keys())]
@@ -131,7 +131,7 @@ class DBFSUtils(FSUtils):
131
131
 
132
132
  def rm(self, path: str, recursive: bool = False) -> None:
133
133
  try:
134
- result = self.dbutils.fs.rm(path, recurse=recursive)
134
+ result = self.dbutils.fs.rm(path, recursive)
135
135
  if not result:
136
136
  raise Exception(f"Failed to delete file {path}")
137
137
  except Exception as e:
@@ -0,0 +1,102 @@
1
+ import os
2
+ import time
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import List, Dict, Optional, Generator, Callable, Any
6
+ from rand_engine.rand_generator import RandGenerator
7
+ from rand_engine.file_writer import FileWriter
8
+ from rand_engine.utils.stream_handler import StreamHandler
9
+
10
+
11
+
12
+
13
class RandEngine:
    """
    Fluent front-end that turns a random spec into pandas/Spark DataFrames, record
    streams or files.

    Generation is lazy: ``generate_pandas_df`` / ``generate_spark_df`` only store a
    zero-argument builder in ``actual_dataframe``; data is produced each time that
    builder is called.
    """

    def __init__(self, random_spec, seed: bool = False):
        """
        :param random_spec: Spec handed unchanged to ``RandGenerator``.
        :param seed: bool: When True the global NumPy RNG is seeded with 42; otherwise it is re-seeded from entropy.
        """
        np.random.seed(42 if seed else None)
        self.actual_dataframe: Optional[Callable[[], pd.DataFrame]] = None
        self.data_generator = RandGenerator(random_spec)
        self._mode = "pandas"
        self._size = 1000
        self._transformers: List[Optional[Callable]] = []

    def _lazy_pandas(self, size: int) -> Callable[[], pd.DataFrame]:
        """Return a builder that generates a fresh pandas DataFrame of ``size`` rows per call."""
        def wrapped_lazy_dataframe():
            generator = self.data_generator
            df_pandas = generator.generate_first_level(size=size)
            df_pandas = generator.handle_splitable(df_pandas)
            df_pandas = generator.apply_embedded_transformers(df_pandas)
            # self._transformers is read at call time, so transformers registered after
            # this builder was created are still applied.
            return generator.apply_global_transformers(df_pandas, self._transformers)
        return wrapped_lazy_dataframe

    def generate_pandas_df(self, size: int) -> "RandEngine":
        """
        This method registers a lazy pandas DataFrame builder in ``actual_dataframe``.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, so calls can be chained.
        """
        self.actual_dataframe = self._lazy_pandas(size)
        return self

    def transformers(self, transformers: List[Optional[Callable]]):
        """
        This method sets the global transformers applied to every generated DataFrame.
        :param transformers: List[Optional[Callable]]: Callables receiving and returning a DataFrame.
        :return: RandEngine: The engine itself, so calls can be chained.
        """
        self._transformers = transformers
        return self

    def generate_spark_df(self, spark, size: int) -> Any:
        """
        This method registers a lazy Spark DataFrame builder in ``actual_dataframe``.
        :param spark: SparkSession: Object whose ``createDataFrame`` converts the pandas data.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, so calls can be chained.
        """
        # The pandas builder is kept in a local so that running the Spark builder does
        # not replace self.actual_dataframe with the pandas one.
        build_pandas = self._lazy_pandas(size)

        def wrapped_lazy_dataframe():
            return spark.createDataFrame(build_pandas())
        self.actual_dataframe = wrapped_lazy_dataframe
        return self

    def mode(self, mode: str):
        """
        This method selects the output flavour used by ``get_df``.
        :param mode: str: Either "pandas" or "spark".
        :return: RandEngine: The engine itself, so calls can be chained.
        """
        assert mode in ["pandas", "spark"], "Mode not recognized. Use 'pandas' or 'spark'."
        self._mode = mode
        return self

    def size(self, size: int):
        """
        This method sets the number of rows used by ``get_df`` and ``stream_dict``.
        :param size: int: Number of rows to be generated.
        :return: RandEngine: The engine itself, so calls can be chained.
        """
        self._size = size
        return self

    def get_df(self, spark=None):
        """
        This method generates and returns a DataFrame according to the configured mode and size.
        :param spark: SparkSession: Required when the mode is "spark"; ignored otherwise.
        :return: The generated pandas DataFrame, or whatever ``spark.createDataFrame`` returns.
        :raises ValueError: If the mode is "spark" and no ``spark`` object was given.
        """
        if self._mode == "pandas":
            self.generate_pandas_df(size=self._size)
        elif self._mode == "spark":
            if spark is None:
                raise ValueError("A SparkSession must be passed to get_df() when mode is 'spark'.")
            self.generate_spark_df(spark=spark, size=self._size)
        assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
        return self.actual_dataframe()

    def stream_dict(self, min_throughput: int=1, max_throughput: int = 10) -> Generator:
        """
        This method creates an endless generator of records to be used in a streaming context.
        Each record is one row of a freshly generated microbatch plus a "timestamp_created" key.
        :param min_throughput: int: Minimum throughput, forwarded to StreamHandler.
        :param max_throughput: int: Maximum throughput, forwarded to StreamHandler.
        :return: Generator: Generator of records (dicts).
        """
        self.generate_pandas_df(size=self._size)
        assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
        while True:
            df_data_microbatch = self.actual_dataframe()
            df_data_parsed = StreamHandler.convert_dt_to_str(df_data_microbatch)
            for record in df_data_parsed.to_dict('records'):
                record["timestamp_created"] = round(time.time(), 3)
                yield record
                StreamHandler.sleep_to_contro_throughput(min_throughput, max_throughput)

    def write(self, size):
        """
        This method prepares a FileWriter for DataFrames of ``size`` rows.
        :param size: int: Number of rows of every generated DataFrame.
        :return: FileWriter: Writer built around a callable that returns the current lazy builder.
        """
        self.generate_pandas_df(size=size)
        # FileWriter receives a callable returning the lazy builder (not a DataFrame).
        microbatch_def = lambda: self.actual_dataframe
        return FileWriter(microbatch_def)
97
+
98
+
99
+
100
+ if __name__ == '__main__':
101
+
102
+ pass
@@ -0,0 +1,46 @@
1
+ from typing import List, Optional, Callable
2
+ import pandas as pd
3
+
4
+
5
class RandGenerator:
    """
    Builds pandas DataFrames from a random spec.

    The spec is a dict mapping a column name to a dict with a "method" callable and,
    optionally, "args" or "kwargs" for it, "transformers" (per-value callables), and
    "splitable" / "sep" / "cols" to split a generated string column into several.
    """

    def __init__(self, random_spec):
        """
        :param random_spec: dict: Column name -> column description (see class docstring).
        """
        self.random_spec = random_spec

    def generate_first_level(self, size: int):
        """
        This method generates one column per spec entry by calling its "method".
        :param size: int: Number of rows, passed as the first argument of every method.
        :return: pd.DataFrame: DataFrame with the generated columns.
        :raises Exception: If a method fails; the message names the column and the
            original error is attached as the cause.
        """
        columns = {}
        for name, spec in self.random_spec.items():
            try:
                # "args" (positional) takes precedence over "kwargs" when both are given.
                if "args" in spec:
                    columns[name] = spec["method"](size, *spec["args"])
                else:
                    columns[name] = spec["method"](size, **spec.get("kwargs", {}))
            except Exception as e:
                raise Exception(f"Error generating data for column '{name}': {e}") from e
        return pd.DataFrame(columns)

    def apply_embedded_transformers(self, df):
        """
        This method applies the "transformers" of each spec entry to its column, value by value.
        :param df: pd.DataFrame: DataFrame containing the columns named in the spec.
        :return: pd.DataFrame: The same DataFrame, modified in place.
        """
        for col, spec in self.random_spec.items():
            for transformer in spec.get("transformers") or ():
                df[col] = df[col].apply(transformer)
        return df

    def apply_global_transformers(self, df, transformers: List[Optional[Callable]]):
        """
        This method pipes the DataFrame through a sequence of DataFrame -> DataFrame callables.
        :param df: pd.DataFrame: DataFrame to be transformed.
        :param transformers: List[Optional[Callable]]: Callables applied in order; ``None``
            entries are skipped and a ``None``/empty list leaves the DataFrame untouched.
        :return: pd.DataFrame: Result of the last transformer (or ``df`` itself).
        """
        for transformer in transformers or ():
            if transformer is None:
                continue
            df = transformer(df)
        return df

    def handle_splitable(self, df):
        """
        This method splits every column flagged as "splitable" into the columns listed in "cols".
        :param df: pd.DataFrame: DataFrame containing the string columns to be split.
        :return: pd.DataFrame: The same DataFrame, with the source columns dropped.
        """
        for key, value in self.random_spec.items():
            if not value.get("splitable"):
                continue
            sep = value.get("sep", ";")
            cols = value.get("cols")
            df[cols] = df[key].str.split(sep, expand=True)
            df.drop(columns=[key], inplace=True)
        return df
@@ -0,0 +1,40 @@
1
+
2
+ import pyspark.sql.functions as F
3
+ from pyspark.sql.functions import randn, rand, randstr
4
+ from pyspark.sql import DataFrame, SparkSession
5
+ import pandas as pd
6
+
7
class RandSpark:
    """
    Thin wrapper around a Spark DataFrame that adds ``withColumnRand*`` helpers.

    Attributes not defined here are looked up on the wrapped DataFrame, so the wrapper
    can be used where DataFrame methods are called on it.
    """

    def __init__(self, spark, df: "DataFrame"):
        """
        :param spark: SparkSession: Session used to create auxiliary DataFrames.
        :param df: DataFrame: DataFrame being wrapped.
        """
        self.spark = spark
        self._df = df

    def withColumnRandInt(self, col_name="rand_int", min_size=0, max_size=10):
        """
        This method adds a column of random integers computed as ``rand() * (max - min) + min`` cast to int.
        :param col_name: str: Name of the new column.
        :param min_size: Lower bound of the generated values.
        :param max_size: Upper bound used to scale ``rand()``.
        :return: RandSpark: New wrapper around the extended DataFrame.
        """
        value = (F.rand() * (max_size - min_size) + min_size).cast("int")
        # The session must be forwarded: __init__ takes (spark, df).
        return RandSpark(self.spark, self._df.withColumn(col_name, value))

    def withColumnRandFloat(self, col_name="rand_float", min_size=0.0, max_size=10.0, decimals=2):
        """
        This method adds a column of random floats computed as ``rand() * (max - min) + min``, rounded.
        :param col_name: str: Name of the new column.
        :param min_size: Lower bound of the generated values.
        :param max_size: Upper bound used to scale ``rand()``.
        :param decimals: int: Number of decimal places kept.
        :return: RandSpark: New wrapper around the extended DataFrame.
        """
        value = F.round(F.rand() * (max_size - min_size) + min_size, decimals)
        return RandSpark(self.spark, self._df.withColumn(col_name, value))

    def withColumnRandChoice(self, col_name="rand_choice", distincts=None):
        """
        This method adds a column whose values are picked at random from ``distincts``.
        A random index column is generated and left-joined against a broadcast lookup table.
        :param col_name: str: Name of the new column.
        :param distincts: list: Values to choose from; must not be empty.
        :return: RandSpark: New wrapper around the original columns plus ``col_name``.
        :raises ValueError: If ``distincts`` is missing or empty.
        """
        choices = list(distincts) if distincts is not None else []
        if not choices:
            raise ValueError("'distincts' must contain at least one value.")
        df_columns = self._df.columns
        aux_col = f"{col_name}_aux"

        # Lookup table: index -> value.
        df_pd = pd.DataFrame(choices, columns=[col_name])
        df_pd[aux_col] = range(len(choices))
        df_lookup = self.spark.createDataFrame(df_pd)

        df_indexed = self._df.withColumn(aux_col, (F.rand() * len(choices)).cast("int"))
        df_joined = (
            df_indexed.alias("a")
            .join(F.broadcast(df_lookup).alias("b"), on=aux_col, how="left")
            .select(*df_columns, f"b.{col_name}")
        )
        return RandSpark(self.spark, df_joined)

    def __getattr__(self, name):
        """Delegate unknown methods to the original DataFrame"""
        # __getattr__ only runs when normal lookup fails; if `_df` itself is missing
        # (instance created without __init__), delegating would recurse forever.
        if name == "_df":
            raise AttributeError(name)
        return getattr(self._df, name)
@@ -5,7 +5,7 @@ import itertools
5
5
  class DistinctUtils:
6
6
 
7
7
  @classmethod
8
- def handle_distincts_lvl_1(self, distinct_prop, precision):
8
+ def handle_distincts_lvl_1(self, distinct_prop, precision=1):
9
9
  """
10
10
  This method generates a list of distinct values based on a dictionary of distinct values and their respective frequencies.
11
11
  :param distinct_prop: dict: Dictionary containing the distinct values and their respective frequencies.
@@ -22,6 +22,13 @@ class DistinctUtils:
22
22
  data_flatted = [f"{j}{sep}{i}" for j in distincts for i in distincts[j]]
23
23
  return data_flatted
24
24
 
25
+ @classmethod
26
+ def handle_distincts_lvl_22(self, distincts):
27
+ """
28
+ This method flattens a dictionary of lists into a list of (key, value) tuples, one tuple per value listed under each key.
28
+ :param distincts: dict: Dictionary mapping each key to an iterable of its values."""
30
+ data_flatted = [(j, i) for j in distincts for i in distincts[j]]
31
+ return data_flatted
25
32
 
26
33
  @classmethod
27
34
  def handle_distincts_lvl_3(self, distincts, sep=";"):
@@ -2,7 +2,7 @@ import random
2
2
  import time
3
3
  import pandas as pd
4
4
 
5
- class StreamHandle:
5
+ class StreamHandler:
6
6
 
7
7
  @staticmethod
8
8
  def convert_dt_to_str(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -0,0 +1,33 @@
1
+ import numpy as np
2
+ from random import randint
3
+
4
+
5
class Changer:
    """
    Randomly perturbs selected columns of a DataFrame.

    Numeric columns get a random signed offset added; object columns have their
    values rotated by a random number of positions.
    """

    def __init__(self, cols_to_change):
        """
        :param cols_to_change: Collection with the names of the columns allowed to change.
        """
        self.cols_to_change = cols_to_change

    def __transformer_numeric(self, df):
        """
        Add a random signed offset to every selected int64/int32/float32/float64 column.
        The offset magnitude is drawn between the column's minimum and maximum.
        :param df: pd.DataFrame: DataFrame to be modified in place.
        :return: pd.DataFrame: The same DataFrame.
        """
        numeric_types = ['int64', 'int32', 'float32', 'float64']
        n_rows = len(df)
        if n_rows == 0:
            # min()/max() of an empty column are NaN and cannot bound a draw.
            return df
        for col in df.columns:
            if col not in self.cols_to_change or df[col].dtype not in numeric_types:
                continue
            low, high = df[col].min(), df[col].max()
            if np.isnan(low) or np.isnan(high):
                # All-NaN column: there is no range to draw from.
                continue
            low, high = int(low), int(high)
            # randint needs low < high; single-row or constant columns would raise otherwise.
            high = max(high, low + 1)
            # NOTE: re-seeds the global NumPy RNG from entropy, discarding any earlier seed.
            np.random.seed(None)
            offset = np.random.randint(low, high, size=n_rows)
            sign = np.random.choice([-1, 1], size=n_rows)
            # Kept in local arrays so no helper column can collide with a user column.
            df[col] = df[col] + offset * sign
        return df

    def __transformer_object(self, df):
        """
        Rotate the values of every selected object column by a random number of positions.
        :param df: pd.DataFrame: DataFrame to be modified in place.
        :return: pd.DataFrame: The same DataFrame.
        """
        n_rows = len(df)
        if n_rows == 0:
            # randint(1, 0) would raise on an empty frame.
            return df
        for col in df.columns:
            if col in self.cols_to_change and df[col].dtype == 'object':
                df[col] = np.roll(df[col], randint(1, n_rows))
        return df

    def updater(self, df):
        """
        This method applies the numeric and then the object perturbation to the DataFrame.
        :param df: pd.DataFrame: DataFrame to be modified.
        :return: pd.DataFrame: The perturbed DataFrame.
        """
        for transform in (self.__transformer_numeric, self.__transformer_object):
            df = transform(df)
        return df
@@ -1,45 +0,0 @@
1
- import numpy as np
2
- import random
3
- from datetime import datetime as dt, timedelta
4
-
5
-
6
- class DatetimeCore:
7
-
8
- @classmethod
9
- def gen_unix_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
10
- """
11
- This method generates an array of random unix timestamps.
12
- :param size: int: Number of elements to be generated.
13
- :param start: str: Start date of the generated timestamps.
14
- :param end: str: End date of the generated timestamps.
15
- :param format: str: Format of the input dates."""
16
- dt_start, dt_end = dt.strptime(start, format), dt.strptime(end, format)
17
- if dt_start < dt(1970, 1, 1): dt_start = dt(1970, 1, 1)
18
- timestamp_start, timestamp_end = dt_start.timestamp(), dt_end.timestamp()
19
- int_array = np.random.randint(timestamp_start, timestamp_end, size)
20
- return int_array
21
-
22
-
23
- @classmethod
24
- def gen_timestamps(self, size: int, start: str, end: str, format: str) -> np.ndarray:
25
- """
26
- This method generates an array of random timestamps.
27
- :param size: int: Number of elements to be generated.
28
- :param start: str: Start date of the generated timestamps.
29
- :param end: str: End date of the generated timestamps.
30
- :param format: str: Format of the input dates.
31
- :return: np.ndarray: Array of random timestamps."""
32
-
33
- date_array = self.gen_unix_timestamps(size, start, end, format).astype('datetime64[s]')
34
- return date_array
35
-
36
-
37
- @classmethod
38
- def gen_datetimes(self, size: int, start: str, end: str, format_in: str, format_out: str):
39
- timestamp_array = self.gen_unix_timestamps(size, start, end, format_in)
40
- return [dt.fromtimestamp(i).strftime(format_out) for i in timestamp_array]
41
-
42
-
43
-
44
- if __name__ == '__main__':
45
- pass
@@ -1,65 +0,0 @@
1
- from functools import reduce
2
- from typing import List, Any, Iterator
3
- import numpy as np
4
-
5
-
6
- class DistinctCore:
7
-
8
- @classmethod
9
- def gen_distincts_typed(self, size: int, distinct: List[Any]) -> np.ndarray:
10
- """
11
- This method generates an array of random distinct values.
12
- :param size: int: Number of elements to be generated.
13
- :param distinct: List[Any]: List of distinct values to be generated.
14
- :return: np.ndarray: Array of random distinct values.
15
- """
16
- assert len(list(set([type(x) for x in distinct]))) == 1
17
- return np.random.choice(distinct, size)
18
-
19
-
20
- @classmethod
21
- def gen_distincts_untyped(self, size: int, distinct: List[Any]) -> List[Any]:
22
- """
23
- This method generates an array of random distinct values.
24
- :param size: int: Number of elements to be generated.
25
- :param distinct: List[Any]: List of distinct values to be generated.
26
- :return: Iterator: Iterator of random distinct values.
27
- """
28
- return list(map(lambda x: distinct[x], np.random.randint(0, len(distinct), size)))
29
-
30
- @classmethod
31
- def gen_complex_distincts(self, size: int, pattern="x.x.x-x", replacement="x", templates=[]):
32
- """
33
- This method generates an array of random distinct values.
34
- :param size: int: Number of elements to be generated.
35
- :param pattern: str: Pattern to be replaced.
36
- :param replacement: str: Replacement of the pattern.
37
- :param templates: List[Dict]: List of dictionaries containing the method and parameters to be used in the replacement.
38
- :return: np.ndarray: Array of random distinct values.
39
- """
40
- assert pattern.count(replacement) == len(templates)
41
- list_of_lists, counter = [], 0
42
- for replacer_cursor in range(len(pattern)):
43
- if pattern[replacer_cursor] == replacement:
44
- list_of_lists.append(templates[counter]["method"](size, **templates[counter]["parms"]))
45
- counter += 1
46
- else:
47
- list_of_lists.append(np.array([pattern[replacer_cursor] for i in range(size)]))
48
- return reduce(lambda a, b: a.astype('str') + b.astype('str'), list_of_lists)
49
-
50
-
51
-
52
- if __name__ == '__main__':
53
- pass
54
-
55
-
56
-
57
- # def replace_duplicate(array_input, replace):
58
- # result = list(set(array_input))
59
- # result.extend([replace for i in range(len(array_input)-len(list(set(array_input))))])
60
- # random.shuffle(result)
61
- # return result
62
-
63
- # def handle_string_format(array_input, **kwargs):
64
- # return replace_duplicate(array_input, np.nan) \
65
- # if kwargs.get("rm_dupl") else array_input
@@ -1,14 +0,0 @@
1
- import numpy as np
2
-
3
- def expand_array(size=10, base_array=[]):
4
- return [base_array[int(i % len(base_array))] for i in range(size)]
5
-
6
- def reduce_array(size=10, base_array=[]):
7
- int_array = [int(i) for i in np.linspace(0, size-1, len(base_array))]
8
- reduced = [int_array.index(i) for i in range(size)]
9
- result = [base_array[i] for i in reduced]
10
- return result
11
-
12
-
13
- def spaced_array(interval, num_part=2):
14
- return list(np.linspace(interval[0], interval[1], num_part))
@@ -1,62 +0,0 @@
1
- import numpy as np
2
-
3
-
4
- class NumericCore:
5
-
6
-
7
- @classmethod
8
- def gen_ints(self, size: int, min: int, max: int) -> np.ndarray:
9
- """
10
- This method generates an array of random integers.
11
- :param size: int: Number of elements to be generated.
12
- :param min: int: Minimum value of the generated numbers.
13
- :param max: int: Maximum value of the generated numbers.
14
- :return: np.ndarray: Array of random integers.
15
- """
16
- return np.random.randint(min, max + 1, size)
17
-
18
-
19
- @classmethod
20
- def gen_ints_zfilled(self, size: int, length: int) -> np.ndarray:
21
- """
22
- This method generates an array of random integers with a fixed length.
23
- :param size: int: Number of elements to be generated.
24
- :param length: int: Length of the generated numbers.
25
- :return: np.ndarray: Array of random integers.
26
- """
27
- str_arr = np.random.randint(0, 10**length, size).astype('str')
28
- return np.char.zfill(str_arr, length)
29
-
30
-
31
- @classmethod
32
- def gen_floats(self, size: int, min: int, max: int, round: int = 2):
33
- """
34
- This method generates an array of random floats.
35
- :param size: int: Number of elements to be generated.
36
- :param min: int: Minimum value of the generated numbers.
37
- :param max: int: Maximum value of the generated numbers.
38
- :param round: int: Number of decimal places to round the generated numbers. Default is 2.
39
- :return: np.ndarray: Array of random floats.
40
- """
41
- sig_part = np.random.randint(min, max, size)
42
- decimal = np.random.randint(0, 10 ** round, size)
43
- return sig_part + (decimal / 10 ** round) if round > 0 else sig_part
44
-
45
-
46
- @classmethod
47
- def gen_floats_normal(self, size: int, mean: int, std: int, round: int = 2):
48
- """
49
- This method generates an array of random floats with a normal distribution.
50
- :param size: int: Number of elements to be generated.
51
- :param mean: int: Mean of the distribution.
52
- :param std: int: Standard deviation of the distribution.
53
- :param round: int: Number of decimal places to round the generated numbers. Default is 2.
54
- :return: np.ndarray: Array of random floats.
55
- """
56
- return np.round(np.random.normal(mean, std, size), round)
57
-
58
-
59
-
60
-
61
- if __name__ == '__main__':
62
- pass
@@ -1,88 +0,0 @@
1
- import os
2
- import time
3
- import pandas as pd
4
- from typing import List, Dict, Optional, Generator, Callable, Any
5
- from rand_engine.main.file_writer import FileWriter
6
- from rand_engine.main.stream_handle import StreamHandle
7
-
8
- class DataGenerator:
9
-
10
- def __init__(self, random_spec):
11
- self.random_spec = random_spec
12
- self.actual_dataframe = None
13
-
14
-
15
- def handle_splitable(self, metadata, df):
16
- for key, value in metadata.items():
17
- if value.get("splitable"):
18
- sep = value.get("sep", ";")
19
- cols = value.get("cols")
20
- df[cols] = df[key].str.split(sep, expand=True)
21
- df.drop(columns=[key], inplace=True)
22
- return df
23
-
24
-
25
- def generate_pandas_df(self, size: int, transformer: Optional[Callable]=None) -> pd.DataFrame:
26
- """
27
- This method generates a pandas DataFrame based on random data specified in the metadata parameter.
28
- :param size: int: Number of rows to be generated.
29
- :param transformer: Optional[Callable]: Function to transform the generated data.
30
- :return: pd.DataFrame: DataFrame with the generated data.
31
- """
32
- assert type(self.random_spec) is dict, "You need to pass a random_spec parameter to generate the random data."
33
- def first_level():
34
- dict_data = {key: value["method"](size, **value["parms"]) for key, value in self.random_spec.items()}
35
- df_pandas = pd.DataFrame(dict_data)
36
- df_pandas = self.handle_splitable(self.random_spec, df_pandas)
37
- if transformer: return transformer(df_pandas)
38
- return df_pandas
39
- self.actual_dataframe = first_level
40
- return self
41
-
42
-
43
- def generate_spark_df(self, spark, size: int, transformer: Optional[Callable]=None) -> Any:
44
- """
45
- This method generates a Spark DataFrame based on random data specified in the random_spec parameter.
46
- :param spark: SparkSession: SparkSession object.
47
- :param size: int: Number of rows to be generated.
48
- :param transformer: Optional[Callable]: Function to transform the generated data."""
49
- def second_level():
50
- self.generate_pandas_df(size=size, transformer=transformer)
51
- df_spark = spark.createDataFrame(self.actual_dataframe())
52
- return df_spark
53
- self.actual_dataframe = second_level
54
- return self
55
-
56
-
57
- def get_df(self):
58
- assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
59
- return self.actual_dataframe()
60
-
61
-
62
- def stream_dict(self, min_throughput: int=1, max_throughput: int = 10) -> Generator:
63
- """
64
- This method creates a generator of records to be used in a streaming context.
65
- :param min_throughput: int: Minimum throughput to be generated.
66
- :param max_throughput: int: Maximum throughput to be generated.
67
- :return: Generator: Generator of records.
68
- """
69
- assert self.actual_dataframe is not None, "You need to generate a DataFrame first."
70
- while True:
71
- df_data_microbatch = self.actual_dataframe()
72
- df_data_parsed = StreamHandle.convert_dt_to_str(df_data_microbatch)
73
- list_of_records = df_data_parsed.to_dict('records')
74
- for record in list_of_records:
75
- record["timestamp_created"] = round(time.time(), 3)
76
- yield record
77
- StreamHandle.sleep_to_contro_throughput(min_throughput, max_throughput)
78
-
79
-
80
- def write(self):
81
- microbatch_def = lambda: self.actual_dataframe
82
- return FileWriter(microbatch_def)
83
-
84
-
85
-
86
- if __name__ == '__main__':
87
-
88
- pass
File without changes