rand-engine 0.4.1__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rand_engine-0.4.1 → rand_engine-0.4.2}/PKG-INFO +1 -1
- {rand_engine-0.4.1 → rand_engine-0.4.2}/pyproject.toml +1 -1
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/cdc_generator.py +5 -36
- {rand_engine-0.4.1 → rand_engine-0.4.2}/README.md +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/__init__.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/core/datetime_core.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/core/distinct_core.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/core/distinct_utils.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/core/general_utils.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/core/numeric_core.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/data_generator.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/file_writer.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/fs_utils.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/i_random_spec.py +0 -0
- {rand_engine-0.4.1 → rand_engine-0.4.2}/rand_engine/main/stream_handle.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rand-engine
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
|
|
5
5
|
Author: marcoaureliomenezes
|
|
6
6
|
Author-email: marcoaurelioreislima@gmail.com
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "rand-engine"
|
|
3
|
-
version = "0.4.1"
|
|
3
|
+
version = "0.4.2"
|
|
4
4
|
description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
|
|
5
5
|
authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
|
|
6
6
|
repository = "https://github.com/marcoaureliomenezes/rand_engine"
|
|
@@ -75,7 +75,8 @@ class FilesGenerator:
|
|
|
75
75
|
|
|
76
76
|
class CDCGenerator(FilesGenerator):
|
|
77
77
|
|
|
78
|
-
def __init__(self,
|
|
78
|
+
def __init__(self, spark, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
|
|
79
|
+
self.spark = spark
|
|
79
80
|
self.footprint = footprint
|
|
80
81
|
self.fs_utils = fs_utils
|
|
81
82
|
self.pk_cols = pk_cols
|
|
@@ -94,7 +95,7 @@ class CDCGenerator(FilesGenerator):
|
|
|
94
95
|
|
|
95
96
|
|
|
96
97
|
def calculate_rows_to_change(self, sample):
|
|
97
|
-
df = spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
|
|
98
|
+
df = self.spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
|
|
98
99
|
df_ids_inserted = df.select(*self.pk_cols).filter("operation = 'INSERT'").distinct()
|
|
99
100
|
df_ids_deleted = df.select(*self.pk_cols).filter("operation = 'DELETE'").distinct()
|
|
100
101
|
df_pks_to_change = df_ids_inserted.join(df_ids_deleted, on=self.pk_cols, how="leftanti")
|
|
@@ -102,40 +103,8 @@ class CDCGenerator(FilesGenerator):
|
|
|
102
103
|
return df_pks_to_change
|
|
103
104
|
|
|
104
105
|
|
|
105
|
-
def calculate_rows_to_change_pandas(self, sample):
|
|
106
|
-
files = self.fs_utils.ls(self.base_path)
|
|
107
|
-
# Read and concatenate all files
|
|
108
|
-
df_list = []
|
|
109
|
-
for file_info in files:
|
|
110
|
-
print(f"Reading file: {file_info.path}")
|
|
111
|
-
if file_info.name.endswith(f".{self.ext}"):
|
|
112
|
-
if self.ext == "json":
|
|
113
|
-
df_temp = pd.read_json(file_info.path, lines=True)
|
|
114
|
-
elif self.ext == "csv":
|
|
115
|
-
df_temp = pd.read_csv(file_info.path)
|
|
116
|
-
elif self.ext == "parquet":
|
|
117
|
-
df_temp = pd.read_parquet(file_info.path)
|
|
118
|
-
else:
|
|
119
|
-
continue
|
|
120
|
-
df_list.append(df_temp)
|
|
121
|
-
if not df_list:
|
|
122
|
-
return pd.DataFrame(columns=self.pk_cols)
|
|
123
|
-
df = pd.concat(df_list, ignore_index=True)
|
|
124
|
-
df = df.dropna(subset=self.pk_cols)
|
|
125
|
-
df_ids_inserted = df[df['operation'] == 'INSERT'][self.pk_cols].drop_duplicates()
|
|
126
|
-
df_ids_deleted = df[df['operation'] == 'DELETE'][self.pk_cols].drop_duplicates()
|
|
127
|
-
df_pks_to_change = df_ids_inserted.merge(df_ids_deleted, on=self.pk_cols, how='left', indicator=True)
|
|
128
|
-
# Keep only records that exist only in left (inserted but not deleted)
|
|
129
|
-
df_pks_to_change = df_pks_to_change[df_pks_to_change['_merge'] == 'left_only']
|
|
130
|
-
df_pks_to_change = df_pks_to_change.drop('_merge', axis=1)
|
|
131
|
-
if sample < 1.0 and len(df_pks_to_change) > 0:
|
|
132
|
-
df_pks_to_change = df_pks_to_change.sample(frac=sample)
|
|
133
|
-
|
|
134
|
-
return df_pks_to_change
|
|
135
|
-
|
|
136
|
-
|
|
137
106
|
def generate_changes(self, sample, const_cols, null_rate):
|
|
138
|
-
df_pks_to_change = self.
|
|
107
|
+
df_pks_to_change = self.calculate_rows_to_change(sample=sample)
|
|
139
108
|
metadata = self.footprint.metadata()
|
|
140
109
|
size = df_pks_to_change.shape[0]
|
|
141
110
|
transformer = self.footprint.transformer_cdc_update(null_rate=null_rate, **const_cols)
|
|
@@ -166,7 +135,7 @@ class CDCGenerator(FilesGenerator):
|
|
|
166
135
|
df_changes = pd.concat([df_update, df_delete], ignore_index=True)
|
|
167
136
|
file_path = self._get_file_path()
|
|
168
137
|
df_changes.to_json(file_path, orient="records", lines=True)
|
|
169
|
-
|
|
138
|
+
print(f"File {file_path} created with {df_changes.shape[0]} records.")
|
|
170
139
|
|
|
171
140
|
def generate_cdc_stream(self, period=5, rounds=15):
|
|
172
141
|
for i in range(rounds):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|