rand-engine 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rand_engine-0.4.0 → rand_engine-0.4.2}/PKG-INFO +2 -2
- {rand_engine-0.4.0 → rand_engine-0.4.2}/README.md +1 -1
- {rand_engine-0.4.0 → rand_engine-0.4.2}/pyproject.toml +1 -1
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/cdc_generator.py +6 -37
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/fs_utils.py +8 -1
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/__init__.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/core/datetime_core.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/core/distinct_core.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/core/distinct_utils.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/core/general_utils.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/core/numeric_core.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/data_generator.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/file_writer.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/i_random_spec.py +0 -0
- {rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/stream_handle.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rand-engine
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
|
|
5
5
|
Author: marcoaureliomenezes
|
|
6
6
|
Author-email: marcoaurelioreislima@gmail.com
|
|
@@ -49,7 +49,7 @@ CoreDistinct().randint(0, 100, 10)
|
|
|
49
49
|
To create a new release, simply create and push a git tag with semantic versioning:
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
|
-
git tag 0.
|
|
52
|
+
git tag 0.4.1 && git push origin --tags
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
The GitHub Actions workflow will automatically:
|
|
@@ -29,7 +29,7 @@ CoreDistinct().randint(0, 100, 10)
|
|
|
29
29
|
To create a new release, simply create and push a git tag with semantic versioning:
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
-
git tag 0.
|
|
32
|
+
git tag 0.4.1 && git push origin --tags
|
|
33
33
|
```
|
|
34
34
|
|
|
35
35
|
The GitHub Actions workflow will automatically:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "rand-engine"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.4.2"
|
|
4
4
|
description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
|
|
5
5
|
authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
|
|
6
6
|
repository = "https://github.com/marcoaureliomenezes/rand_engine"
|
|
@@ -44,7 +44,7 @@ class FilesGenerator:
|
|
|
44
44
|
|
|
45
45
|
def list_files(self):
|
|
46
46
|
assert self.base_path, "Base path not configured. Run the method setup_output."
|
|
47
|
-
self.fs_utils.
|
|
47
|
+
return self.fs_utils.ls(self.base_path)
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
def delete_files(self):
|
|
@@ -75,8 +75,10 @@ class FilesGenerator:
|
|
|
75
75
|
|
|
76
76
|
class CDCGenerator(FilesGenerator):
|
|
77
77
|
|
|
78
|
-
def __init__(self,
|
|
78
|
+
def __init__(self, spark, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
|
|
79
|
+
self.spark = spark
|
|
79
80
|
self.footprint = footprint
|
|
81
|
+
self.fs_utils = fs_utils
|
|
80
82
|
self.pk_cols = pk_cols
|
|
81
83
|
self.cdc_props = self.default_cdc_properties()
|
|
82
84
|
|
|
@@ -93,7 +95,7 @@ class CDCGenerator(FilesGenerator):
|
|
|
93
95
|
|
|
94
96
|
|
|
95
97
|
def calculate_rows_to_change(self, sample):
|
|
96
|
-
df = spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
|
|
98
|
+
df = self.spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
|
|
97
99
|
df_ids_inserted = df.select(*self.pk_cols).filter("operation = 'INSERT'").distinct()
|
|
98
100
|
df_ids_deleted = df.select(*self.pk_cols).filter("operation = 'DELETE'").distinct()
|
|
99
101
|
df_pks_to_change = df_ids_inserted.join(df_ids_deleted, on=self.pk_cols, how="leftanti")
|
|
@@ -101,37 +103,6 @@ class CDCGenerator(FilesGenerator):
|
|
|
101
103
|
return df_pks_to_change
|
|
102
104
|
|
|
103
105
|
|
|
104
|
-
def calculate_rows_to_change_pandas(self, sample):
|
|
105
|
-
files = self.fs_utils.ls(self.base_path)
|
|
106
|
-
# Read and concatenate all files
|
|
107
|
-
df_list = []
|
|
108
|
-
for file_info in files:
|
|
109
|
-
if file_info.name.endswith(f".{self.ext}"):
|
|
110
|
-
if self.ext == "json":
|
|
111
|
-
df_temp = pd.read_json(file_info.path, lines=True)
|
|
112
|
-
elif self.ext == "csv":
|
|
113
|
-
df_temp = pd.read_csv(file_info.path)
|
|
114
|
-
elif self.ext == "parquet":
|
|
115
|
-
df_temp = pd.read_parquet(file_info.path)
|
|
116
|
-
else:
|
|
117
|
-
continue
|
|
118
|
-
df_list.append(df_temp)
|
|
119
|
-
if not df_list:
|
|
120
|
-
return pd.DataFrame(columns=self.pk_cols)
|
|
121
|
-
df = pd.concat(df_list, ignore_index=True)
|
|
122
|
-
df = df.dropna(subset=self.pk_cols)
|
|
123
|
-
df_ids_inserted = df[df['operation'] == 'INSERT'][self.pk_cols].drop_duplicates()
|
|
124
|
-
df_ids_deleted = df[df['operation'] == 'DELETE'][self.pk_cols].drop_duplicates()
|
|
125
|
-
df_pks_to_change = df_ids_inserted.merge(df_ids_deleted, on=self.pk_cols, how='left', indicator=True)
|
|
126
|
-
# Keep only records that exist only in left (inserted but not deleted)
|
|
127
|
-
df_pks_to_change = df_pks_to_change[df_pks_to_change['_merge'] == 'left_only']
|
|
128
|
-
df_pks_to_change = df_pks_to_change.drop('_merge', axis=1)
|
|
129
|
-
if sample < 1.0 and len(df_pks_to_change) > 0:
|
|
130
|
-
df_pks_to_change = df_pks_to_change.sample(frac=sample)
|
|
131
|
-
|
|
132
|
-
return df_pks_to_change
|
|
133
|
-
|
|
134
|
-
|
|
135
106
|
def generate_changes(self, sample, const_cols, null_rate):
|
|
136
107
|
df_pks_to_change = self.calculate_rows_to_change(sample=sample)
|
|
137
108
|
metadata = self.footprint.metadata()
|
|
@@ -147,10 +118,8 @@ class CDCGenerator(FilesGenerator):
|
|
|
147
118
|
|
|
148
119
|
|
|
149
120
|
def generate_inserts(self):
|
|
150
|
-
operation="INSERT"
|
|
151
121
|
const_cols={"operation": "INSERT", "updated_at": dt.now().strftime("%Y-%m-%dT%H:%M:%S")}
|
|
152
122
|
insert_conf = self.cdc_props["INSERT"]
|
|
153
|
-
file_path = self._get_file_path()
|
|
154
123
|
rand_size = randint(insert_conf["min_size"], insert_conf["max_size"])
|
|
155
124
|
self.write_file(size=rand_size, const_cols=const_cols)
|
|
156
125
|
|
|
@@ -166,7 +135,7 @@ class CDCGenerator(FilesGenerator):
|
|
|
166
135
|
df_changes = pd.concat([df_update, df_delete], ignore_index=True)
|
|
167
136
|
file_path = self._get_file_path()
|
|
168
137
|
df_changes.to_json(file_path, orient="records", lines=True)
|
|
169
|
-
|
|
138
|
+
print(f"File {file_path} created with {df_changes.shape[0]} records.")
|
|
170
139
|
|
|
171
140
|
def generate_cdc_stream(self, period=5, rounds=15):
|
|
172
141
|
for i in range(rounds):
|
|
@@ -111,7 +111,14 @@ class DBFSUtils(FSUtils):
|
|
|
111
111
|
raise ImportError(f"DBUtils not available. Are you running in Databricks? Error: {str(e)}")
|
|
112
112
|
|
|
113
113
|
def ls(self, base_path: str) -> List[FSFileInfo]:
|
|
114
|
-
try:
|
|
114
|
+
try:
|
|
115
|
+
data = self.dbutils.fs.ls(base_path)
|
|
116
|
+
return [FSFileInfo(
|
|
117
|
+
path=item.path.replace("dbfs:", ""),
|
|
118
|
+
name=item.name,
|
|
119
|
+
size=item.size,
|
|
120
|
+
modificationTime=item.modificationTime
|
|
121
|
+
) for item in data]
|
|
115
122
|
except Exception as e:
|
|
116
123
|
return []
|
|
117
124
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|