PyPI - rand-engine - Versions diffs - 0.4.0__tar.gz → 0.4.2__tar.gz - Mend

rand-engine 0.4.0tar.gz → 0.4.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{rand_engine-0.4.0 → rand_engine-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rand-engine
-Version: 0.4.0
+Version: 0.4.2
 Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
 Author: marcoaureliomenezes
 Author-email: marcoaurelioreislima@gmail.com
@@ -49,7 +49,7 @@ CoreDistinct().randint(0, 100, 10)
 To create a new release, simply create and push a git tag with semantic versioning:
 ```bash
-git tag 0.3.13 && git push origin --tags
+git tag 0.4.1 && git push origin --tags
 ```
 The GitHub Actions workflow will automatically:

{rand_engine-0.4.0 → rand_engine-0.4.2}/README.md RENAMED Viewed

@@ -29,7 +29,7 @@ CoreDistinct().randint(0, 100, 10)
 To create a new release, simply create and push a git tag with semantic versioning:
 ```bash
-git tag 0.3.13 && git push origin --tags
+git tag 0.4.1 && git push origin --tags
 ```
 The GitHub Actions workflow will automatically:

{rand_engine-0.4.0 → rand_engine-0.4.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rand-engine"
-version = "0.4.0"
+version = "0.4.2"
 description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
 authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
 repository = "https://github.com/marcoaureliomenezes/rand_engine"

{rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/cdc_generator.py RENAMED Viewed

@@ -44,7 +44,7 @@ class FilesGenerator:
   def list_files(self):
     assert self.base_path, "Base path not configured. Run the method setup_output."
-    self.fs_utils.mkdir(self.base_path)
+    return self.fs_utils.ls(self.base_path)
   def delete_files(self):
@@ -75,8 +75,10 @@ class FilesGenerator:
 class CDCGenerator(FilesGenerator):
-  def __init__(self,  footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
+  def __init__(self, spark, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
+    self.spark = spark
     self.footprint = footprint
+    self.fs_utils = fs_utils
     self.pk_cols = pk_cols
     self.cdc_props = self.default_cdc_properties()
@@ -93,7 +95,7 @@ class CDCGenerator(FilesGenerator):
   def calculate_rows_to_change(self, sample):
-    df = spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
+    df = self.spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
     df_ids_inserted = df.select(*self.pk_cols).filter("operation = 'INSERT'").distinct()
     df_ids_deleted = df.select(*self.pk_cols).filter("operation = 'DELETE'").distinct()
     df_pks_to_change = df_ids_inserted.join(df_ids_deleted, on=self.pk_cols, how="leftanti")
@@ -101,37 +103,6 @@ class CDCGenerator(FilesGenerator):
     return df_pks_to_change
-  def calculate_rows_to_change_pandas(self, sample):
-    files = self.fs_utils.ls(self.base_path)
-    # Read and concatenate all files
-    df_list = []
-    for file_info in files:
-        if file_info.name.endswith(f".{self.ext}"):
-            if self.ext == "json":
-                df_temp = pd.read_json(file_info.path, lines=True)
-            elif self.ext == "csv":
-                df_temp = pd.read_csv(file_info.path)
-            elif self.ext == "parquet":
-                df_temp = pd.read_parquet(file_info.path)
-            else:
-                continue
-            df_list.append(df_temp)
-    if not df_list:
-        return pd.DataFrame(columns=self.pk_cols)
-    df = pd.concat(df_list, ignore_index=True)
-    df = df.dropna(subset=self.pk_cols)
-    df_ids_inserted = df[df['operation'] == 'INSERT'][self.pk_cols].drop_duplicates()
-    df_ids_deleted = df[df['operation'] == 'DELETE'][self.pk_cols].drop_duplicates()
-    df_pks_to_change = df_ids_inserted.merge(df_ids_deleted, on=self.pk_cols, how='left', indicator=True)
-    # Keep only records that exist only in left (inserted but not deleted)
-    df_pks_to_change = df_pks_to_change[df_pks_to_change['_merge'] == 'left_only']
-    df_pks_to_change = df_pks_to_change.drop('_merge', axis=1)
-    if sample < 1.0 and len(df_pks_to_change) > 0:
-        df_pks_to_change = df_pks_to_change.sample(frac=sample)
-    return df_pks_to_change
   def generate_changes(self, sample, const_cols, null_rate):
     df_pks_to_change = self.calculate_rows_to_change(sample=sample)
     metadata = self.footprint.metadata()
@@ -147,10 +118,8 @@ class CDCGenerator(FilesGenerator):
   def generate_inserts(self):
-    operation="INSERT"
     const_cols={"operation": "INSERT", "updated_at": dt.now().strftime("%Y-%m-%dT%H:%M:%S")}
     insert_conf = self.cdc_props["INSERT"]
-    file_path = self._get_file_path()
     rand_size = randint(insert_conf["min_size"], insert_conf["max_size"])
     self.write_file(size=rand_size, const_cols=const_cols)
@@ -166,7 +135,7 @@ class CDCGenerator(FilesGenerator):
     df_changes = pd.concat([df_update, df_delete], ignore_index=True)
     file_path = self._get_file_path()
     df_changes.to_json(file_path, orient="records", lines=True)
+    print(f"File {file_path} created with {df_changes.shape[0]} records.")
   def generate_cdc_stream(self, period=5, rounds=15):
     for i in range(rounds):

{rand_engine-0.4.0 → rand_engine-0.4.2}/rand_engine/main/fs_utils.py RENAMED Viewed

@@ -111,7 +111,14 @@ class DBFSUtils(FSUtils):
             raise ImportError(f"DBUtils not available. Are you running in Databricks? Error: {str(e)}")
     def ls(self, base_path: str) -> List[FSFileInfo]:
-        try: return self.dbutils.fs.ls(base_path)
+        try:
+            data = self.dbutils.fs.ls(base_path)
+            return [FSFileInfo(
+                path=item.path.replace("dbfs:", ""),
+                name=item.name,
+                size=item.size,
+                modificationTime=item.modificationTime
+            ) for item in data]
         except Exception as e:
             return []