rand-engine 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rand-engine
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
5
5
  Author: marcoaureliomenezes
6
6
  Author-email: marcoaurelioreislima@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "rand-engine"
3
- version = "0.4.1"
3
+ version = "0.4.2"
4
4
  description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
5
5
  authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
6
6
  repository = "https://github.com/marcoaureliomenezes/rand_engine"
@@ -75,7 +75,8 @@ class FilesGenerator:
75
75
 
76
76
  class CDCGenerator(FilesGenerator):
77
77
 
78
- def __init__(self, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
78
+ def __init__(self, spark, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
79
+ self.spark = spark
79
80
  self.footprint = footprint
80
81
  self.fs_utils = fs_utils
81
82
  self.pk_cols = pk_cols
@@ -94,7 +95,7 @@ class CDCGenerator(FilesGenerator):
94
95
 
95
96
 
96
97
  def calculate_rows_to_change(self, sample):
97
- df = spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
98
+ df = self.spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
98
99
  df_ids_inserted = df.select(*self.pk_cols).filter("operation = 'INSERT'").distinct()
99
100
  df_ids_deleted = df.select(*self.pk_cols).filter("operation = 'DELETE'").distinct()
100
101
  df_pks_to_change = df_ids_inserted.join(df_ids_deleted, on=self.pk_cols, how="leftanti")
@@ -102,40 +103,8 @@ class CDCGenerator(FilesGenerator):
102
103
  return df_pks_to_change
103
104
 
104
105
 
105
- def calculate_rows_to_change_pandas(self, sample):
106
- files = self.fs_utils.ls(self.base_path)
107
- # Read and concatenate all files
108
- df_list = []
109
- for file_info in files:
110
- print(f"Reading file: {file_info.path}")
111
- if file_info.name.endswith(f".{self.ext}"):
112
- if self.ext == "json":
113
- df_temp = pd.read_json(file_info.path, lines=True)
114
- elif self.ext == "csv":
115
- df_temp = pd.read_csv(file_info.path)
116
- elif self.ext == "parquet":
117
- df_temp = pd.read_parquet(file_info.path)
118
- else:
119
- continue
120
- df_list.append(df_temp)
121
- if not df_list:
122
- return pd.DataFrame(columns=self.pk_cols)
123
- df = pd.concat(df_list, ignore_index=True)
124
- df = df.dropna(subset=self.pk_cols)
125
- df_ids_inserted = df[df['operation'] == 'INSERT'][self.pk_cols].drop_duplicates()
126
- df_ids_deleted = df[df['operation'] == 'DELETE'][self.pk_cols].drop_duplicates()
127
- df_pks_to_change = df_ids_inserted.merge(df_ids_deleted, on=self.pk_cols, how='left', indicator=True)
128
- # Keep only records that exist only in left (inserted but not deleted)
129
- df_pks_to_change = df_pks_to_change[df_pks_to_change['_merge'] == 'left_only']
130
- df_pks_to_change = df_pks_to_change.drop('_merge', axis=1)
131
- if sample < 1.0 and len(df_pks_to_change) > 0:
132
- df_pks_to_change = df_pks_to_change.sample(frac=sample)
133
-
134
- return df_pks_to_change
135
-
136
-
137
106
  def generate_changes(self, sample, const_cols, null_rate):
138
- df_pks_to_change = self.calculate_rows_to_change_pandas(sample=sample)
107
+ df_pks_to_change = self.calculate_rows_to_change(sample=sample)
139
108
  metadata = self.footprint.metadata()
140
109
  size = df_pks_to_change.shape[0]
141
110
  transformer = self.footprint.transformer_cdc_update(null_rate=null_rate, **const_cols)
@@ -166,7 +135,7 @@ class CDCGenerator(FilesGenerator):
166
135
  df_changes = pd.concat([df_update, df_delete], ignore_index=True)
167
136
  file_path = self._get_file_path()
168
137
  df_changes.to_json(file_path, orient="records", lines=True)
169
-
138
+ print(f"File {file_path} created with {df_changes.shape[0]} records.")
170
139
 
171
140
  def generate_cdc_stream(self, period=5, rounds=15):
172
141
  for i in range(rounds):
File without changes