rand-engine 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rand-engine
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing.
5
5
  Author: marcoaureliomenezes
6
6
  Author-email: marcoaurelioreislima@gmail.com
@@ -49,7 +49,7 @@ CoreDistinct().randint(0, 100, 10)
49
49
  To create a new release, simply create and push a git tag with semantic versioning:
50
50
 
51
51
  ```bash
52
- git tag 0.3.13 && git push origin --tags
52
+ git tag 0.4.1 && git push origin --tags
53
53
  ```
54
54
 
55
55
  The GitHub Actions workflow will automatically:
@@ -29,7 +29,7 @@ CoreDistinct().randint(0, 100, 10)
29
29
  To create a new release, simply create and push a git tag with semantic versioning:
30
30
 
31
31
  ```bash
32
- git tag 0.3.13 && git push origin --tags
32
+ git tag 0.4.1 && git push origin --tags
33
33
  ```
34
34
 
35
35
  The GitHub Actions workflow will automatically:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "rand-engine"
3
- version = "0.4.0"
3
+ version = "0.4.2"
4
4
  description = "Rand Engine v2. Package with some methods to generate random data in different formats. Great to mock data while testing or developing."
5
5
  authors = ["marcoaureliomenezes <marcoaurelioreislima@gmail.com>"]
6
6
  repository = "https://github.com/marcoaureliomenezes/rand_engine"
@@ -44,7 +44,7 @@ class FilesGenerator:
44
44
 
45
45
  def list_files(self):
46
46
  assert self.base_path, "Base path not configured. Run the method setup_output."
47
- self.fs_utils.mkdir(self.base_path)
47
+ return self.fs_utils.ls(self.base_path)
48
48
 
49
49
 
50
50
  def delete_files(self):
@@ -75,8 +75,10 @@ class FilesGenerator:
75
75
 
76
76
  class CDCGenerator(FilesGenerator):
77
77
 
78
- def __init__(self, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
78
+ def __init__(self, spark, footprint: IRandomSpec, pk_cols: List=[], fs_utils: FSUtils=DBFSUtils(), ):
79
+ self.spark = spark
79
80
  self.footprint = footprint
81
+ self.fs_utils = fs_utils
80
82
  self.pk_cols = pk_cols
81
83
  self.cdc_props = self.default_cdc_properties()
82
84
 
@@ -93,7 +95,7 @@ class CDCGenerator(FilesGenerator):
93
95
 
94
96
 
95
97
  def calculate_rows_to_change(self, sample):
96
- df = spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
98
+ df = self.spark.read.format(self.ext).load(self.base_path).filter(coalesce(*self.pk_cols).isNotNull())
97
99
  df_ids_inserted = df.select(*self.pk_cols).filter("operation = 'INSERT'").distinct()
98
100
  df_ids_deleted = df.select(*self.pk_cols).filter("operation = 'DELETE'").distinct()
99
101
  df_pks_to_change = df_ids_inserted.join(df_ids_deleted, on=self.pk_cols, how="leftanti")
@@ -101,37 +103,6 @@ class CDCGenerator(FilesGenerator):
101
103
  return df_pks_to_change
102
104
 
103
105
 
104
- def calculate_rows_to_change_pandas(self, sample):
105
- files = self.fs_utils.ls(self.base_path)
106
- # Read and concatenate all files
107
- df_list = []
108
- for file_info in files:
109
- if file_info.name.endswith(f".{self.ext}"):
110
- if self.ext == "json":
111
- df_temp = pd.read_json(file_info.path, lines=True)
112
- elif self.ext == "csv":
113
- df_temp = pd.read_csv(file_info.path)
114
- elif self.ext == "parquet":
115
- df_temp = pd.read_parquet(file_info.path)
116
- else:
117
- continue
118
- df_list.append(df_temp)
119
- if not df_list:
120
- return pd.DataFrame(columns=self.pk_cols)
121
- df = pd.concat(df_list, ignore_index=True)
122
- df = df.dropna(subset=self.pk_cols)
123
- df_ids_inserted = df[df['operation'] == 'INSERT'][self.pk_cols].drop_duplicates()
124
- df_ids_deleted = df[df['operation'] == 'DELETE'][self.pk_cols].drop_duplicates()
125
- df_pks_to_change = df_ids_inserted.merge(df_ids_deleted, on=self.pk_cols, how='left', indicator=True)
126
- # Keep only records that exist only in left (inserted but not deleted)
127
- df_pks_to_change = df_pks_to_change[df_pks_to_change['_merge'] == 'left_only']
128
- df_pks_to_change = df_pks_to_change.drop('_merge', axis=1)
129
- if sample < 1.0 and len(df_pks_to_change) > 0:
130
- df_pks_to_change = df_pks_to_change.sample(frac=sample)
131
-
132
- return df_pks_to_change
133
-
134
-
135
106
  def generate_changes(self, sample, const_cols, null_rate):
136
107
  df_pks_to_change = self.calculate_rows_to_change(sample=sample)
137
108
  metadata = self.footprint.metadata()
@@ -147,10 +118,8 @@ class CDCGenerator(FilesGenerator):
147
118
 
148
119
 
149
120
  def generate_inserts(self):
150
- operation="INSERT"
151
121
  const_cols={"operation": "INSERT", "updated_at": dt.now().strftime("%Y-%m-%dT%H:%M:%S")}
152
122
  insert_conf = self.cdc_props["INSERT"]
153
- file_path = self._get_file_path()
154
123
  rand_size = randint(insert_conf["min_size"], insert_conf["max_size"])
155
124
  self.write_file(size=rand_size, const_cols=const_cols)
156
125
 
@@ -166,7 +135,7 @@ class CDCGenerator(FilesGenerator):
166
135
  df_changes = pd.concat([df_update, df_delete], ignore_index=True)
167
136
  file_path = self._get_file_path()
168
137
  df_changes.to_json(file_path, orient="records", lines=True)
169
-
138
+ print(f"File {file_path} created with {df_changes.shape[0]} records.")
170
139
 
171
140
  def generate_cdc_stream(self, period=5, rounds=15):
172
141
  for i in range(rounds):
@@ -111,7 +111,14 @@ class DBFSUtils(FSUtils):
111
111
  raise ImportError(f"DBUtils not available. Are you running in Databricks? Error: {str(e)}")
112
112
 
113
113
  def ls(self, base_path: str) -> List[FSFileInfo]:
114
- try: return self.dbutils.fs.ls(base_path)
114
+ try:
115
+ data = self.dbutils.fs.ls(base_path)
116
+ return [FSFileInfo(
117
+ path=item.path.replace("dbfs:", ""),
118
+ name=item.name,
119
+ size=item.size,
120
+ modificationTime=item.modificationTime
121
+ ) for item in data]
115
122
  except Exception as e:
116
123
  return []
117
124