dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

ml_tools/handle_excel.py CHANGED
@@ -1,11 +1,12 @@
1
- import os
1
+ from pathlib import Path
2
2
  from openpyxl import load_workbook, Workbook
3
3
  import pandas as pd
4
- from typing import List, Optional
5
- from .utilities import _script_info, sanitize_filename
4
+ from typing import List, Optional, Union
5
+ from .utilities import _script_info, sanitize_filename, make_fullpath
6
6
 
7
7
 
8
8
  __all__ = [
9
+ "find_excel_files",
9
10
  "unmerge_and_split_excel",
10
11
  "unmerge_and_split_from_directory",
11
12
  "validate_excel_schema",
@@ -14,20 +15,55 @@ __all__ = [
14
15
  ]
15
16
 
16
17
 
17
- def unmerge_and_split_excel(filepath: str) -> None:
18
+ def find_excel_files(
19
+ directory: Union[str, Path],
20
+ *,
21
+ extensions: tuple[str, ...] = (".xlsx", ".xls"),
22
+ exclude_temp: bool = True
23
+ ) -> list[Path]:
24
+ """
25
+ Returns a list of Excel file Paths in the specified directory.
26
+
27
+ Parameters:
28
+ directory (str | Path): Directory to search.
29
+ extensions (tuple[str, ...]): Valid Excel file extensions (default: .xlsx, .xls).
30
+ exclude_temp (bool): Whether to exclude files that start with '~'.
31
+
32
+ Returns:
33
+ list[Path]: List of Excel file paths matching criteria.
34
+ """
35
+ input_path = make_fullpath(directory)
36
+
37
+ if not input_path.is_dir():
38
+ raise NotADirectoryError(f"Directory not found: {input_path}")
39
+
40
+ excel_files = [
41
+ f for f in input_path.iterdir()
42
+ if f.is_file()
43
+ and f.suffix.lower() in extensions
44
+ and (not f.name.startswith('~') if exclude_temp else True)
45
+ ]
46
+
47
+ if not excel_files:
48
+ raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
49
+
50
+ return excel_files
51
+
52
+
53
+ def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
18
54
  """
19
55
  Processes a single Excel file:
20
- - Unmerges all merged cells (vertical and horizontal),
21
- - Fills each merged region with the top-left cell value,
22
- - Splits each sheet into a separate Excel file,
56
+ - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value.
57
+ - Splits each sheet into a separate Excel file.
23
58
  - Saves all results in the same directory as the input file.
24
59
 
25
60
  Parameters:
26
- filepath (str): Full path to the Excel file to process.
61
+ filepath (str | Path): Full path to the Excel file to process.
27
62
  """
28
- wb = load_workbook(filepath)
29
- base_dir = os.path.dirname(os.path.abspath(filepath))
30
- base_name = os.path.splitext(os.path.basename(filepath))[0]
63
+ file_path = make_fullpath(filepath)
64
+ wb = load_workbook(file_path)
65
+ base_dir = file_path.parent
66
+ base_name = file_path.stem
31
67
 
32
68
  total_output_files = 0
33
69
 
@@ -56,40 +92,37 @@ def unmerge_and_split_excel(filepath: str) -> None:
56
92
  # Construct flat output file name
57
93
  sanitized_sheet_name = sanitize_filename(sheet_name)
58
94
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
59
- output_path = os.path.join(base_dir, output_filename)
95
+ output_path = base_dir / output_filename
60
96
  new_wb.save(output_path)
61
97
 
62
98
  # print(f"Saved: {output_path}")
63
99
  total_output_files += 1
64
100
 
65
- print(f"✅ Processed file: {filepath} into {total_output_files} output file(s).")
101
+ print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
66
102
  return None
67
103
 
68
104
 
69
- def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
105
+ def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Union[str,Path]) -> None:
70
106
  """
71
107
  Processes all Excel files in the input directory:
72
- - Unmerges all merged cells (vertical and horizontal),
73
- - Fills each merged region with the top-left cell value,
74
- - Splits each sheet into separate Excel files,
108
+ - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value,
109
+ - Splits each sheet into separate Excel files.
75
110
  - Saves all results into the output directory.
76
111
 
77
112
  Parameters:
78
- input_dir (str): Directory containing Excel files to process.
79
- output_dir (str): Directory to save processed Excel files.
113
+ input_dir (str | Path): Directory containing Excel files to process.
114
+ output_dir (str | Path): Directory to save processed Excel files.
80
115
  """
81
- raw_files = [f for f in os.listdir(input_dir) if f.endswith(('.xlsx', '.xls'))]
82
- excel_files = [os.path.join(input_dir, f) for f in raw_files if not f.startswith('~')]
83
-
84
- if not excel_files:
85
- raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")
116
+ global_input_path = make_fullpath(input_dir)
117
+ global_output_path = make_fullpath(output_dir, make=True)
118
+
119
+ excel_files = find_excel_files(global_input_path)
86
120
 
87
- os.makedirs(output_dir, exist_ok=True)
88
121
  total_output_files = 0
89
122
 
90
123
  for file_path in excel_files:
91
124
  wb = load_workbook(file_path)
92
- base_name = os.path.splitext(os.path.basename(file_path))[0]
125
+ base_name = file_path.stem
93
126
 
94
127
  for sheet_name in wb.sheetnames:
95
128
  ws = wb[sheet_name]
@@ -116,7 +149,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
116
149
  # Construct flat output file name
117
150
  sanitized_sheet_name = sanitize_filename(sheet_name)
118
151
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
119
- output_path = os.path.join(output_dir, output_filename)
152
+ output_path = global_output_path / output_filename
120
153
  new_wb.save(output_path)
121
154
 
122
155
  # print(f"Saved: {output_path}")
@@ -127,7 +160,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
127
160
 
128
161
 
129
162
  def validate_excel_schema(
130
- target_dir: str,
163
+ target_dir: Union[str,Path],
131
164
  expected_columns: List[str],
132
165
  strict: bool = False
133
166
  ) -> None:
@@ -135,7 +168,7 @@ def validate_excel_schema(
135
168
  Validates that each Excel file in a directory conforms to the expected column schema.
136
169
 
137
170
  Parameters:
138
- target_dir (str): Path to the directory containing Excel files.
171
+ target_dir (str | Path): Path to the directory containing Excel files.
139
172
  expected_columns (list[str]): List of expected column names.
140
173
  strict (bool): If True, columns must match exactly (names and order).
141
174
  If False, columns must contain at least all expected names.
@@ -143,52 +176,46 @@ def validate_excel_schema(
143
176
  Returns:
144
177
  List[str]: List of file paths that failed the schema validation.
145
178
  """
146
- invalid_files = []
179
+ invalid_files: list[Path] = []
147
180
  expected_set = set(expected_columns)
148
181
 
149
- excel_seen = 0
150
-
151
- for filename in os.listdir(target_dir):
152
- if not filename.lower().endswith(".xlsx"):
153
- continue # Skip non-Excel files
154
-
155
- if filename.startswith("~"): # Skip temporary files
156
- continue
157
-
158
- file_path = os.path.join(target_dir, filename)
159
- excel_seen += 1
182
+ target_path = make_fullpath(target_dir)
183
+
184
+ excel_paths = find_excel_files(target_path)
185
+
186
+ for file in excel_paths:
160
187
  try:
161
- wb = load_workbook(file_path, read_only=True)
188
+ wb = load_workbook(file, read_only=True)
162
189
  ws = wb.active # Only check the first worksheet
163
190
 
164
191
  header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
165
192
 
166
193
  if strict:
167
194
  if header != expected_columns:
168
- invalid_files.append(file_path)
195
+ invalid_files.append(file)
169
196
  else:
170
197
  header_set = set(header)
171
198
  if not expected_set.issubset(header_set):
172
- invalid_files.append(file_path)
199
+ invalid_files.append(file)
173
200
 
174
201
  except Exception as e:
175
- print(f"Error processing '{file_path}': {e}")
176
- invalid_files.append(file_path)
202
+ print(f"Error processing '{file}': {e}")
203
+ invalid_files.append(file)
177
204
 
178
- valid_excel_number = excel_seen - len(invalid_files)
179
- print(f"{valid_excel_number} out of {excel_seen} excel files conform to the schema.")
205
+ valid_excel_number = len(excel_paths) - len(invalid_files)
206
+ print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
180
207
  if invalid_files:
181
208
  print(f"⚠️ {len(invalid_files)} excel files are invalid:")
182
- for file in invalid_files:
183
- print(f" - {file}")
209
+ for in_file in invalid_files:
210
+ print(f" - {in_file.name}")
184
211
 
185
212
  return None
186
213
 
187
214
 
188
215
  def vertical_merge_transform_excel(
189
- target_dir: str,
216
+ target_dir: Union[str,Path],
190
217
  csv_filename: str,
191
- output_dir: str,
218
+ output_dir: Union[str,Path],
192
219
  target_columns: Optional[List[str]] = None,
193
220
  rename_columns: Optional[List[str]] = None
194
221
  ) -> None:
@@ -201,35 +228,31 @@ def vertical_merge_transform_excel(
201
228
  - If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.
202
229
 
203
230
  Parameters:
204
- target_dir (str): Directory containing Excel files.
231
+ target_dir (str | Path): Directory containing Excel files.
205
232
  csv_filename (str): Output CSV filename.
206
- output_dir (str): Directory to save the output CSV file.
233
+ output_dir (str | Path): Directory to save the output CSV file.
207
234
  target_columns (list[str] | None): Columns to select from each Excel file.
208
235
  rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
209
236
  """
210
- raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
211
- excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
212
-
213
- if not excel_files:
214
- raise ValueError("No Excel files found in the target directory.")
237
+ target_path = make_fullpath(target_dir)
238
+ excel_files = find_excel_files(target_path)
215
239
 
216
240
  # sanitize filename
217
241
  csv_filename = sanitize_filename(csv_filename)
218
- # make directory
219
- os.makedirs(output_dir, exist_ok=True)
242
+ # make output directory
243
+ output_path = make_fullpath(output_dir, make=True)
220
244
 
221
245
  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
222
- csv_path = os.path.join(output_dir, csv_filename)
246
+ csv_path = output_path / csv_filename
223
247
 
224
248
  dataframes = []
225
249
  for file in excel_files:
226
- file_path = os.path.join(target_dir, file)
227
- df = pd.read_excel(file_path, engine='openpyxl')
250
+ df = pd.read_excel(file, engine='openpyxl')
228
251
 
229
252
  if target_columns is not None:
230
253
  missing = [col for col in target_columns if col not in df.columns]
231
254
  if missing:
232
- raise ValueError(f"Missing columns in {file}: {missing}")
255
+ raise ValueError(f"Invalid columns in {file.name}: {missing}")
233
256
  df = df[target_columns]
234
257
 
235
258
  dataframes.append(df)
@@ -239,7 +262,7 @@ def vertical_merge_transform_excel(
239
262
  if rename_columns is not None:
240
263
  expected_len = len(target_columns if target_columns is not None else merged_df.columns)
241
264
  if len(rename_columns) != expected_len:
242
- raise ValueError("Length of rename_columns must match the selected columns")
265
+ raise ValueError("Length of 'rename_columns' must match the selected columns")
243
266
  merged_df.columns = rename_columns
244
267
 
245
268
  merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -247,9 +270,9 @@ def vertical_merge_transform_excel(
247
270
 
248
271
 
249
272
  def horizontal_merge_transform_excel(
250
- target_dir: str,
273
+ target_dir: Union[str,Path],
251
274
  csv_filename: str,
252
- output_dir: str,
275
+ output_dir: Union[str,Path],
253
276
  drop_columns: Optional[list[str]] = None,
254
277
  skip_duplicates: bool = False
255
278
  ) -> None:
@@ -265,31 +288,28 @@ def horizontal_merge_transform_excel(
265
288
  If True, only the first occurrence of each column name is kept.
266
289
 
267
290
  Parameters:
268
- target_dir (str): Directory containing Excel files.
291
+ target_dir (str | Path): Directory containing Excel files.
269
292
  csv_filename (str): Name of the output CSV file.
270
- output_dir (str): Directory to save the output CSV file.
293
+ output_dir (str | Path): Directory to save the output CSV file.
271
294
  drop_columns (list[str] | None): Columns to exclude from each file before merging.
272
295
  skip_duplicates (bool): Whether to skip duplicate columns or rename them.
273
296
  """
274
- raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
275
- excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
276
- if not excel_files:
277
- raise ValueError("No Excel files found in the target directory.")
297
+ target_path = make_fullpath(target_dir)
298
+ excel_files = find_excel_files(target_path)
278
299
 
279
300
  # sanitize filename
280
301
  csv_filename = sanitize_filename(csv_filename)
281
302
  # make directory
282
- os.makedirs(output_dir, exist_ok=True)
303
+ output_path = make_fullpath(output_dir, make=True)
283
304
 
284
305
  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
285
- csv_path = os.path.join(output_dir, csv_filename)
306
+ csv_path = output_path / csv_filename
286
307
 
287
308
  dataframes = []
288
309
  max_rows = 0
289
310
 
290
311
  for file in excel_files:
291
- file_path = os.path.join(target_dir, file)
292
- df = pd.read_excel(file_path, engine='openpyxl')
312
+ df = pd.read_excel(file, engine='openpyxl')
293
313
 
294
314
  if drop_columns is not None:
295
315
  df = df.drop(columns=[col for col in drop_columns if col in df.columns])
ml_tools/logger.py CHANGED
@@ -1,11 +1,11 @@
1
- import os
1
+ from pathlib import Path
2
2
  from datetime import datetime
3
3
  from typing import Union, List, Dict, Any
4
4
  import pandas as pd
5
5
  from openpyxl.styles import Font, PatternFill
6
6
  import traceback
7
7
  import json
8
- from .utilities import sanitize_filename, _script_info
8
+ from .utilities import sanitize_filename, _script_info, make_fullpath
9
9
 
10
10
 
11
11
  __all__ = [
@@ -21,7 +21,7 @@ def custom_logger(
21
21
  str,
22
22
  BaseException
23
23
  ],
24
- save_directory: str,
24
+ save_directory: Union[str, Path],
25
25
  log_name: str,
26
26
  ) -> None:
27
27
  """
@@ -54,10 +54,12 @@ def custom_logger(
54
54
  ValueError: If the data type is unsupported.
55
55
  """
56
56
  try:
57
- os.makedirs(save_directory, exist_ok=True)
57
+ save_path = make_fullpath(save_directory, make=True)
58
+
58
59
  timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
59
60
  log_name = sanitize_filename(log_name)
60
- base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
61
+
62
+ base_path = save_path / f"{log_name}_{timestamp}"
61
63
 
62
64
  if isinstance(data, list):
63
65
  _log_list_to_txt(data, base_path + ".txt")
@@ -86,7 +88,7 @@ def custom_logger(
86
88
  print(f"Error in custom_logger: {e}")
87
89
 
88
90
 
89
- def _log_list_to_txt(data: List[Any], path: str) -> None:
91
+ def _log_list_to_txt(data: List[Any], path: Path) -> None:
90
92
  log_lines = []
91
93
  for item in data:
92
94
  try:
@@ -98,7 +100,7 @@ def _log_list_to_txt(data: List[Any], path: str) -> None:
98
100
  f.write('\n'.join(log_lines))
99
101
 
100
102
 
101
- def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
103
+ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
102
104
  sanitized_dict = {}
103
105
  max_length = max(len(v) for v in data.values()) if data else 0
104
106
 
@@ -113,7 +115,7 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
113
115
  df.to_csv(path, index=False)
114
116
 
115
117
 
116
- def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
118
+ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: Path) -> None:
117
119
  writer = pd.ExcelWriter(path, engine='openpyxl')
118
120
  data.to_excel(writer, index=True, sheet_name='Data')
119
121
 
@@ -134,18 +136,18 @@ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
134
136
  writer.close()
135
137
 
136
138
 
137
- def _log_string_to_log(data: str, path: str) -> None:
139
+ def _log_string_to_log(data: str, path: Path) -> None:
138
140
  with open(path, 'w', encoding='utf-8') as f:
139
141
  f.write(data.strip() + '\n')
140
142
 
141
143
 
142
- def _log_exception_to_log(exc: BaseException, path: str) -> None:
144
+ def _log_exception_to_log(exc: BaseException, path: Path) -> None:
143
145
  with open(path, 'w', encoding='utf-8') as f:
144
146
  f.write("Exception occurred:\n")
145
147
  traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)
146
148
 
147
149
 
148
- def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
150
+ def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
149
151
  with open(path, 'w', encoding='utf-8') as f:
150
152
  json.dump(data, f, indent=4, ensure_ascii=False)
151
153