dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/METADATA +24 -14
- dragon_ml_toolbox-2.1.0.dist-info/RECORD +20 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +5 -4
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +490 -0
- ml_tools/VIF_factor.py +20 -17
- ml_tools/{particle_swarm_optimization.py → _particle_swarm_optimization.py} +5 -0
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +134 -46
- dragon_ml_toolbox-1.4.8.dist-info/RECORD +0 -19
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/top_level.txt +0 -0
ml_tools/handle_excel.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import os
|
|
1
|
+
from pathlib import Path
|
|
2
2
|
from openpyxl import load_workbook, Workbook
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from typing import List, Optional
|
|
5
|
-
from .utilities import _script_info, sanitize_filename
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
from .utilities import _script_info, sanitize_filename, make_fullpath
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
__all__ = [
|
|
9
|
+
"find_excel_files",
|
|
9
10
|
"unmerge_and_split_excel",
|
|
10
11
|
"unmerge_and_split_from_directory",
|
|
11
12
|
"validate_excel_schema",
|
|
@@ -14,20 +15,55 @@ __all__ = [
|
|
|
14
15
|
]
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
def unmerge_and_split_excel(filepath: str) -> None:
|
|
18
|
+
def find_excel_files(
|
|
19
|
+
directory: Union[str, Path],
|
|
20
|
+
*,
|
|
21
|
+
extensions: tuple[str, ...] = (".xlsx", ".xls"),
|
|
22
|
+
exclude_temp: bool = True
|
|
23
|
+
) -> list[Path]:
|
|
24
|
+
"""
|
|
25
|
+
Returns a list of Excel file Paths in the specified directory.
|
|
26
|
+
|
|
27
|
+
Parameters:
|
|
28
|
+
directory (str | Path): Directory to search.
|
|
29
|
+
extensions (tuple[str, ...]): Valid Excel file extensions (default: .xlsx, .xls).
|
|
30
|
+
exclude_temp (bool): Whether to exclude files that start with '~'.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
list[Path]: List of Excel file paths matching criteria.
|
|
34
|
+
"""
|
|
35
|
+
input_path = make_fullpath(directory)
|
|
36
|
+
|
|
37
|
+
if not input_path.is_dir():
|
|
38
|
+
raise NotADirectoryError(f"Directory not found: {input_path}")
|
|
39
|
+
|
|
40
|
+
excel_files = [
|
|
41
|
+
f for f in input_path.iterdir()
|
|
42
|
+
if f.is_file()
|
|
43
|
+
and f.suffix.lower() in extensions
|
|
44
|
+
and (not f.name.startswith('~') if exclude_temp else True)
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
if not excel_files:
|
|
48
|
+
raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
|
|
49
|
+
|
|
50
|
+
return excel_files
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
|
|
18
54
|
"""
|
|
19
55
|
Processes a single Excel file:
|
|
20
|
-
- Unmerges all merged cells (vertical and horizontal),
|
|
21
|
-
-
|
|
22
|
-
- Splits each sheet into a separate Excel file,
|
|
56
|
+
- Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value.
|
|
57
|
+
- Splits each sheet into a separate Excel file.
|
|
23
58
|
- Saves all results in the same directory as the input file.
|
|
24
59
|
|
|
25
60
|
Parameters:
|
|
26
|
-
filepath (str): Full path to the Excel file to process.
|
|
61
|
+
filepath (str | Path): Full path to the Excel file to process.
|
|
27
62
|
"""
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
63
|
+
file_path = make_fullpath(filepath)
|
|
64
|
+
wb = load_workbook(file_path)
|
|
65
|
+
base_dir = file_path.parent
|
|
66
|
+
base_name = file_path.stem
|
|
31
67
|
|
|
32
68
|
total_output_files = 0
|
|
33
69
|
|
|
@@ -56,40 +92,37 @@ def unmerge_and_split_excel(filepath: str) -> None:
|
|
|
56
92
|
# Construct flat output file name
|
|
57
93
|
sanitized_sheet_name = sanitize_filename(sheet_name)
|
|
58
94
|
output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
|
|
59
|
-
output_path =
|
|
95
|
+
output_path = base_dir / output_filename
|
|
60
96
|
new_wb.save(output_path)
|
|
61
97
|
|
|
62
98
|
# print(f"Saved: {output_path}")
|
|
63
99
|
total_output_files += 1
|
|
64
100
|
|
|
65
|
-
print(f"✅ Processed file: {
|
|
101
|
+
print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
|
|
66
102
|
return None
|
|
67
103
|
|
|
68
104
|
|
|
69
|
-
def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
|
|
105
|
+
def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Union[str,Path]) -> None:
|
|
70
106
|
"""
|
|
71
107
|
Processes all Excel files in the input directory:
|
|
72
|
-
- Unmerges all merged cells (vertical and horizontal),
|
|
73
|
-
-
|
|
74
|
-
- Splits each sheet into separate Excel files,
|
|
108
|
+
- Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value,
|
|
109
|
+
- Splits each sheet into separate Excel files.
|
|
75
110
|
- Saves all results into the output directory.
|
|
76
111
|
|
|
77
112
|
Parameters:
|
|
78
|
-
input_dir (str): Directory containing Excel files to process.
|
|
79
|
-
output_dir (str): Directory to save processed Excel files.
|
|
113
|
+
input_dir (str | Path): Directory containing Excel files to process.
|
|
114
|
+
output_dir (str | Path): Directory to save processed Excel files.
|
|
80
115
|
"""
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")
|
|
116
|
+
global_input_path = make_fullpath(input_dir)
|
|
117
|
+
global_output_path = make_fullpath(output_dir, make=True)
|
|
118
|
+
|
|
119
|
+
excel_files = find_excel_files(global_input_path)
|
|
86
120
|
|
|
87
|
-
os.makedirs(output_dir, exist_ok=True)
|
|
88
121
|
total_output_files = 0
|
|
89
122
|
|
|
90
123
|
for file_path in excel_files:
|
|
91
124
|
wb = load_workbook(file_path)
|
|
92
|
-
base_name =
|
|
125
|
+
base_name = file_path.stem
|
|
93
126
|
|
|
94
127
|
for sheet_name in wb.sheetnames:
|
|
95
128
|
ws = wb[sheet_name]
|
|
@@ -116,7 +149,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
|
|
|
116
149
|
# Construct flat output file name
|
|
117
150
|
sanitized_sheet_name = sanitize_filename(sheet_name)
|
|
118
151
|
output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
|
|
119
|
-
output_path =
|
|
152
|
+
output_path = global_output_path / output_filename
|
|
120
153
|
new_wb.save(output_path)
|
|
121
154
|
|
|
122
155
|
# print(f"Saved: {output_path}")
|
|
@@ -127,7 +160,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
|
|
|
127
160
|
|
|
128
161
|
|
|
129
162
|
def validate_excel_schema(
|
|
130
|
-
target_dir: str,
|
|
163
|
+
target_dir: Union[str,Path],
|
|
131
164
|
expected_columns: List[str],
|
|
132
165
|
strict: bool = False
|
|
133
166
|
) -> None:
|
|
@@ -135,7 +168,7 @@ def validate_excel_schema(
|
|
|
135
168
|
Validates that each Excel file in a directory conforms to the expected column schema.
|
|
136
169
|
|
|
137
170
|
Parameters:
|
|
138
|
-
target_dir (str): Path to the directory containing Excel files.
|
|
171
|
+
target_dir (str | Path): Path to the directory containing Excel files.
|
|
139
172
|
expected_columns (list[str]): List of expected column names.
|
|
140
173
|
strict (bool): If True, columns must match exactly (names and order).
|
|
141
174
|
If False, columns must contain at least all expected names.
|
|
@@ -143,52 +176,46 @@ def validate_excel_schema(
|
|
|
143
176
|
Returns:
|
|
144
177
|
List[str]: List of file paths that failed the schema validation.
|
|
145
178
|
"""
|
|
146
|
-
invalid_files = []
|
|
179
|
+
invalid_files: list[Path] = []
|
|
147
180
|
expected_set = set(expected_columns)
|
|
148
181
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
if filename.startswith("~"): # Skip temporary files
|
|
156
|
-
continue
|
|
157
|
-
|
|
158
|
-
file_path = os.path.join(target_dir, filename)
|
|
159
|
-
excel_seen += 1
|
|
182
|
+
target_path = make_fullpath(target_dir)
|
|
183
|
+
|
|
184
|
+
excel_paths = find_excel_files(target_path)
|
|
185
|
+
|
|
186
|
+
for file in excel_paths:
|
|
160
187
|
try:
|
|
161
|
-
wb = load_workbook(
|
|
188
|
+
wb = load_workbook(file, read_only=True)
|
|
162
189
|
ws = wb.active # Only check the first worksheet
|
|
163
190
|
|
|
164
191
|
header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
|
|
165
192
|
|
|
166
193
|
if strict:
|
|
167
194
|
if header != expected_columns:
|
|
168
|
-
invalid_files.append(
|
|
195
|
+
invalid_files.append(file)
|
|
169
196
|
else:
|
|
170
197
|
header_set = set(header)
|
|
171
198
|
if not expected_set.issubset(header_set):
|
|
172
|
-
invalid_files.append(
|
|
199
|
+
invalid_files.append(file)
|
|
173
200
|
|
|
174
201
|
except Exception as e:
|
|
175
|
-
print(f"Error processing '{
|
|
176
|
-
invalid_files.append(
|
|
202
|
+
print(f"Error processing '{file}': {e}")
|
|
203
|
+
invalid_files.append(file)
|
|
177
204
|
|
|
178
|
-
valid_excel_number =
|
|
179
|
-
print(f"{valid_excel_number} out of {
|
|
205
|
+
valid_excel_number = len(excel_paths) - len(invalid_files)
|
|
206
|
+
print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
|
|
180
207
|
if invalid_files:
|
|
181
208
|
print(f"⚠️ {len(invalid_files)} excel files are invalid:")
|
|
182
|
-
for
|
|
183
|
-
print(f" - {
|
|
209
|
+
for in_file in invalid_files:
|
|
210
|
+
print(f" - {in_file.name}")
|
|
184
211
|
|
|
185
212
|
return None
|
|
186
213
|
|
|
187
214
|
|
|
188
215
|
def vertical_merge_transform_excel(
|
|
189
|
-
target_dir: str,
|
|
216
|
+
target_dir: Union[str,Path],
|
|
190
217
|
csv_filename: str,
|
|
191
|
-
output_dir: str,
|
|
218
|
+
output_dir: Union[str,Path],
|
|
192
219
|
target_columns: Optional[List[str]] = None,
|
|
193
220
|
rename_columns: Optional[List[str]] = None
|
|
194
221
|
) -> None:
|
|
@@ -201,35 +228,31 @@ def vertical_merge_transform_excel(
|
|
|
201
228
|
- If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.
|
|
202
229
|
|
|
203
230
|
Parameters:
|
|
204
|
-
target_dir (str): Directory containing Excel files.
|
|
231
|
+
target_dir (str | Path): Directory containing Excel files.
|
|
205
232
|
csv_filename (str): Output CSV filename.
|
|
206
|
-
output_dir (str): Directory to save the output CSV file.
|
|
233
|
+
output_dir (str | Path): Directory to save the output CSV file.
|
|
207
234
|
target_columns (list[str] | None): Columns to select from each Excel file.
|
|
208
235
|
rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
|
|
209
236
|
"""
|
|
210
|
-
|
|
211
|
-
excel_files =
|
|
212
|
-
|
|
213
|
-
if not excel_files:
|
|
214
|
-
raise ValueError("No Excel files found in the target directory.")
|
|
237
|
+
target_path = make_fullpath(target_dir)
|
|
238
|
+
excel_files = find_excel_files(target_path)
|
|
215
239
|
|
|
216
240
|
# sanitize filename
|
|
217
241
|
csv_filename = sanitize_filename(csv_filename)
|
|
218
|
-
# make directory
|
|
219
|
-
|
|
242
|
+
# make output directory
|
|
243
|
+
output_path = make_fullpath(output_dir, make=True)
|
|
220
244
|
|
|
221
245
|
csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
|
|
222
|
-
csv_path =
|
|
246
|
+
csv_path = output_path / csv_filename
|
|
223
247
|
|
|
224
248
|
dataframes = []
|
|
225
249
|
for file in excel_files:
|
|
226
|
-
|
|
227
|
-
df = pd.read_excel(file_path, engine='openpyxl')
|
|
250
|
+
df = pd.read_excel(file, engine='openpyxl')
|
|
228
251
|
|
|
229
252
|
if target_columns is not None:
|
|
230
253
|
missing = [col for col in target_columns if col not in df.columns]
|
|
231
254
|
if missing:
|
|
232
|
-
raise ValueError(f"
|
|
255
|
+
raise ValueError(f"Invalid columns in {file.name}: {missing}")
|
|
233
256
|
df = df[target_columns]
|
|
234
257
|
|
|
235
258
|
dataframes.append(df)
|
|
@@ -239,7 +262,7 @@ def vertical_merge_transform_excel(
|
|
|
239
262
|
if rename_columns is not None:
|
|
240
263
|
expected_len = len(target_columns if target_columns is not None else merged_df.columns)
|
|
241
264
|
if len(rename_columns) != expected_len:
|
|
242
|
-
raise ValueError("Length of rename_columns must match the selected columns")
|
|
265
|
+
raise ValueError("Length of 'rename_columns' must match the selected columns")
|
|
243
266
|
merged_df.columns = rename_columns
|
|
244
267
|
|
|
245
268
|
merged_df.to_csv(csv_path, index=False, encoding='utf-8')
|
|
@@ -247,9 +270,9 @@ def vertical_merge_transform_excel(
|
|
|
247
270
|
|
|
248
271
|
|
|
249
272
|
def horizontal_merge_transform_excel(
|
|
250
|
-
target_dir: str,
|
|
273
|
+
target_dir: Union[str,Path],
|
|
251
274
|
csv_filename: str,
|
|
252
|
-
output_dir: str,
|
|
275
|
+
output_dir: Union[str,Path],
|
|
253
276
|
drop_columns: Optional[list[str]] = None,
|
|
254
277
|
skip_duplicates: bool = False
|
|
255
278
|
) -> None:
|
|
@@ -265,31 +288,28 @@ def horizontal_merge_transform_excel(
|
|
|
265
288
|
If True, only the first occurrence of each column name is kept.
|
|
266
289
|
|
|
267
290
|
Parameters:
|
|
268
|
-
target_dir (str): Directory containing Excel files.
|
|
291
|
+
target_dir (str | Path): Directory containing Excel files.
|
|
269
292
|
csv_filename (str): Name of the output CSV file.
|
|
270
|
-
output_dir (str): Directory to save the output CSV file.
|
|
293
|
+
output_dir (str | Path): Directory to save the output CSV file.
|
|
271
294
|
drop_columns (list[str] | None): Columns to exclude from each file before merging.
|
|
272
295
|
skip_duplicates (bool): Whether to skip duplicate columns or rename them.
|
|
273
296
|
"""
|
|
274
|
-
|
|
275
|
-
excel_files =
|
|
276
|
-
if not excel_files:
|
|
277
|
-
raise ValueError("No Excel files found in the target directory.")
|
|
297
|
+
target_path = make_fullpath(target_dir)
|
|
298
|
+
excel_files = find_excel_files(target_path)
|
|
278
299
|
|
|
279
300
|
# sanitize filename
|
|
280
301
|
csv_filename = sanitize_filename(csv_filename)
|
|
281
302
|
# make directory
|
|
282
|
-
|
|
303
|
+
output_path = make_fullpath(output_dir, make=True)
|
|
283
304
|
|
|
284
305
|
csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
|
|
285
|
-
csv_path =
|
|
306
|
+
csv_path = output_path / csv_filename
|
|
286
307
|
|
|
287
308
|
dataframes = []
|
|
288
309
|
max_rows = 0
|
|
289
310
|
|
|
290
311
|
for file in excel_files:
|
|
291
|
-
|
|
292
|
-
df = pd.read_excel(file_path, engine='openpyxl')
|
|
312
|
+
df = pd.read_excel(file, engine='openpyxl')
|
|
293
313
|
|
|
294
314
|
if drop_columns is not None:
|
|
295
315
|
df = df.drop(columns=[col for col in drop_columns if col in df.columns])
|
ml_tools/logger.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import os
|
|
1
|
+
from pathlib import Path
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from typing import Union, List, Dict, Any
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from openpyxl.styles import Font, PatternFill
|
|
6
6
|
import traceback
|
|
7
7
|
import json
|
|
8
|
-
from .utilities import sanitize_filename, _script_info
|
|
8
|
+
from .utilities import sanitize_filename, _script_info, make_fullpath
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
__all__ = [
|
|
@@ -21,7 +21,7 @@ def custom_logger(
|
|
|
21
21
|
str,
|
|
22
22
|
BaseException
|
|
23
23
|
],
|
|
24
|
-
save_directory: str,
|
|
24
|
+
save_directory: Union[str, Path],
|
|
25
25
|
log_name: str,
|
|
26
26
|
) -> None:
|
|
27
27
|
"""
|
|
@@ -54,10 +54,12 @@ def custom_logger(
|
|
|
54
54
|
ValueError: If the data type is unsupported.
|
|
55
55
|
"""
|
|
56
56
|
try:
|
|
57
|
-
|
|
57
|
+
save_path = make_fullpath(save_directory, make=True)
|
|
58
|
+
|
|
58
59
|
timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
|
|
59
60
|
log_name = sanitize_filename(log_name)
|
|
60
|
-
|
|
61
|
+
|
|
62
|
+
base_path = save_path / f"{log_name}_{timestamp}"
|
|
61
63
|
|
|
62
64
|
if isinstance(data, list):
|
|
63
65
|
_log_list_to_txt(data, base_path + ".txt")
|
|
@@ -86,7 +88,7 @@ def custom_logger(
|
|
|
86
88
|
print(f"Error in custom_logger: {e}")
|
|
87
89
|
|
|
88
90
|
|
|
89
|
-
def _log_list_to_txt(data: List[Any], path: str) -> None:
|
|
91
|
+
def _log_list_to_txt(data: List[Any], path: Path) -> None:
|
|
90
92
|
log_lines = []
|
|
91
93
|
for item in data:
|
|
92
94
|
try:
|
|
@@ -98,7 +100,7 @@ def _log_list_to_txt(data: List[Any], path: str) -> None:
|
|
|
98
100
|
f.write('\n'.join(log_lines))
|
|
99
101
|
|
|
100
102
|
|
|
101
|
-
def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
|
|
103
|
+
def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
|
|
102
104
|
sanitized_dict = {}
|
|
103
105
|
max_length = max(len(v) for v in data.values()) if data else 0
|
|
104
106
|
|
|
@@ -113,7 +115,7 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
|
|
|
113
115
|
df.to_csv(path, index=False)
|
|
114
116
|
|
|
115
117
|
|
|
116
|
-
def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
|
|
118
|
+
def _log_dataframe_to_xlsx(data: pd.DataFrame, path: Path) -> None:
|
|
117
119
|
writer = pd.ExcelWriter(path, engine='openpyxl')
|
|
118
120
|
data.to_excel(writer, index=True, sheet_name='Data')
|
|
119
121
|
|
|
@@ -134,18 +136,18 @@ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
|
|
|
134
136
|
writer.close()
|
|
135
137
|
|
|
136
138
|
|
|
137
|
-
def _log_string_to_log(data: str, path: str) -> None:
|
|
139
|
+
def _log_string_to_log(data: str, path: Path) -> None:
|
|
138
140
|
with open(path, 'w', encoding='utf-8') as f:
|
|
139
141
|
f.write(data.strip() + '\n')
|
|
140
142
|
|
|
141
143
|
|
|
142
|
-
def _log_exception_to_log(exc: BaseException, path: str) -> None:
|
|
144
|
+
def _log_exception_to_log(exc: BaseException, path: Path) -> None:
|
|
143
145
|
with open(path, 'w', encoding='utf-8') as f:
|
|
144
146
|
f.write("Exception occurred:\n")
|
|
145
147
|
traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)
|
|
146
148
|
|
|
147
149
|
|
|
148
|
-
def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
|
|
150
|
+
def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
|
|
149
151
|
with open(path, 'w', encoding='utf-8') as f:
|
|
150
152
|
json.dump(data, f, indent=4, ensure_ascii=False)
|
|
151
153
|
|