dragon-ml-toolbox 9.0.0__py3-none-any.whl → 9.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/RECORD +8 -8
- ml_tools/ETL_engineering.py +2 -2
- ml_tools/handle_excel.py +26 -12
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-9.0.0.dist-info → dragon_ml_toolbox-9.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dragon_ml_toolbox-9.
|
|
2
|
-
dragon_ml_toolbox-9.
|
|
3
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
1
|
+
dragon_ml_toolbox-9.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-9.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
|
+
ml_tools/ETL_engineering.py,sha256=c-YyhfuNglUhDBbTN71-iHozWL7Y9E8KqNNS5hK1nA4,44883
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
|
|
@@ -24,12 +24,12 @@ ml_tools/data_exploration.py,sha256=hKA_3U-piJ8TtDWhzX_T2Awkg-25e0DC5E8qloqPo6w,
|
|
|
24
24
|
ml_tools/ensemble_evaluation.py,sha256=xMEMfXJ5MjTkTfr1LkFOeD7iUtnVDCW3S9lm3zT-6tY,24778
|
|
25
25
|
ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
|
|
26
26
|
ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
|
|
27
|
-
ml_tools/handle_excel.py,sha256=
|
|
27
|
+
ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
|
|
28
28
|
ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
|
|
29
29
|
ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
|
|
30
30
|
ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
|
|
31
31
|
ml_tools/utilities.py,sha256=zzfYR7SUSb2rZILTNoCjl_pfLlPdHf4263atXuEb3iE,19341
|
|
32
|
-
dragon_ml_toolbox-9.
|
|
33
|
-
dragon_ml_toolbox-9.
|
|
34
|
-
dragon_ml_toolbox-9.
|
|
35
|
-
dragon_ml_toolbox-9.
|
|
32
|
+
dragon_ml_toolbox-9.1.0.dist-info/METADATA,sha256=NMMsbllyABDp8fVflbBWJ9aIQ0KemBI-3hBlj5JhE2E,6941
|
|
33
|
+
dragon_ml_toolbox-9.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
34
|
+
dragon_ml_toolbox-9.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
35
|
+
dragon_ml_toolbox-9.1.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -62,7 +62,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
|
|
|
62
62
|
|
|
63
63
|
# --- 3. Process Each Column ---
|
|
64
64
|
for i, column_name in enumerate(df.columns):
|
|
65
|
-
_LOGGER.info(f"Processing column: '{column_name}'...")
|
|
65
|
+
# _LOGGER.info(f"Processing column: '{column_name}'...")
|
|
66
66
|
|
|
67
67
|
# --- Get unique values AS IS ---
|
|
68
68
|
try:
|
|
@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
|
|
|
96
96
|
except IOError:
|
|
97
97
|
_LOGGER.exception(f"Error writing to file {file_path}.")
|
|
98
98
|
else:
|
|
99
|
-
_LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values
|
|
99
|
+
_LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
|
|
100
100
|
|
|
101
101
|
_LOGGER.info("Process complete.")
|
|
102
102
|
|
ml_tools/handle_excel.py
CHANGED
|
@@ -167,49 +167,63 @@ def validate_excel_schema(
|
|
|
167
167
|
strict: bool = False
|
|
168
168
|
) -> None:
|
|
169
169
|
"""
|
|
170
|
-
Validates that each Excel file in a directory conforms to the expected column schema.
|
|
170
|
+
Validates that each Excel file in a directory conforms to the expected column schema. Only the first worksheet of each file is analyzed.
|
|
171
171
|
|
|
172
172
|
Parameters:
|
|
173
173
|
target_dir (str | Path): Path to the directory containing Excel files.
|
|
174
174
|
expected_columns (list[str]): List of expected column names.
|
|
175
175
|
strict (bool): If True, columns must match exactly (names and order).
|
|
176
176
|
If False, columns must contain at least all expected names.
|
|
177
|
-
|
|
178
|
-
Returns:
|
|
179
|
-
List[str]: List of file paths that failed the schema validation.
|
|
180
177
|
"""
|
|
181
|
-
invalid_files:
|
|
178
|
+
invalid_files: dict[str, str] = {}
|
|
182
179
|
expected_set = set(expected_columns)
|
|
183
180
|
|
|
184
181
|
target_path = make_fullpath(target_dir)
|
|
185
|
-
|
|
186
182
|
excel_paths = find_excel_files(target_path)
|
|
187
183
|
|
|
188
184
|
for file in excel_paths:
|
|
189
185
|
try:
|
|
186
|
+
# Using first worksheet
|
|
190
187
|
wb = load_workbook(file, read_only=True)
|
|
191
|
-
ws = wb.active
|
|
188
|
+
ws = wb.active
|
|
192
189
|
|
|
193
190
|
header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
|
|
194
191
|
|
|
192
|
+
# Change 2: Detailed reason-finding logic
|
|
195
193
|
if strict:
|
|
196
194
|
if header != expected_columns:
|
|
197
|
-
|
|
195
|
+
header_set = set(header)
|
|
196
|
+
reason_parts = []
|
|
197
|
+
missing = sorted(list(expected_set - header_set)) # type: ignore
|
|
198
|
+
extra = sorted(list(header_set - expected_set)) # type: ignore
|
|
199
|
+
|
|
200
|
+
if missing:
|
|
201
|
+
reason_parts.append(f"Missing: {missing}")
|
|
202
|
+
if extra:
|
|
203
|
+
reason_parts.append(f"Extra: {extra}")
|
|
204
|
+
if not missing and not extra:
|
|
205
|
+
reason_parts.append("Incorrect column order")
|
|
206
|
+
|
|
207
|
+
invalid_files[file.name] = ". ".join(reason_parts)
|
|
198
208
|
else:
|
|
199
209
|
header_set = set(header)
|
|
200
210
|
if not expected_set.issubset(header_set):
|
|
201
|
-
|
|
211
|
+
missing_cols = sorted(list(expected_set - header_set)) # type: ignore
|
|
212
|
+
reason = f"Missing required columns: {missing_cols}"
|
|
213
|
+
invalid_files[file.name] = reason
|
|
202
214
|
|
|
203
215
|
except Exception as e:
|
|
204
216
|
_LOGGER.error(f"Error processing '{file}': {e}")
|
|
205
|
-
invalid_files.
|
|
217
|
+
invalid_files[file.name] = f"File could not be read. Error: {e}"
|
|
206
218
|
|
|
207
219
|
valid_excel_number = len(excel_paths) - len(invalid_files)
|
|
208
220
|
_LOGGER.info(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
|
|
221
|
+
|
|
222
|
+
# Change 3: Updated print loop to show the reason
|
|
209
223
|
if invalid_files:
|
|
210
224
|
_LOGGER.warning(f"{len(invalid_files)} excel files are invalid:")
|
|
211
|
-
for
|
|
212
|
-
print(f" - {
|
|
225
|
+
for file_name, reason in invalid_files.items():
|
|
226
|
+
print(f" - {file_name}: {reason}")
|
|
213
227
|
|
|
214
228
|
return None
|
|
215
229
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|