dscience-tools 2.3.0__tar.gz → 2.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dscience_tools-2.3.0/dscience_tools.egg-info → dscience_tools-2.3.2}/PKG-INFO +14 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/README.md +13 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2/dscience_tools.egg-info}/PKG-INFO +14 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/pyproject.toml +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/setup.cfg +1 -1
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/__init__.py +3 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/distance.py +8 -8
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/ds_tool.py +3 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_compute_metrics.py +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_corr_matrix.py +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance.py +20 -15
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance_additional.py +4 -4
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance_numba_cupy.py +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_generate_dist.py +2 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_generate_from_metrics.py +2 -3
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_metrics.py +7 -7
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_models.py +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_outliers.py +1 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_zip_io.py +5 -2
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/LICENSE +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/LICENSE-NC.txt +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/MANIFEST.in +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/SOURCES.txt +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/dependency_links.txt +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/requires.txt +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/top_level.txt +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/metrics.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/models.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_add_missing.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_alphanum.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_category_stats.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_chatterjee.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_check_ninf.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_describe_cat.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_describe_num.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_df_stats.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_entropy.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_evaluate_cls.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_function_list.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_grubbs.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_kl_divergence.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_labeling.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_min_max.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_normality.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_plot_cm.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_sparse_calc.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_stationarity.py +0 -0
- {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_trials_res_df.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dscience_tools
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.3.2
|
|
4
4
|
Summary: DSTools: Data Science Tools Library
|
|
5
5
|
Home-page: https://github.com/s-kav/ds_tools
|
|
6
6
|
Author: Sergii Kavun
|
|
@@ -421,12 +421,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
|
|
|
421
421
|
- **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
|
|
422
422
|
- **Embedding Losses:** `triplet_loss`.
|
|
423
423
|
- **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
|
|
424
|
+
- **`list_metrics`**: Prints a list of all available metrics.
|
|
424
425
|
|
|
425
426
|
### Distance Toolkit (`tools.distance.*`)
|
|
426
427
|
A high-performance toolkit for calculating distances and similarities.
|
|
427
428
|
- **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
|
|
428
429
|
- **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
|
|
429
430
|
- **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
|
|
431
|
+
- **`list_distances`**: Prints a list of all available distances.
|
|
430
432
|
|
|
431
433
|
|
|
432
434
|
# Authors
|
|
@@ -440,19 +442,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
|
|
|
440
442
|
|
|
441
443
|
# TODO
|
|
442
444
|
1. Add some important kind of plots:
|
|
445
|
+
|
|
443
446
|
📌 KS Plot – Measures how well your model separates positive and negative classes.
|
|
447
|
+
|
|
444
448
|
📌 SHAP Plot – Explains feature impact and model interpretability.
|
|
449
|
+
|
|
445
450
|
📌 QQ Plot – Checks if your data follows a theoretical distribution.
|
|
451
|
+
|
|
446
452
|
📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
|
|
453
|
+
|
|
447
454
|
📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
|
|
455
|
+
|
|
448
456
|
📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
|
|
457
|
+
|
|
449
458
|
📌 ROC Curve – Evaluates classification performance across thresholds.
|
|
459
|
+
|
|
450
460
|
📌 Precision–Recall Curve – Crucial for imbalanced datasets.
|
|
461
|
+
|
|
451
462
|
📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
|
|
452
463
|
|
|
453
|
-
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
|
|
464
|
+
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
|
|
454
465
|
|
|
455
|
-
3. Add some fast distance metrics (expand of existed).
|
|
466
|
+
3. Add some fast distance metrics (expand of existed): done, 2.3.0.
|
|
456
467
|
|
|
457
468
|
|
|
458
469
|
# References
|
|
@@ -377,12 +377,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
|
|
|
377
377
|
- **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
|
|
378
378
|
- **Embedding Losses:** `triplet_loss`.
|
|
379
379
|
- **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
|
|
380
|
+
- **`list_metrics`**: Prints a list of all available metrics.
|
|
380
381
|
|
|
381
382
|
### Distance Toolkit (`tools.distance.*`)
|
|
382
383
|
A high-performance toolkit for calculating distances and similarities.
|
|
383
384
|
- **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
|
|
384
385
|
- **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
|
|
385
386
|
- **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
|
|
387
|
+
- **`list_distances`**: Prints a list of all available distances.
|
|
386
388
|
|
|
387
389
|
|
|
388
390
|
# Authors
|
|
@@ -396,19 +398,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
|
|
|
396
398
|
|
|
397
399
|
# TODO
|
|
398
400
|
1. Add some important kind of plots:
|
|
401
|
+
|
|
399
402
|
📌 KS Plot – Measures how well your model separates positive and negative classes.
|
|
403
|
+
|
|
400
404
|
📌 SHAP Plot – Explains feature impact and model interpretability.
|
|
405
|
+
|
|
401
406
|
📌 QQ Plot – Checks if your data follows a theoretical distribution.
|
|
407
|
+
|
|
402
408
|
📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
|
|
409
|
+
|
|
403
410
|
📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
|
|
411
|
+
|
|
404
412
|
📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
|
|
413
|
+
|
|
405
414
|
📌 ROC Curve – Evaluates classification performance across thresholds.
|
|
415
|
+
|
|
406
416
|
📌 Precision–Recall Curve – Crucial for imbalanced datasets.
|
|
417
|
+
|
|
407
418
|
📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
|
|
408
419
|
|
|
409
|
-
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
|
|
420
|
+
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
|
|
410
421
|
|
|
411
|
-
3. Add some fast distance metrics (expand of existed).
|
|
422
|
+
3. Add some fast distance metrics (expand of existed): done, 2.3.0.
|
|
412
423
|
|
|
413
424
|
|
|
414
425
|
# References
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dscience_tools
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.3.2
|
|
4
4
|
Summary: DSTools: Data Science Tools Library
|
|
5
5
|
Home-page: https://github.com/s-kav/ds_tools
|
|
6
6
|
Author: Sergii Kavun
|
|
@@ -421,12 +421,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
|
|
|
421
421
|
- **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
|
|
422
422
|
- **Embedding Losses:** `triplet_loss`.
|
|
423
423
|
- **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
|
|
424
|
+
- **`list_metrics`**: Prints a list of all available metrics.
|
|
424
425
|
|
|
425
426
|
### Distance Toolkit (`tools.distance.*`)
|
|
426
427
|
A high-performance toolkit for calculating distances and similarities.
|
|
427
428
|
- **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
|
|
428
429
|
- **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
|
|
429
430
|
- **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
|
|
431
|
+
- **`list_distances`**: Prints a list of all available distances.
|
|
430
432
|
|
|
431
433
|
|
|
432
434
|
# Authors
|
|
@@ -440,19 +442,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
|
|
|
440
442
|
|
|
441
443
|
# TODO
|
|
442
444
|
1. Add some important kind of plots:
|
|
445
|
+
|
|
443
446
|
📌 KS Plot – Measures how well your model separates positive and negative classes.
|
|
447
|
+
|
|
444
448
|
📌 SHAP Plot – Explains feature impact and model interpretability.
|
|
449
|
+
|
|
445
450
|
📌 QQ Plot – Checks if your data follows a theoretical distribution.
|
|
451
|
+
|
|
446
452
|
📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
|
|
453
|
+
|
|
447
454
|
📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
|
|
455
|
+
|
|
448
456
|
📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
|
|
457
|
+
|
|
449
458
|
📌 ROC Curve – Evaluates classification performance across thresholds.
|
|
459
|
+
|
|
450
460
|
📌 Precision–Recall Curve – Crucial for imbalanced datasets.
|
|
461
|
+
|
|
451
462
|
📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
|
|
452
463
|
|
|
453
|
-
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
|
|
464
|
+
2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
|
|
454
465
|
|
|
455
|
-
3. Add some fast distance metrics (expand of existed).
|
|
466
|
+
3. Add some fast distance metrics (expand of existed): done, 2.3.0.
|
|
456
467
|
|
|
457
468
|
|
|
458
469
|
# References
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dscience_tools"
|
|
3
|
-
version = "2.3.0"
|
|
3
|
+
version = "2.3.2"
|
|
4
4
|
description = "DSTools: Data Science Tools Library"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
|
|
@@ -86,5 +86,4 @@ filterwarnings = [
|
|
|
86
86
|
testpaths = ["tests"]
|
|
87
87
|
pythonpath = [
|
|
88
88
|
".",
|
|
89
|
-
"src",
|
|
90
89
|
]
|
|
@@ -13,10 +13,10 @@
|
|
|
13
13
|
DSTools: A library of helpful functions for various data science research stages.
|
|
14
14
|
"""
|
|
15
15
|
|
|
16
|
-
__version__ = "2.3.0"
|
|
16
|
+
__version__ = "2.3.2"
|
|
17
17
|
|
|
18
|
-
from ds_tool import DSTools
|
|
19
|
-
from models import (
|
|
18
|
+
from .ds_tool import DSTools
|
|
19
|
+
from .models import (
|
|
20
20
|
CorrelationConfig,
|
|
21
21
|
DistributionConfig,
|
|
22
22
|
GrubbsTestResult,
|
|
@@ -630,20 +630,20 @@ class Distance:
|
|
|
630
630
|
f"Distance initialized. GPU: {self.gpu_available}, Numba: {self.numba_available}"
|
|
631
631
|
)
|
|
632
632
|
|
|
633
|
-
def
|
|
633
|
+
def list_distances(self) -> pd.DataFrame:
|
|
634
634
|
"""
|
|
635
|
-
Returns a DataFrame listing all available
|
|
635
|
+
Returns a DataFrame listing all available distances functions, their
|
|
636
636
|
descriptions, and usage signatures.
|
|
637
637
|
|
|
638
638
|
Returns:
|
|
639
|
-
pd.DataFrame: A table with columns '
|
|
639
|
+
pd.DataFrame: A table with columns 'Distance', 'Description', and 'Usage'.
|
|
640
640
|
"""
|
|
641
641
|
methods_data = []
|
|
642
642
|
|
|
643
643
|
# Iterate over all members of the instance
|
|
644
644
|
for name, func in inspect.getmembers(self, predicate=inspect.ismethod):
|
|
645
|
-
# Skip private methods (starting with _) and the
|
|
646
|
-
if name.startswith("_") or name == "
|
|
645
|
+
# Skip private methods (starting with _) and the list_distances method itself
|
|
646
|
+
if name.startswith("_") or name == "list_distances":
|
|
647
647
|
continue
|
|
648
648
|
|
|
649
649
|
# Get the first line of the docstring
|
|
@@ -658,16 +658,16 @@ class Distance:
|
|
|
658
658
|
|
|
659
659
|
methods_data.append(
|
|
660
660
|
{
|
|
661
|
-
"
|
|
661
|
+
"Distance": name,
|
|
662
662
|
"Description": description,
|
|
663
663
|
"Usage": f"{name}{signature}",
|
|
664
664
|
}
|
|
665
665
|
)
|
|
666
666
|
|
|
667
|
-
# Create DataFrame and sort by
|
|
667
|
+
# Create DataFrame and sort by Distance name
|
|
668
668
|
df = pd.DataFrame(methods_data)
|
|
669
669
|
if not df.empty:
|
|
670
|
-
df = df.sort_values(by="
|
|
670
|
+
df = df.sort_values(by="Distance").reset_index(drop=True)
|
|
671
671
|
|
|
672
672
|
return df
|
|
673
673
|
|
|
@@ -49,9 +49,9 @@ from sklearn.metrics import (
|
|
|
49
49
|
from sklearn.preprocessing import OrdinalEncoder
|
|
50
50
|
from statsmodels.tsa.stattools import adfuller
|
|
51
51
|
|
|
52
|
-
from distance import Distance
|
|
53
|
-
from metrics import Metrics
|
|
54
|
-
from models import (
|
|
52
|
+
from .distance import Distance
|
|
53
|
+
from .metrics import Metrics
|
|
54
|
+
from .models import (
|
|
55
55
|
CorrelationConfig,
|
|
56
56
|
DistributionConfig,
|
|
57
57
|
GrubbsTestResult,
|
|
@@ -22,13 +22,12 @@ Error and edge case handling (incorrect sizes, empty arrays, incorrect parameter
|
|
|
22
22
|
import numpy as np
|
|
23
23
|
import pandas as pd
|
|
24
24
|
import pytest
|
|
25
|
+
from ds_tools.distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
|
|
25
26
|
from scipy.spatial.distance import cdist
|
|
26
27
|
from scipy.stats import entropy
|
|
27
28
|
|
|
28
|
-
from distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
|
|
29
|
-
|
|
30
29
|
if NUMBA_AVAILABLE:
|
|
31
|
-
from distance import (
|
|
30
|
+
from ds_tools.distance import (
|
|
32
31
|
_canberra_numba,
|
|
33
32
|
_chebyshev_numba,
|
|
34
33
|
_cosine_similarity_numba,
|
|
@@ -236,14 +235,16 @@ def test_public_interface_calls_correct_backend(tools, mocker, small_sample_vect
|
|
|
236
235
|
if NUMBA_AVAILABLE:
|
|
237
236
|
mocker.patch.object(tools.distance, "gpu_available", False)
|
|
238
237
|
mocker.patch.object(tools.distance, "numba_available", True)
|
|
239
|
-
mock_numba = mocker.patch(
|
|
238
|
+
mock_numba = mocker.patch(
|
|
239
|
+
"ds_tools.distance._manhattan_numba", return_value=1.0
|
|
240
|
+
)
|
|
240
241
|
tools.distance.manhattan(u, v)
|
|
241
242
|
mock_numba.assert_called_once()
|
|
242
243
|
|
|
243
244
|
# Case 2: Only NumPy is available
|
|
244
245
|
mocker.patch.object(tools.distance, "gpu_available", False)
|
|
245
246
|
mocker.patch.object(tools.distance, "numba_available", False)
|
|
246
|
-
mock_numpy = mocker.patch("distance._manhattan_numpy", return_value=2.0)
|
|
247
|
+
mock_numpy = mocker.patch("ds_tools.distance._manhattan_numpy", return_value=2.0)
|
|
247
248
|
tools.distance.manhattan(u, v)
|
|
248
249
|
mock_numpy.assert_called_once()
|
|
249
250
|
|
|
@@ -264,9 +265,11 @@ def test_backend_dispatching_logic(
|
|
|
264
265
|
|
|
265
266
|
dist = tools.distance
|
|
266
267
|
mocker.patch.object(dist, "gpu_available", True)
|
|
267
|
-
mocker.patch("distance.cp.asarray", side_effect=lambda x: x)
|
|
268
|
-
mock_numba = mocker.patch("distance._euclidean_numba", return_value=1.0)
|
|
269
|
-
mock_cupy = mocker.patch(
|
|
268
|
+
mocker.patch("ds_tools.distance.cp.asarray", side_effect=lambda x: x)
|
|
269
|
+
mock_numba = mocker.patch("ds_tools.distance._euclidean_numba", return_value=1.0)
|
|
270
|
+
mock_cupy = mocker.patch(
|
|
271
|
+
"ds_tools.distance._euclidean_cupy", return_value=2.0, create=True
|
|
272
|
+
)
|
|
270
273
|
|
|
271
274
|
# 1. Small data -> Numba should be used
|
|
272
275
|
u_small, v_small = small_sample_vectors
|
|
@@ -393,7 +396,9 @@ def test_fallback_without_numba(tools, mocker, small_sample_vectors):
|
|
|
393
396
|
mocker.patch.object(dist, "numba_available", False)
|
|
394
397
|
|
|
395
398
|
# --- Spy on the NumPy backend to ensure it's called ---
|
|
396
|
-
mock_numpy_fallback = mocker.patch(
|
|
399
|
+
mock_numpy_fallback = mocker.patch(
|
|
400
|
+
"ds_tools.distance._euclidean_numpy", return_value=99.0
|
|
401
|
+
)
|
|
397
402
|
|
|
398
403
|
# Call the method on the original 'tools' instance
|
|
399
404
|
result = dist.euclidean(u, v)
|
|
@@ -922,19 +927,19 @@ def test_cupy_backend_correctness(
|
|
|
922
927
|
assert np.isclose(result, expected, rtol=1e-5)
|
|
923
928
|
|
|
924
929
|
|
|
925
|
-
def
|
|
926
|
-
"""Tests that
|
|
927
|
-
df = tools.distance.
|
|
930
|
+
def test_list_distances_returns_correct_dataframe(tools):
|
|
931
|
+
"""Tests that list_distances returns a non-empty DataFrame with correct columns."""
|
|
932
|
+
df = tools.distance.list_distances()
|
|
928
933
|
|
|
929
934
|
assert isinstance(df, pd.DataFrame)
|
|
930
935
|
assert not df.empty
|
|
931
936
|
|
|
932
937
|
# Check expected columns
|
|
933
|
-
expected_cols = ["
|
|
938
|
+
expected_cols = ["Distance", "Description", "Usage"]
|
|
934
939
|
assert list(df.columns) == expected_cols
|
|
935
940
|
|
|
936
941
|
# Check that known metrics are present
|
|
937
|
-
metrics = df["
|
|
942
|
+
metrics = df["Distance"].tolist()
|
|
938
943
|
assert "euclidean" in metrics
|
|
939
944
|
assert "manhattan" in metrics
|
|
940
945
|
assert "cosine_similarity" in metrics
|
|
@@ -944,6 +949,6 @@ def test_list_metrics_returns_correct_dataframe(tools):
|
|
|
944
949
|
assert "_dispatch_v2v" not in metrics
|
|
945
950
|
|
|
946
951
|
# Check content of a specific row (e.g., euclidean)
|
|
947
|
-
euclidean_row = df[df["
|
|
952
|
+
euclidean_row = df[df["Distance"] == "euclidean"].iloc[0]
|
|
948
953
|
assert "Euclidean" in euclidean_row["Description"]
|
|
949
954
|
assert "force_cpu" in euclidean_row["Usage"]
|
|
@@ -14,9 +14,8 @@ import types
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pytest
|
|
17
|
-
|
|
18
|
-
import
|
|
19
|
-
from distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
|
|
17
|
+
from ds_tools import distance
|
|
18
|
+
from ds_tools.distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
|
|
20
19
|
|
|
21
20
|
# --- Helpers ---------------------------------------------------------------
|
|
22
21
|
|
|
@@ -76,7 +75,8 @@ def test_pairwise_euclidean_fallback_when_cupy_missing(monkeypatch, mocker):
|
|
|
76
75
|
monkeypatch.delitem(distance.__dict__, "_pairwise_euclidean_cupy", raising=False)
|
|
77
76
|
|
|
78
77
|
mock_numpy = mocker.patch(
|
|
79
|
-
"distance._pairwise_euclidean_numpy",
|
|
78
|
+
"ds_tools.distance._pairwise_euclidean_numpy",
|
|
79
|
+
return_value=np.full((10, 12), 777.0),
|
|
80
80
|
)
|
|
81
81
|
|
|
82
82
|
res = dist.pairwise_euclidean(X, Y, force_cpu=False)
|
|
@@ -12,10 +12,9 @@
|
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
import pytest
|
|
15
|
+
from ds_tools.models import DistributionConfig
|
|
15
16
|
from scipy import stats
|
|
16
17
|
|
|
17
|
-
from ds_tool import DistributionConfig
|
|
18
|
-
|
|
19
18
|
|
|
20
19
|
def describe_generated_data(data: np.ndarray):
|
|
21
20
|
return {
|
|
@@ -88,7 +87,7 @@ def test_generate_distribution_zero_std_case(tools, mocker):
|
|
|
88
87
|
mean=50, median=50, std=1, min_val=0, max_val=100, skewness=0, kurtosis=3, n=10
|
|
89
88
|
)
|
|
90
89
|
# artificially create a situation with zero std by mocking np.std
|
|
91
|
-
mocker.patch("ds_tool.np.std", return_value=0)
|
|
90
|
+
mocker.patch("ds_tools.ds_tool.np.std", return_value=0)
|
|
92
91
|
data = tools.generate_distribution(config)
|
|
93
92
|
|
|
94
93
|
unique_values = np.unique(data)
|
|
@@ -14,10 +14,9 @@ import numpy as np
|
|
|
14
14
|
import pandas as pd
|
|
15
15
|
import polars as pl
|
|
16
16
|
import pytest
|
|
17
|
+
from ds_tools.models import DistributionConfig
|
|
17
18
|
from scipy import stats
|
|
18
19
|
|
|
19
|
-
from ds_tool import DistributionConfig
|
|
20
|
-
|
|
21
20
|
|
|
22
21
|
def describe_metrics(arr: np.ndarray):
|
|
23
22
|
return {
|
|
@@ -146,7 +145,7 @@ def test_generate_distribution_zero_std_case(tools, mocker):
|
|
|
146
145
|
mean=50, median=50, std=1, min_val=0, max_val=100, skewness=0, kurtosis=3, n=10
|
|
147
146
|
)
|
|
148
147
|
# artificially create a situation with zero std by mocking np.std
|
|
149
|
-
mocker.patch("ds_tool.np.std", return_value=0)
|
|
148
|
+
mocker.patch("ds_tools.ds_tool.np.std", return_value=0)
|
|
150
149
|
data = tools.generate_distribution(config)
|
|
151
150
|
unique_values = np.unique(data)
|
|
152
151
|
expected_values = {config.min_val, config.max_val, config.mean}
|
|
@@ -25,9 +25,7 @@ Error and edge case handling (empty arrays, size mismatch, invalid parameters).
|
|
|
25
25
|
import numpy as np
|
|
26
26
|
import pandas as pd
|
|
27
27
|
import pytest
|
|
28
|
-
|
|
29
|
-
from ds_tool import Metrics
|
|
30
|
-
from metrics import CUPY_AVAILABLE, NUMBA_AVAILABLE
|
|
28
|
+
from ds_tools.metrics import CUPY_AVAILABLE, NUMBA_AVAILABLE, Metrics
|
|
31
29
|
|
|
32
30
|
pytestmark_cupy = pytest.mark.skipif(
|
|
33
31
|
not CUPY_AVAILABLE, reason="CuPy or compatible GPU is not available"
|
|
@@ -247,10 +245,12 @@ def test_gpu_threshold_logic(tools, mocker, small_sample_data, large_sample_data
|
|
|
247
245
|
pytest.skip("This test requires both CuPy and Numba to be installed.")
|
|
248
246
|
|
|
249
247
|
mocker.patch.object(tools.metrics, "gpu_available", True)
|
|
250
|
-
mocker.patch("metrics.cp.asarray", side_effect=lambda x: x, create=True)
|
|
248
|
+
mocker.patch("ds_tools.metrics.cp.asarray", side_effect=lambda x: x, create=True)
|
|
251
249
|
# Mock the backends to see which one is called
|
|
252
|
-
mock_cupy = mocker.patch(
|
|
253
|
-
|
|
250
|
+
mock_cupy = mocker.patch(
|
|
251
|
+
"ds_tools.metrics._mae_cupy", return_value=1.0, create=True
|
|
252
|
+
)
|
|
253
|
+
mock_numba = mocker.patch("ds_tools.metrics._mae_numba", return_value=2.0)
|
|
254
254
|
|
|
255
255
|
# 1. Test with small data -> Numba should be called
|
|
256
256
|
y_true_small, y_pred_small = small_sample_data
|
|
@@ -271,7 +271,7 @@ def test_gpu_threshold_logic(tools, mocker, small_sample_data, large_sample_data
|
|
|
271
271
|
)
|
|
272
272
|
def test_dispatch_fallback_to_numpy(tools, mocker, small_sample_data):
|
|
273
273
|
"""Tests that dispatcher falls back to NumPy if Numba is missing."""
|
|
274
|
-
mock_numpy = mocker.patch("metrics._mae_numpy", return_value=99.0)
|
|
274
|
+
mock_numpy = mocker.patch("ds_tools.metrics._mae_numpy", return_value=99.0)
|
|
275
275
|
y_true, y_pred = small_sample_data
|
|
276
276
|
result = tools.metrics.mae(y_true, y_pred, force_cpu=True)
|
|
277
277
|
mock_numpy.assert_called_once()
|
|
@@ -135,9 +135,12 @@ def test_read_dataframes_from_zip_pandas_fallback_to_csv(
|
|
|
135
135
|
# mock pd.read_parquet so it throws an error,
|
|
136
136
|
# and check that after this pd.read_csv will be called.
|
|
137
137
|
mocker.patch(
|
|
138
|
-
"ds_tool.pd.read_parquet",
|
|
138
|
+
"ds_tools.ds_tool.pd.read_parquet",
|
|
139
|
+
side_effect=ValueError("Simulated read error"),
|
|
140
|
+
)
|
|
141
|
+
mock_read_csv = mocker.patch(
|
|
142
|
+
"ds_tools.ds_tool.pd.read_csv", return_value=sample_pandas_df
|
|
139
143
|
)
|
|
140
|
-
mock_read_csv = mocker.patch("ds_tool.pd.read_csv", return_value=sample_pandas_df)
|
|
141
144
|
|
|
142
145
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
143
146
|
# Create a dummy zip archive with a "non-parquet" file, but with the .parquet extension
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|