dscience-tools 2.3.0__tar.gz → 2.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {dscience_tools-2.3.0/dscience_tools.egg-info → dscience_tools-2.3.2}/PKG-INFO +14 -3
  2. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/README.md +13 -2
  3. {dscience_tools-2.3.0 → dscience_tools-2.3.2/dscience_tools.egg-info}/PKG-INFO +14 -3
  4. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/pyproject.toml +1 -2
  5. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/setup.cfg +1 -1
  6. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/__init__.py +3 -3
  7. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/distance.py +8 -8
  8. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/ds_tool.py +3 -3
  9. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_compute_metrics.py +1 -2
  10. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_corr_matrix.py +1 -2
  11. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance.py +20 -15
  12. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance_additional.py +4 -4
  13. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_distance_numba_cupy.py +1 -2
  14. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_generate_dist.py +2 -3
  15. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_generate_from_metrics.py +2 -3
  16. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_metrics.py +7 -7
  17. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_models.py +1 -2
  18. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_outliers.py +1 -2
  19. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_zip_io.py +5 -2
  20. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/LICENSE +0 -0
  21. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/LICENSE-NC.txt +0 -0
  22. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/MANIFEST.in +0 -0
  23. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/SOURCES.txt +0 -0
  24. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/dependency_links.txt +0 -0
  25. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/requires.txt +0 -0
  26. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/dscience_tools.egg-info/top_level.txt +0 -0
  27. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/metrics.py +0 -0
  28. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/src/models.py +0 -0
  29. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_add_missing.py +0 -0
  30. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_alphanum.py +0 -0
  31. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_category_stats.py +0 -0
  32. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_chatterjee.py +0 -0
  33. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_check_ninf.py +0 -0
  34. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_describe_cat.py +0 -0
  35. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_describe_num.py +0 -0
  36. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_df_stats.py +0 -0
  37. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_entropy.py +0 -0
  38. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_evaluate_cls.py +0 -0
  39. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_function_list.py +0 -0
  40. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_grubbs.py +0 -0
  41. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_kl_divergence.py +0 -0
  42. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_labeling.py +0 -0
  43. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_min_max.py +0 -0
  44. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_normality.py +0 -0
  45. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_plot_cm.py +0 -0
  46. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_sparse_calc.py +0 -0
  47. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_stationarity.py +0 -0
  48. {dscience_tools-2.3.0 → dscience_tools-2.3.2}/tests/test_trials_res_df.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dscience_tools
3
- Version: 2.3.0
3
+ Version: 2.3.2
4
4
  Summary: DSTools: Data Science Tools Library
5
5
  Home-page: https://github.com/s-kav/ds_tools
6
6
  Author: Sergii Kavun
@@ -421,12 +421,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
421
421
  - **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
422
422
  - **Embedding Losses:** `triplet_loss`.
423
423
  - **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
424
+ - **`list_metrics`**: Prints a list of all available metrics.
424
425
 
425
426
  ### Distance Toolkit (`tools.distance.*`)
426
427
  A high-performance toolkit for calculating distances and similarities.
427
428
  - **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
428
429
  - **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
429
430
  - **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
431
+ - **`list_distances`**: Returns a DataFrame listing all available distance functions, with their descriptions and usage signatures.
430
432
 
431
433
 
432
434
  # Authors
@@ -440,19 +442,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
440
442
 
441
443
  # TODO
442
444
  1. Add some important kinds of plots:
445
+
443
446
  📌 KS Plot – Measures how well your model separates positive and negative classes.
447
+
444
448
  📌 SHAP Plot – Explains feature impact and model interpretability.
449
+
445
450
  📌 QQ Plot – Checks if your data follows a theoretical distribution.
451
+
446
452
  📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
453
+
447
454
  📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
455
+
448
456
  📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
457
+
449
458
  📌 ROC Curve – Evaluates classification performance across thresholds.
459
+
450
460
  📌 Precision–Recall Curve – Crucial for imbalanced datasets.
461
+
451
462
  📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
452
463
 
453
- 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
464
+ 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
454
465
 
455
- 3. Add some fast distance metrics (expand of existed).
466
+ 3. Add some fast distance metrics (extending the existing ones): done, 2.3.0.
456
467
 
457
468
 
458
469
  # References
@@ -377,12 +377,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
377
377
  - **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
378
378
  - **Embedding Losses:** `triplet_loss`.
379
379
  - **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
380
+ - **`list_metrics`**: Prints a list of all available metrics.
380
381
 
381
382
  ### Distance Toolkit (`tools.distance.*`)
382
383
  A high-performance toolkit for calculating distances and similarities.
383
384
  - **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
384
385
  - **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
385
386
  - **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
387
+ - **`list_distances`**: Returns a DataFrame listing all available distance functions, with their descriptions and usage signatures.
386
388
 
387
389
 
388
390
  # Authors
@@ -396,19 +398,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
396
398
 
397
399
  # TODO
398
400
  1. Add some important kinds of plots:
401
+
399
402
  📌 KS Plot – Measures how well your model separates positive and negative classes.
403
+
400
404
  📌 SHAP Plot – Explains feature impact and model interpretability.
405
+
401
406
  📌 QQ Plot – Checks if your data follows a theoretical distribution.
407
+
402
408
  📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
409
+
403
410
  📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
411
+
404
412
  📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
413
+
405
414
  📌 ROC Curve – Evaluates classification performance across thresholds.
415
+
406
416
  📌 Precision–Recall Curve – Crucial for imbalanced datasets.
417
+
407
418
  📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
408
419
 
409
- 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
420
+ 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
410
421
 
411
- 3. Add some fast distance metrics (expand of existed).
422
+ 3. Add some fast distance metrics (extending the existing ones): done, 2.3.0.
412
423
 
413
424
 
414
425
  # References
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dscience_tools
3
- Version: 2.3.0
3
+ Version: 2.3.2
4
4
  Summary: DSTools: Data Science Tools Library
5
5
  Home-page: https://github.com/s-kav/ds_tools
6
6
  Author: Sergii Kavun
@@ -421,12 +421,14 @@ A high-performance toolkit for calculating loss functions and their gradients.
421
421
  - **Classification Losses:** `hinge_loss`, `log_loss` (Binary Cross-Entropy).
422
422
  - **Embedding Losses:** `triplet_loss`.
423
423
  - **Monitoring:** `start_monitoring`, `update`, `get_history_df`, `plot_history`.
424
+ - **`list_metrics`**: Prints a list of all available metrics.
424
425
 
425
426
  ### Distance Toolkit (`tools.distance.*`)
426
427
  A high-performance toolkit for calculating distances and similarities.
427
428
  - **Vector-to-Vector:** `euclidean`, `manhattan`, `cosine_similarity`, `minkowski`, `chebyshev`, `mahalanobis`, `haversine`, `hamming`, `jaccard`.
428
429
  - **Matrix Operations:** `pairwise_euclidean`, `kmeans_distance`.
429
430
  - **Neighbor Searches:** `knn_distances`, `radius_neighbors`.
431
+ - **`list_distances`**: Returns a DataFrame listing all available distance functions, with their descriptions and usage signatures.
430
432
 
431
433
 
432
434
  # Authors
@@ -440,19 +442,28 @@ See [CONTRIBUTING](/CONTRIBUTING.md)
440
442
 
441
443
  # TODO
442
444
  1. Add some important kinds of plots:
445
+
443
446
  📌 KS Plot – Measures how well your model separates positive and negative classes.
447
+
444
448
  📌 SHAP Plot – Explains feature impact and model interpretability.
449
+
445
450
  📌 QQ Plot – Checks if your data follows a theoretical distribution.
451
+
446
452
  📌 Cumulative Explained Variance – Helps decide the optimal number of PCA components.
453
+
447
454
  📌 Gini vs Entropy – Key metrics for understanding decision tree impurity.
455
+
448
456
  📌 Bias–Variance Tradeoff – Shows the balance between underfitting and overfitting.
457
+
449
458
  📌 ROC Curve – Evaluates classification performance across thresholds.
459
+
450
460
  📌 Precision–Recall Curve – Crucial for imbalanced datasets.
461
+
451
462
  📌 Elbow Curve – Helps choose the right number of clusters in K-Means.
452
463
 
453
- 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula
464
+ 2. Implement Fast Fourier Transform (FFT) algorithm and Shannon’s interpolation formula: done, 2.3.0.
454
465
 
455
- 3. Add some fast distance metrics (expand of existed).
466
+ 3. Add some fast distance metrics (extending the existing ones): done, 2.3.0.
456
467
 
457
468
 
458
469
  # References
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dscience_tools"
3
- version = "2.3.0"
3
+ version = "2.3.2"
4
4
  description = "DSTools: Data Science Tools Library"
5
5
  readme = "README.md"
6
6
 
@@ -86,5 +86,4 @@ filterwarnings = [
86
86
  testpaths = ["tests"]
87
87
  pythonpath = [
88
88
  ".",
89
- "src",
90
89
  ]
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = dscience_tools
3
- version = 2.3.0
3
+ version = 2.3.2
4
4
  author = Sergii Kavun
5
5
  author_email = kavserg@gmail.com
6
6
  description = A library of helpful functions for various data science research stages.
@@ -13,10 +13,10 @@
13
13
  DSTools: A library of helpful functions for various data science research stages.
14
14
  """
15
15
 
16
- __version__ = "2.3.0"
16
+ __version__ = "2.3.2"
17
17
 
18
- from ds_tool import DSTools
19
- from models import (
18
+ from .ds_tool import DSTools
19
+ from .models import (
20
20
  CorrelationConfig,
21
21
  DistributionConfig,
22
22
  GrubbsTestResult,
@@ -630,20 +630,20 @@ class Distance:
630
630
  f"Distance initialized. GPU: {self.gpu_available}, Numba: {self.numba_available}"
631
631
  )
632
632
 
633
- def list_metrics(self) -> pd.DataFrame:
633
+ def list_distances(self) -> pd.DataFrame:
634
634
  """
635
- Returns a DataFrame listing all available metric functions, their
635
+ Returns a DataFrame listing all available distance functions, their
636
636
  descriptions, and usage signatures.
637
637
 
638
638
  Returns:
639
- pd.DataFrame: A table with columns 'Metric', 'Description', and 'Usage'.
639
+ pd.DataFrame: A table with columns 'Distance', 'Description', and 'Usage'.
640
640
  """
641
641
  methods_data = []
642
642
 
643
643
  # Iterate over all members of the instance
644
644
  for name, func in inspect.getmembers(self, predicate=inspect.ismethod):
645
- # Skip private methods (starting with _) and the list_metrics method itself
646
- if name.startswith("_") or name == "list_metrics":
645
+ # Skip private methods (starting with _) and the list_distances method itself
646
+ if name.startswith("_") or name == "list_distances":
647
647
  continue
648
648
 
649
649
  # Get the first line of the docstring
@@ -658,16 +658,16 @@ class Distance:
658
658
 
659
659
  methods_data.append(
660
660
  {
661
- "Metric": name,
661
+ "Distance": name,
662
662
  "Description": description,
663
663
  "Usage": f"{name}{signature}",
664
664
  }
665
665
  )
666
666
 
667
- # Create DataFrame and sort by Metric name
667
+ # Create DataFrame and sort by Distance name
668
668
  df = pd.DataFrame(methods_data)
669
669
  if not df.empty:
670
- df = df.sort_values(by="Metric").reset_index(drop=True)
670
+ df = df.sort_values(by="Distance").reset_index(drop=True)
671
671
 
672
672
  return df
673
673
 
@@ -49,9 +49,9 @@ from sklearn.metrics import (
49
49
  from sklearn.preprocessing import OrdinalEncoder
50
50
  from statsmodels.tsa.stattools import adfuller
51
51
 
52
- from distance import Distance
53
- from metrics import Metrics
54
- from models import (
52
+ from .distance import Distance
53
+ from .metrics import Metrics
54
+ from .models import (
55
55
  CorrelationConfig,
56
56
  DistributionConfig,
57
57
  GrubbsTestResult,
@@ -12,8 +12,7 @@
12
12
 
13
13
  import numpy as np
14
14
  import pytest
15
-
16
- from ds_tool import MetricsConfig
15
+ from ds_tools.models import MetricsConfig
17
16
 
18
17
  N_SAMPLES = 200
19
18
  THRESHOLD = 0.5
@@ -13,8 +13,7 @@
13
13
  import numpy as np
14
14
  import pandas as pd
15
15
  import pytest
16
-
17
- from ds_tool import CorrelationConfig
16
+ from ds_tools.models import CorrelationConfig
18
17
 
19
18
  N_SAMPLES = 100
20
19
 
@@ -22,13 +22,12 @@ Error and edge case handling (incorrect sizes, empty arrays, incorrect parameter
22
22
  import numpy as np
23
23
  import pandas as pd
24
24
  import pytest
25
+ from ds_tools.distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
25
26
  from scipy.spatial.distance import cdist
26
27
  from scipy.stats import entropy
27
28
 
28
- from distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
29
-
30
29
  if NUMBA_AVAILABLE:
31
- from distance import (
30
+ from ds_tools.distance import (
32
31
  _canberra_numba,
33
32
  _chebyshev_numba,
34
33
  _cosine_similarity_numba,
@@ -236,14 +235,16 @@ def test_public_interface_calls_correct_backend(tools, mocker, small_sample_vect
236
235
  if NUMBA_AVAILABLE:
237
236
  mocker.patch.object(tools.distance, "gpu_available", False)
238
237
  mocker.patch.object(tools.distance, "numba_available", True)
239
- mock_numba = mocker.patch("distance._manhattan_numba", return_value=1.0)
238
+ mock_numba = mocker.patch(
239
+ "ds_tools.distance._manhattan_numba", return_value=1.0
240
+ )
240
241
  tools.distance.manhattan(u, v)
241
242
  mock_numba.assert_called_once()
242
243
 
243
244
  # Case 2: Only NumPy is available
244
245
  mocker.patch.object(tools.distance, "gpu_available", False)
245
246
  mocker.patch.object(tools.distance, "numba_available", False)
246
- mock_numpy = mocker.patch("distance._manhattan_numpy", return_value=2.0)
247
+ mock_numpy = mocker.patch("ds_tools.distance._manhattan_numpy", return_value=2.0)
247
248
  tools.distance.manhattan(u, v)
248
249
  mock_numpy.assert_called_once()
249
250
 
@@ -264,9 +265,11 @@ def test_backend_dispatching_logic(
264
265
 
265
266
  dist = tools.distance
266
267
  mocker.patch.object(dist, "gpu_available", True)
267
- mocker.patch("distance.cp.asarray", side_effect=lambda x: x)
268
- mock_numba = mocker.patch("distance._euclidean_numba", return_value=1.0)
269
- mock_cupy = mocker.patch("distance._euclidean_cupy", return_value=2.0, create=True)
268
+ mocker.patch("ds_tools.distance.cp.asarray", side_effect=lambda x: x)
269
+ mock_numba = mocker.patch("ds_tools.distance._euclidean_numba", return_value=1.0)
270
+ mock_cupy = mocker.patch(
271
+ "ds_tools.distance._euclidean_cupy", return_value=2.0, create=True
272
+ )
270
273
 
271
274
  # 1. Small data -> Numba should be used
272
275
  u_small, v_small = small_sample_vectors
@@ -393,7 +396,9 @@ def test_fallback_without_numba(tools, mocker, small_sample_vectors):
393
396
  mocker.patch.object(dist, "numba_available", False)
394
397
 
395
398
  # --- Spy on the NumPy backend to ensure it's called ---
396
- mock_numpy_fallback = mocker.patch("distance._euclidean_numpy", return_value=99.0)
399
+ mock_numpy_fallback = mocker.patch(
400
+ "ds_tools.distance._euclidean_numpy", return_value=99.0
401
+ )
397
402
 
398
403
  # Call the method on the original 'tools' instance
399
404
  result = dist.euclidean(u, v)
@@ -922,19 +927,19 @@ def test_cupy_backend_correctness(
922
927
  assert np.isclose(result, expected, rtol=1e-5)
923
928
 
924
929
 
925
- def test_list_metrics_returns_correct_dataframe(tools):
926
- """Tests that list_metrics returns a non-empty DataFrame with correct columns."""
927
- df = tools.distance.list_metrics()
930
+ def test_list_distances_returns_correct_dataframe(tools):
931
+ """Tests that list_distances returns a non-empty DataFrame with correct columns."""
932
+ df = tools.distance.list_distances()
928
933
 
929
934
  assert isinstance(df, pd.DataFrame)
930
935
  assert not df.empty
931
936
 
932
937
  # Check expected columns
933
- expected_cols = ["Metric", "Description", "Usage"]
938
+ expected_cols = ["Distance", "Description", "Usage"]
934
939
  assert list(df.columns) == expected_cols
935
940
 
936
941
  # Check that known metrics are present
937
- metrics = df["Metric"].tolist()
942
+ metrics = df["Distance"].tolist()
938
943
  assert "euclidean" in metrics
939
944
  assert "manhattan" in metrics
940
945
  assert "cosine_similarity" in metrics
@@ -944,6 +949,6 @@ def test_list_metrics_returns_correct_dataframe(tools):
944
949
  assert "_dispatch_v2v" not in metrics
945
950
 
946
951
  # Check content of a specific row (e.g., euclidean)
947
- euclidean_row = df[df["Metric"] == "euclidean"].iloc[0]
952
+ euclidean_row = df[df["Distance"] == "euclidean"].iloc[0]
948
953
  assert "Euclidean" in euclidean_row["Description"]
949
954
  assert "force_cpu" in euclidean_row["Usage"]
@@ -14,9 +14,8 @@ import types
14
14
 
15
15
  import numpy as np
16
16
  import pytest
17
-
18
- import distance
19
- from distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
17
+ from ds_tools import distance
18
+ from ds_tools.distance import CUPY_AVAILABLE, NUMBA_AVAILABLE
20
19
 
21
20
  # --- Helpers ---------------------------------------------------------------
22
21
 
@@ -76,7 +75,8 @@ def test_pairwise_euclidean_fallback_when_cupy_missing(monkeypatch, mocker):
76
75
  monkeypatch.delitem(distance.__dict__, "_pairwise_euclidean_cupy", raising=False)
77
76
 
78
77
  mock_numpy = mocker.patch(
79
- "distance._pairwise_euclidean_numpy", return_value=np.full((10, 12), 777.0)
78
+ "ds_tools.distance._pairwise_euclidean_numpy",
79
+ return_value=np.full((10, 12), 777.0),
80
80
  )
81
81
 
82
82
  res = dist.pairwise_euclidean(X, Y, force_cpu=False)
@@ -1,8 +1,7 @@
1
1
  # tests/test_distance_numba_cupy.py
2
2
  import numpy as np
3
3
  import pytest
4
-
5
- import distance
4
+ from ds_tools import distance
6
5
 
7
6
  NUMBA_AVAILABLE = distance.NUMBA_AVAILABLE
8
7
  CUPY_AVAILABLE = distance.CUPY_AVAILABLE
@@ -12,10 +12,9 @@
12
12
 
13
13
  import numpy as np
14
14
  import pytest
15
+ from ds_tools.models import DistributionConfig
15
16
  from scipy import stats
16
17
 
17
- from ds_tool import DistributionConfig
18
-
19
18
 
20
19
  def describe_generated_data(data: np.ndarray):
21
20
  return {
@@ -88,7 +87,7 @@ def test_generate_distribution_zero_std_case(tools, mocker):
88
87
  mean=50, median=50, std=1, min_val=0, max_val=100, skewness=0, kurtosis=3, n=10
89
88
  )
90
89
  # artificially create a situation with zero std by mocking np.std
91
- mocker.patch("ds_tool.np.std", return_value=0)
90
+ mocker.patch("ds_tools.ds_tool.np.std", return_value=0)
92
91
  data = tools.generate_distribution(config)
93
92
 
94
93
  unique_values = np.unique(data)
@@ -14,10 +14,9 @@ import numpy as np
14
14
  import pandas as pd
15
15
  import polars as pl
16
16
  import pytest
17
+ from ds_tools.models import DistributionConfig
17
18
  from scipy import stats
18
19
 
19
- from ds_tool import DistributionConfig
20
-
21
20
 
22
21
  def describe_metrics(arr: np.ndarray):
23
22
  return {
@@ -146,7 +145,7 @@ def test_generate_distribution_zero_std_case(tools, mocker):
146
145
  mean=50, median=50, std=1, min_val=0, max_val=100, skewness=0, kurtosis=3, n=10
147
146
  )
148
147
  # artificially create a situation with zero std by mocking np.std
149
- mocker.patch("ds_tool.np.std", return_value=0)
148
+ mocker.patch("ds_tools.ds_tool.np.std", return_value=0)
150
149
  data = tools.generate_distribution(config)
151
150
  unique_values = np.unique(data)
152
151
  expected_values = {config.min_val, config.max_val, config.mean}
@@ -25,9 +25,7 @@ Error and edge case handling (empty arrays, size mismatch, invalid parameters).
25
25
  import numpy as np
26
26
  import pandas as pd
27
27
  import pytest
28
-
29
- from ds_tool import Metrics
30
- from metrics import CUPY_AVAILABLE, NUMBA_AVAILABLE
28
+ from ds_tools.metrics import CUPY_AVAILABLE, NUMBA_AVAILABLE, Metrics
31
29
 
32
30
  pytestmark_cupy = pytest.mark.skipif(
33
31
  not CUPY_AVAILABLE, reason="CuPy or compatible GPU is not available"
@@ -247,10 +245,12 @@ def test_gpu_threshold_logic(tools, mocker, small_sample_data, large_sample_data
247
245
  pytest.skip("This test requires both CuPy and Numba to be installed.")
248
246
 
249
247
  mocker.patch.object(tools.metrics, "gpu_available", True)
250
- mocker.patch("metrics.cp.asarray", side_effect=lambda x: x, create=True)
248
+ mocker.patch("ds_tools.metrics.cp.asarray", side_effect=lambda x: x, create=True)
251
249
  # Mock the backends to see which one is called
252
- mock_cupy = mocker.patch("metrics._mae_cupy", return_value=1.0, create=True)
253
- mock_numba = mocker.patch("metrics._mae_numba", return_value=2.0)
250
+ mock_cupy = mocker.patch(
251
+ "ds_tools.metrics._mae_cupy", return_value=1.0, create=True
252
+ )
253
+ mock_numba = mocker.patch("ds_tools.metrics._mae_numba", return_value=2.0)
254
254
 
255
255
  # 1. Test with small data -> Numba should be called
256
256
  y_true_small, y_pred_small = small_sample_data
@@ -271,7 +271,7 @@ def test_gpu_threshold_logic(tools, mocker, small_sample_data, large_sample_data
271
271
  )
272
272
  def test_dispatch_fallback_to_numpy(tools, mocker, small_sample_data):
273
273
  """Tests that dispatcher falls back to NumPy if Numba is missing."""
274
- mock_numpy = mocker.patch("metrics._mae_numpy", return_value=99.0)
274
+ mock_numpy = mocker.patch("ds_tools.metrics._mae_numpy", return_value=99.0)
275
275
  y_true, y_pred = small_sample_data
276
276
  result = tools.metrics.mae(y_true, y_pred, force_cpu=True)
277
277
  mock_numpy.assert_called_once()
@@ -13,8 +13,7 @@ This file validates default values, constraints, and custom validation logic.
13
13
  *
14
14
  """
15
15
  import pytest
16
-
17
- from models import (
16
+ from ds_tools.models import (
18
17
  CorrelationConfig,
19
18
  DistributionConfig,
20
19
  GrubbsTestResult,
@@ -13,8 +13,7 @@
13
13
  import numpy as np
14
14
  import pandas as pd
15
15
  import pytest
16
-
17
- from ds_tool import OutlierConfig
16
+ from ds_tools.models import OutlierConfig
18
17
 
19
18
 
20
19
  @pytest.fixture(scope="module")
@@ -135,9 +135,12 @@ def test_read_dataframes_from_zip_pandas_fallback_to_csv(
135
135
  # mock pd.read_parquet so it throws an error,
136
136
  # and check that after this pd.read_csv will be called.
137
137
  mocker.patch(
138
- "ds_tool.pd.read_parquet", side_effect=ValueError("Simulated read error")
138
+ "ds_tools.ds_tool.pd.read_parquet",
139
+ side_effect=ValueError("Simulated read error"),
140
+ )
141
+ mock_read_csv = mocker.patch(
142
+ "ds_tools.ds_tool.pd.read_csv", return_value=sample_pandas_df
139
143
  )
140
- mock_read_csv = mocker.patch("ds_tool.pd.read_csv", return_value=sample_pandas_df)
141
144
 
142
145
  with tempfile.TemporaryDirectory() as temp_dir:
143
146
  # Create a dummy zip archive with a "non-parquet" file, but with the .parquet extension
File without changes