PyPI - scratchkit - Versions diffs - 0.2.0__py3-none-any.whl - Mend

scratchkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

mlscratch/__init__.py +56 -0
mlscratch/__main__.py +118 -0
mlscratch/bayesian/__init__.py +53 -0
mlscratch/bayesian/bayesian_linear_regression.py +171 -0
mlscratch/bayesian/bayesian_network.py +248 -0
mlscratch/bayesian/bayesian_nn.py +315 -0
mlscratch/bayesian/gaussian_process.py +207 -0
mlscratch/bayesian/hmm.py +277 -0
mlscratch/bayesian/init.py +52 -0
mlscratch/bayesian/kalman_filter.py +182 -0
mlscratch/bayesian/naive_bayes.py +209 -0
mlscratch/metrics/__init__.py +59 -0
mlscratch/metrics/classification.py +365 -0
mlscratch/metrics/regression.py +79 -0
mlscratch/neural/__init__.py +121 -0
mlscratch/neural/attention.py +420 -0
mlscratch/neural/autoencoder.py +543 -0
mlscratch/neural/boltzmann.py +231 -0
mlscratch/neural/cnn.py +593 -0
mlscratch/neural/cvnn.py +322 -0
mlscratch/neural/gan.py +364 -0
mlscratch/neural/hopfield.py +193 -0
mlscratch/neural/perceptron.py +398 -0
mlscratch/neural/rbf_network.py +230 -0
mlscratch/neural/recurrent.py +569 -0
mlscratch/preprocessing/__init__.py +38 -0
mlscratch/preprocessing/encoders.py +140 -0
mlscratch/preprocessing/model_selection.py +119 -0
mlscratch/preprocessing/polynomial.py +105 -0
mlscratch/preprocessing/scalers.py +220 -0
mlscratch/py.typed +0 -0
mlscratch/reinforcement/__init__.py +59 -0
mlscratch/reinforcement/ddpg.py +363 -0
mlscratch/reinforcement/dqn.py +319 -0
mlscratch/reinforcement/ppo.py +452 -0
mlscratch/reinforcement/q_learning.py +352 -0
mlscratch/reinforcement/sac.py +382 -0
mlscratch/reinforcement/utils.py +594 -0
mlscratch/supervised/__init__.py +76 -0
mlscratch/supervised/_validation.py +50 -0
mlscratch/supervised/adaboost.py +255 -0
mlscratch/supervised/decision_tree.py +495 -0
mlscratch/supervised/gradient_boosting.py +354 -0
mlscratch/supervised/knn.py +234 -0
mlscratch/supervised/lasso_regression.py +125 -0
mlscratch/supervised/linear_models.py +459 -0
mlscratch/supervised/linear_regression.py +197 -0
mlscratch/supervised/logistic_regression.py +119 -0
mlscratch/supervised/naive_bayes.py +113 -0
mlscratch/supervised/random_forest.py +321 -0
mlscratch/supervised/ridge_regression.py +93 -0
mlscratch/supervised/svm.py +356 -0
mlscratch/unsupervised/__init__.py +39 -0
mlscratch/unsupervised/apriori.py +178 -0
mlscratch/unsupervised/dbscan.py +141 -0
mlscratch/unsupervised/gmm.py +204 -0
mlscratch/unsupervised/hierarchical_clustering.py +137 -0
mlscratch/unsupervised/ica.py +167 -0
mlscratch/unsupervised/kmeans.py +135 -0
mlscratch/unsupervised/kmedoids.py +133 -0
mlscratch/unsupervised/pca.py +103 -0
mlscratch/unsupervised/tsne.py +200 -0
scratchkit-0.2.0.dist-info/METADATA +241 -0
scratchkit-0.2.0.dist-info/RECORD +68 -0
scratchkit-0.2.0.dist-info/WHEEL +5 -0
scratchkit-0.2.0.dist-info/entry_points.txt +2 -0
scratchkit-0.2.0.dist-info/licenses/LICENSE +201 -0
scratchkit-0.2.0.dist-info/top_level.txt +1 -0

mlscratch/supervised/random_forest.py ADDED Viewed

@@ -0,0 +1,321 @@
+r"""
+Random Forest
+=============
+Bootstrap-aggregated ("bagged") ensembles of :class:`DecisionTreeClassifier`
+/ :class:`DecisionTreeRegressor` trees, decorrelated by also restricting
+each tree to a random subset of features (the "random subspace" method).
+Algorithm
+---------
+For each of ``n_estimators`` trees:
+1. Draw a bootstrap sample of ``n`` rows with replacement (if
+   ``bootstrap=True``).
+2. Draw ``max_features`` columns without replacement.
+3. Fit a full (or depth-limited) tree on that bootstrap sample restricted
+   to those columns.
+``RandomForestClassifier`` combines trees by averaging their
+``predict_proba`` output (soft voting) and taking the arg-max; rows
+where a particular tree never saw a class during its bootstrap draw are
+naturally handled because that tree's probability for the missing
+class is implicitly zero, not undefined.
+``RandomForestRegressor`` combines trees by averaging their scalar
+predictions.
+Out-of-bag (OOB) estimation
+----------------------------
+When ``oob_score=True``, each tree's prediction is also collected for
+the ``~37%`` of rows it never trained on (the rows not drawn by its
+bootstrap sample), giving an unbiased estimate of generalisation
+performance without held-out data.
+Complexity
+----------
+- Training : O(n_estimators * n d log n)
+- Inference: O(n_estimators * depth)
+"""
+from __future__ import annotations
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from ._validation import validate_x, validate_xy
+from .decision_tree import DecisionTreeClassifier, DecisionTreeRegressor
+# Array aliases used in the annotations of this module.
+FloatArray = NDArray[np.float64]
+IntArray = NDArray[np.int64]
+# Sums at or below this threshold are treated as zero before being used as a divisor.
+_EPS = 1e-12
+def _resolve_max_features(max_features: int | float | str | None, n_features: int) -> int:
+    """Translate a ``max_features`` setting into a concrete column count.
+    Parameters
+    ----------
+    max_features : int | float | str | None
+        ``None`` keeps every feature; ``'sqrt'`` / ``'log2'`` use the
+        truncated square root / base-2 logarithm of ``n_features``; a float
+        in (0, 1] is a fraction of ``n_features`` (rounded to the nearest
+        integer); an int is an exact count, capped at ``n_features``.
+    n_features : int
+        Number of columns available.
+    Returns
+    -------
+    int
+        Number of columns to sample. Every branch except ``None`` returns at
+        least 1; ``None`` returns ``n_features`` unchanged.
+    Raises
+    ------
+    ValueError
+        If the string is not recognised, the float lies outside (0, 1], or
+        the integer count is smaller than 1.
+    """
+    if max_features is None:
+        return n_features
+    if isinstance(max_features, str):
+        if max_features == "sqrt":
+            return max(1, int(np.sqrt(n_features)))
+        if max_features == "log2":
+            return max(1, int(np.log2(n_features)))
+        raise ValueError("max_features must be None, int, float, 'sqrt', or 'log2'.")
+    if isinstance(max_features, float):
+        if not (0.0 < max_features <= 1.0):
+            raise ValueError("max_features as a float must be in (0, 1].")
+        return max(1, int(round(max_features * n_features)))
+    count = int(max_features)
+    # Reject non-positive counts instead of quietly promoting them to 1,
+    # mirroring the range check applied to the float form above.
+    if count < 1:
+        raise ValueError("max_features as an int must be >= 1.")
+    return max(1, min(count, n_features))
+# ──────────────────────────────────────────────────────────────────────────
+# RandomForestClassifier
+# ──────────────────────────────────────────────────────────────────────────
+class RandomForestClassifier:
+    """Bagged ensemble of decision-tree classifiers with feature subsampling.
+    Every tree is fit on its own bootstrap sample of the rows (when
+    ``bootstrap=True``) and on one random subset of the columns; the forest
+    predicts by averaging the trees' class probabilities.
+    Parameters
+    ----------
+    n_estimators : int, default=100
+        Number of trees; must be at least 1.
+    max_depth : int | None, default=None
+        Forwarded to each tree.
+    min_samples_split : int, default=2
+        Forwarded to each tree.
+    min_samples_leaf : int, default=1
+        Forwarded to each tree.
+    criterion : str, default='gini'
+        ``'gini'`` or ``'entropy'``, forwarded to each tree.
+    max_features : int | float | str | None, default='sqrt'
+        Number of features considered by each tree: an int (exact count),
+        a float in (0, 1] (fraction), ``'sqrt'``, ``'log2'``, or ``None``
+        (use all features).
+    bootstrap : bool, default=True
+        Whether each tree is trained on a bootstrap resample.
+    oob_score : bool, default=False
+        Whether to compute an out-of-bag accuracy estimate (``oob_score_``).
+        Requires ``bootstrap=True``.
+    random_state : int | None, default=None
+        Seed for the NumPy generator that draws rows and columns.
+    Attributes
+    ----------
+    estimators_ : list of (tree, feature_indices) tuples
+    classes_ : sorted unique labels seen during fit
+    n_features_in_ : number of columns in the training data; checked again
+        when predicting
+    feature_importances_ : mean impurity-decrease importance across trees,
+        rescaled to sum to 1 when the total is non-zero
+    oob_score_ : float, only set when ``oob_score=True``
+    """
+    def __init__(
+        self,
+        n_estimators: int = 100,
+        max_depth: int | None = None,
+        min_samples_split: int = 2,
+        min_samples_leaf: int = 1,
+        criterion: str = "gini",
+        max_features: int | float | str | None = "sqrt",
+        bootstrap: bool = True,
+        oob_score: bool = False,
+        random_state: int | None = None,
+    ) -> None:
+        """Store the hyper-parameters and reset all fitted attributes.
+        Raises
+        ------
+        ValueError
+            If ``n_estimators`` is below 1, or ``oob_score`` is requested
+            without ``bootstrap``.
+        """
+        if int(n_estimators) < 1:
+            raise ValueError("n_estimators must be >= 1.")
+        if oob_score and not bootstrap:
+            raise ValueError("oob_score requires bootstrap=True.")
+        self.n_estimators = int(n_estimators)
+        self.max_depth = max_depth
+        self.min_samples_split = int(min_samples_split)
+        self.min_samples_leaf = int(min_samples_leaf)
+        self.criterion = criterion
+        self.max_features = max_features
+        self.bootstrap = bootstrap
+        self.oob_score = oob_score
+        self.random_state = random_state
+        self.estimators_: list[tuple[DecisionTreeClassifier, IntArray]] = []
+        self.classes_: IntArray | None = None
+        self.n_features_in_: int | None = None
+        self.feature_importances_: FloatArray | None = None
+        self.oob_score_: float | None = None
+    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomForestClassifier:
+        """Fit ``n_estimators`` trees and, if requested, the OOB accuracy.
+        Any previously fitted trees are discarded. Returns ``self``.
+        """
+        X_arr, y_raw = validate_xy(X, y)
+        # Encode labels as positions in ``classes_`` so that they can be used
+        # directly as column indices of the probability matrices below.
+        self.classes_, y_idx = np.unique(y_raw, return_inverse=True)
+        y_idx = y_idx.astype(np.int64)
+        n_samples, n_features = X_arr.shape
+        self.n_features_in_ = n_features
+        n_classes = self.classes_.size
+        n_feat_sub = _resolve_max_features(self.max_features, n_features)
+        rng = np.random.default_rng(self.random_state)
+        self.estimators_ = []
+        importances = np.zeros(n_features, dtype=np.float64)
+        oob_proba = np.zeros((n_samples, n_classes)) if self.oob_score else None
+        oob_count = np.zeros(n_samples, dtype=np.int64) if self.oob_score else None
+        for _ in range(self.n_estimators):
+            # Rows are drawn before columns; keeping this order keeps results
+            # reproducible for a given ``random_state``.
+            sample_idx = (
+                rng.integers(0, n_samples, n_samples) if self.bootstrap else np.arange(n_samples)
+            )
+            feat_idx = rng.choice(n_features, size=n_feat_sub, replace=False)
+            tree = DecisionTreeClassifier(
+                max_depth=self.max_depth,
+                min_samples_split=self.min_samples_split,
+                min_samples_leaf=self.min_samples_leaf,
+                criterion=self.criterion,
+            )
+            tree.fit(X_arr[sample_idx][:, feat_idx], y_idx[sample_idx])
+            self.estimators_.append((tree, feat_idx))
+            # The tree reports importances for its own columns only; scatter
+            # them back to the positions of the full feature set.
+            importances[feat_idx] += tree.feature_importances_
+            if self.oob_score:
+                in_bag = np.zeros(n_samples, dtype=bool)
+                in_bag[sample_idx] = True
+                oob_idx = np.flatnonzero(~in_bag)
+                if oob_idx.size:
+                    proba = tree.predict_proba(X_arr[oob_idx][:, feat_idx])
+                    # ``tree.classes_`` is used as the forest-wide class column
+                    # for each of the tree's probability columns; classes the
+                    # tree does not report simply receive no contribution.
+                    oob_proba[np.ix_(oob_idx, tree.classes_)] += proba
+                    oob_count[oob_idx] += 1
+        importances /= self.n_estimators
+        total = importances.sum()
+        self.feature_importances_ = importances / total if total > _EPS else importances
+        if self.oob_score:
+            # Only rows left out by at least one tree have an OOB prediction.
+            has_oob = oob_count > 0
+            if np.any(has_oob):
+                pred_idx = np.argmax(oob_proba[has_oob], axis=1)
+                self.oob_score_ = float(np.mean(pred_idx == y_idx[has_oob]))
+            else:
+                self.oob_score_ = float("nan")
+        return self
+    def predict_proba(self, X: ArrayLike) -> FloatArray:
+        """Return the tree-averaged class probabilities for ``X``.
+        The result has one row per sample and one column per entry of
+        ``classes_``.
+        Raises
+        ------
+        RuntimeError
+            If the forest has not been fitted.
+        ValueError
+            If ``X`` does not have the number of columns seen during fit.
+        """
+        if not self.estimators_:
+            raise RuntimeError("Call fit() before predict_proba().")
+        X_arr = validate_x(X)
+        # Without this check, extra columns would be accepted silently and
+        # missing columns would surface as an IndexError from the fancy
+        # indexing below.
+        if self.n_features_in_ is not None and X_arr.shape[1] != self.n_features_in_:
+            raise ValueError(
+                f"X has {X_arr.shape[1]} features, but RandomForestClassifier "
+                f"was fitted with {self.n_features_in_} features."
+            )
+        n_classes = self.classes_.size
+        proba = np.zeros((X_arr.shape[0], n_classes), dtype=np.float64)
+        for tree, feat_idx in self.estimators_:
+            p = tree.predict_proba(X_arr[:, feat_idx])
+            proba[:, tree.classes_] += p
+        proba /= len(self.estimators_)
+        return proba
+    def predict(self, X: ArrayLike) -> NDArray:
+        """Return, for each row of ``X``, the label with the highest averaged probability."""
+        proba = self.predict_proba(X)
+        return self.classes_[np.argmax(proba, axis=1)]
+    def score(self, X: ArrayLike, y: ArrayLike) -> float:
+        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
+        X_arr, y_arr = validate_xy(X, y)
+        return float(np.mean(self.predict(X_arr) == y_arr))
+# ──────────────────────────────────────────────────────────────────────────
+# RandomForestRegressor
+# ──────────────────────────────────────────────────────────────────────────
+class RandomForestRegressor:
+    """Bagged ensemble of decision-tree regressors with feature subsampling.
+    Parameters mirror :class:`RandomForestClassifier`, except
+    ``max_features`` defaults to ``1.0`` (every tree is given all features,
+    the conventional bagging-regressor default) and there is no
+    ``criterion`` argument (none is forwarded to the trees).
+    Parameters
+    ----------
+    n_estimators : int, default=100
+        Number of trees; must be at least 1.
+    max_depth : int | None, default=None
+        Forwarded to each tree.
+    min_samples_split : int, default=2
+        Forwarded to each tree.
+    min_samples_leaf : int, default=1
+        Forwarded to each tree.
+    max_features : int | float | str | None, default=1.0
+        Number of features handed to each tree: an int (exact count), a
+        float in (0, 1] (fraction), ``'sqrt'``, ``'log2'``, or ``None``
+        (use all features).
+    bootstrap : bool, default=True
+        Whether each tree is trained on a bootstrap resample.
+    oob_score : bool, default=False
+        Whether to compute an out-of-bag R^2 estimate (``oob_score_``).
+        Requires ``bootstrap=True``.
+    random_state : int | None, default=None
+        Seed for the NumPy generator that draws rows and columns.
+    Attributes
+    ----------
+    estimators_ : list of (tree, feature_indices) tuples
+    n_features_in_ : number of columns in the training data; checked again
+        when predicting
+    feature_importances_ : mean importance across trees, rescaled to sum
+        to 1 when the total is non-zero
+    oob_score_ : float, only set when ``oob_score=True``
+    """
+    def __init__(
+        self,
+        n_estimators: int = 100,
+        max_depth: int | None = None,
+        min_samples_split: int = 2,
+        min_samples_leaf: int = 1,
+        max_features: int | float | str | None = 1.0,
+        bootstrap: bool = True,
+        oob_score: bool = False,
+        random_state: int | None = None,
+    ) -> None:
+        """Store the hyper-parameters and reset all fitted attributes.
+        Raises
+        ------
+        ValueError
+            If ``n_estimators`` is below 1, or ``oob_score`` is requested
+            without ``bootstrap``.
+        """
+        if int(n_estimators) < 1:
+            raise ValueError("n_estimators must be >= 1.")
+        if oob_score and not bootstrap:
+            raise ValueError("oob_score requires bootstrap=True.")
+        self.n_estimators = int(n_estimators)
+        self.max_depth = max_depth
+        self.min_samples_split = int(min_samples_split)
+        self.min_samples_leaf = int(min_samples_leaf)
+        self.max_features = max_features
+        self.bootstrap = bootstrap
+        self.oob_score = oob_score
+        self.random_state = random_state
+        self.estimators_: list[tuple[DecisionTreeRegressor, IntArray]] = []
+        self.n_features_in_: int | None = None
+        self.feature_importances_: FloatArray | None = None
+        self.oob_score_: float | None = None
+    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomForestRegressor:
+        """Fit ``n_estimators`` trees and, if requested, the OOB R^2.
+        Any previously fitted trees are discarded. Returns ``self``.
+        """
+        X_arr, y_arr = validate_xy(X, y)
+        y_arr = y_arr.astype(np.float64)
+        n_samples, n_features = X_arr.shape
+        self.n_features_in_ = n_features
+        n_feat_sub = _resolve_max_features(self.max_features, n_features)
+        rng = np.random.default_rng(self.random_state)
+        self.estimators_ = []
+        importances = np.zeros(n_features, dtype=np.float64)
+        oob_sum = np.zeros(n_samples) if self.oob_score else None
+        oob_count = np.zeros(n_samples, dtype=np.int64) if self.oob_score else None
+        for _ in range(self.n_estimators):
+            # Rows are drawn before columns; keeping this order keeps results
+            # reproducible for a given ``random_state``.
+            sample_idx = (
+                rng.integers(0, n_samples, n_samples) if self.bootstrap else np.arange(n_samples)
+            )
+            feat_idx = rng.choice(n_features, size=n_feat_sub, replace=False)
+            tree = DecisionTreeRegressor(
+                max_depth=self.max_depth,
+                min_samples_split=self.min_samples_split,
+                min_samples_leaf=self.min_samples_leaf,
+            )
+            tree.fit(X_arr[sample_idx][:, feat_idx], y_arr[sample_idx])
+            self.estimators_.append((tree, feat_idx))
+            # The tree reports importances for its own columns only; scatter
+            # them back to the positions of the full feature set.
+            importances[feat_idx] += tree.feature_importances_
+            if self.oob_score:
+                in_bag = np.zeros(n_samples, dtype=bool)
+                in_bag[sample_idx] = True
+                oob_idx = np.flatnonzero(~in_bag)
+                if oob_idx.size:
+                    oob_sum[oob_idx] += tree.predict(X_arr[oob_idx][:, feat_idx])
+                    oob_count[oob_idx] += 1
+        importances /= self.n_estimators
+        total = importances.sum()
+        self.feature_importances_ = importances / total if total > _EPS else importances
+        if self.oob_score:
+            # Only rows left out by at least one tree have an OOB prediction.
+            has_oob = oob_count > 0
+            if np.any(has_oob):
+                oob_pred = oob_sum[has_oob] / oob_count[has_oob]
+                y_true = y_arr[has_oob]
+                ss_res = np.sum((y_true - oob_pred) ** 2)
+                ss_tot = np.sum((y_true - y_true.mean()) ** 2)
+                # A (near-)constant target makes R^2 undefined; report 0.0.
+                self.oob_score_ = float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0
+            else:
+                self.oob_score_ = float("nan")
+        return self
+    def predict(self, X: ArrayLike) -> FloatArray:
+        """Return the mean of the trees' predictions for each row of ``X``.
+        Raises
+        ------
+        RuntimeError
+            If the forest has not been fitted.
+        ValueError
+            If ``X`` does not have the number of columns seen during fit.
+        """
+        if not self.estimators_:
+            raise RuntimeError("Call fit() before predict().")
+        X_arr = validate_x(X)
+        # Without this check, extra columns would be accepted silently and
+        # missing columns would surface as an IndexError from the fancy
+        # indexing below.
+        if self.n_features_in_ is not None and X_arr.shape[1] != self.n_features_in_:
+            raise ValueError(
+                f"X has {X_arr.shape[1]} features, but RandomForestRegressor "
+                f"was fitted with {self.n_features_in_} features."
+            )
+        preds = np.zeros(X_arr.shape[0], dtype=np.float64)
+        for tree, feat_idx in self.estimators_:
+            preds += tree.predict(X_arr[:, feat_idx])
+        return preds / len(self.estimators_)
+    def score(self, X: ArrayLike, y: ArrayLike) -> float:
+        """Return R^2 of the predictions for ``X`` against ``y`` (0.0 for a near-constant ``y``)."""
+        X_arr, y_arr = validate_xy(X, y)
+        preds = self.predict(X_arr)
+        ss_res = np.sum((y_arr - preds) ** 2)
+        ss_tot = np.sum((y_arr - y_arr.mean()) ** 2)
+        return float(1.0 - ss_res / ss_tot) if ss_tot > _EPS else 0.0

mlscratch/supervised/ridge_regression.py ADDED Viewed

@@ -0,0 +1,93 @@
+"""
+Ridge Regression
+================
+Ridge regression using the closed-form regularized normal equations.
+"""
+from __future__ import annotations
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+# Alias for float64 arrays, used in the annotations of this module.
+FloatArray = NDArray[np.float64]
+def _validate_regression_inputs(
+    X: ArrayLike, y: ArrayLike,
+) -> tuple[FloatArray, FloatArray]:
+    """Coerce ``X`` and ``y`` to float arrays and check that they line up.
+    ``y`` is flattened to one dimension; ``X`` must be two-dimensional and
+    have exactly one row per element of the flattened ``y``.
+    Returns
+    -------
+    tuple[FloatArray, FloatArray]
+        The converted ``X`` and the flattened ``y``.
+    Raises
+    ------
+    ValueError
+        If ``X`` is not 2D or the two sample counts differ.
+    """
+    features = np.asarray(X, dtype=float)
+    targets = np.asarray(y, dtype=float).flatten()
+    if features.ndim != 2:
+        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
+    n_rows = features.shape[0]
+    n_targets = targets.shape[0]
+    if n_rows != n_targets:
+        raise ValueError(
+            f"X has {n_rows} samples but y has {n_targets}."
+        )
+    return features, targets
+class RidgeRegression:
+    """Linear least squares with an L2 (ridge) penalty, solved in closed form.
+    Parameters
+    ----------
+    alpha : float, default=1.0
+        Weight of the L2 penalty applied to the coefficients.
+    add_intercept : bool, default=True
+        If true, an unpenalized intercept is estimated with the coefficients.
+    Attributes
+    ----------
+    coef_ : FloatArray
+        Per-feature coefficients, available after ``fit``.
+    intercept_ : float
+        Fitted intercept (``0.0`` when ``add_intercept`` is false).
+    """
+    def __init__(self, alpha: float = 1.0, add_intercept: bool = True) -> None:
+        self.coef_: FloatArray | None = None
+        self.intercept_: float | None = None
+        self.add_intercept = add_intercept
+        self.alpha = float(alpha)
+    def fit(self, X: ArrayLike, y: ArrayLike) -> "RidgeRegression":
+        """Estimate the coefficients from the regularized normal equations."""
+        design, targets = _validate_regression_inputs(X, y)
+        with_bias = self.add_intercept
+        if with_bias:
+            # Prepend a column of ones so the intercept is solved for jointly.
+            design = np.column_stack([np.ones(design.shape[0]), design])
+        penalty = np.eye(design.shape[1])
+        if with_bias:
+            # The intercept column is left out of the penalty.
+            penalty[0, 0] = 0.0
+        gram = design.T @ design + self.alpha * penalty
+        moment = design.T @ targets
+        weights = np.linalg.solve(gram, moment)
+        if not with_bias:
+            self.intercept_ = 0.0
+            self.coef_ = weights.astype(np.float64)
+            return self
+        self.intercept_ = float(weights[0])
+        self.coef_ = weights[1:].astype(np.float64)
+        return self
+    def predict(self, X: ArrayLike) -> FloatArray:
+        """Evaluate the fitted linear model on the rows of ``X``."""
+        fitted = self.coef_ is not None and self.intercept_ is not None
+        if not fitted:
+            raise RuntimeError("Call fit() before predict().")
+        features = np.asarray(X, dtype=float)
+        if features.ndim != 2:
+            raise ValueError("X must be a 2D array.")
+        linear = features @ self.coef_ + self.intercept_
+        return linear.astype(np.float64)
+    def score(self, X: ArrayLike, y: ArrayLike) -> float:
+        """Return R² of the model on ``X`` / ``y`` (0.0 when ``y`` has no variance)."""
+        features, targets = _validate_regression_inputs(X, y)
+        residuals = targets - self.predict(features)
+        ss_res = np.sum(residuals ** 2)
+        centered = targets - np.mean(targets)
+        ss_tot = np.sum(centered ** 2)
+        if ss_tot > 0:
+            return float(1.0 - ss_res / ss_tot)
+        return 0.0