PyPI - bm-preprocessing - Versions diffs - 0.4.0__tar.gz → 0.6.0__tar.gz - Mend

bm-preprocessing 0.4.0.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: bm-preprocessing
-Version: 0.4.0
+Version: 0.6.0
 Summary: A package to preprocess text data
 Requires-Python: >=3.8
 Requires-Dist: build>=1.2.2.post1
 Requires-Dist: graphviz>=0.20.3
 Requires-Dist: matplotlib>=3.7.5
 Requires-Dist: pandas>=2.0.3
+Requires-Dist: scikit-learn>=1.3.2
 Requires-Dist: twine>=6.1.0
 Description-Content-Type: text/markdown

{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/USAGE.md RENAMED Viewed

@@ -15,7 +15,7 @@ Create a file `example.py`:
 ```python
 # Import modules
 from bm_preprocessing.IR import all
-from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, id3, id3_test, preprocessing
+from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
 # Print the source code
 print("=== IR All Module ===")
@@ -24,6 +24,12 @@ print(all)
 print("\n=== DM Apriori Module ===")
 print(apriori)
+print("\n=== DM AdaBoost Module ===")
+print(adaboost)
+print("\n=== DM Bagging Module ===")
+print(bagging)
 print("\n=== DM Hash Module ===")
 print(hash)
@@ -67,6 +73,14 @@ Then in the Python REPL:
 >>> print(apriori)
 # Prints entire DM/apriori.py source code
+>>> from bm_preprocessing.DM import adaboost
+>>> print(adaboost)
+# Prints entire DM/adaboost.py source code
+>>> from bm_preprocessing.DM import bagging
+>>> print(bagging)
+# Prints entire DM/bagging.py source code
 >>> from bm_preprocessing.DM import hunts, hunts_test
 >>> print(hunts)
 # Prints entire DM/hunts.py source code
@@ -87,6 +101,8 @@ Then in the Python REPL:
 ```bash
 python -c "from bm_preprocessing.IR import all; print(all)"
 python -c "from bm_preprocessing.DM import apriori; print(apriori)"
+python -c "from bm_preprocessing.DM import adaboost; print(adaboost)"
+python -c "from bm_preprocessing.DM import bagging; print(bagging)"
 python -c "from bm_preprocessing.DM import hash; print(hash)"
 python -c "from bm_preprocessing.DM import hunts; print(hunts)"
 python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
@@ -104,6 +120,8 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
 | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
 | `from bm_preprocessing.DM import all` | Data Mining algorithms |
 | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
+| `from bm_preprocessing.DM import adaboost` | Bagging & AdaBoost ensemble classifiers |
+| `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
 | `from bm_preprocessing.DM import hash` | Hash-based mining |
 | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
 | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |

{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "bm-preprocessing"
-version = "0.4.0"
+version = "0.6.0"
 description = "A package to preprocess text data"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -13,6 +13,7 @@ dependencies = [
     "graphviz>=0.20.3",
     "matplotlib>=3.7.5",
     "pandas>=2.0.3",
+    "scikit-learn>=1.3.2",
     "twine>=6.1.0",
 ]

{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/__init__.py RENAMED Viewed

@@ -1,7 +1,9 @@
 """DM subpackage - Data Mining source code."""
+from .adaboost import adaboost
 from .all import all
 from .apriori import apriori
+from .bagging import bagging
 from .hash import hash
 from .hunts import hunts
 from .hunts_test import hunts_test
@@ -9,5 +11,5 @@ from .id3 import id3
 from .id3_test import id3_test
 from .preprocessing import preprocessing
-__all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
+__all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]

bm_preprocessing-0.6.0/src/bm_preprocessing/DM/adaboost.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Source code loader for DM/adaboost.py"""
+from pathlib import Path
+class SourceCodeModule:  # wrapper object whose str()/repr() are the text of the file at source_path
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):  # only stores its arguments; the file is not opened here
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None  # cache slot: stays None until source_code is first read
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:  # first access only: read the file once, later accesses reuse the cached text
+            self._source_code = self._source_path.read_text(encoding="utf-8")  # errors from read_text (e.g. missing file) propagate to the caller
+        return self._source_code
+    def __repr__(self) -> str:  # returns the whole file text rather than a constructor-style repr
+        return self.source_code
+    def __str__(self) -> str:  # so print(obj) writes the file text
+        return self.source_code  # NOTE(review): DM/bagging.py in this diff defines an identical class; a shared helper may be intended
+# Build the path to sources/adaboost.py relative to this module's own directory
+_source_file = Path(__file__).parent / "sources" / "adaboost.py"
+adaboost = SourceCodeModule("DM.adaboost", _source_file)  # the object DM/__init__.py re-exports as `adaboost`; nothing is read from disk until it is printed

bm_preprocessing-0.6.0/src/bm_preprocessing/DM/bagging.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Source code loader for DM/bagging.py"""
+from pathlib import Path
+class SourceCodeModule:  # wrapper object whose str()/repr() are the text of the file at source_path
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):  # only stores its arguments; the file is not opened here
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None  # cache slot: stays None until source_code is first read
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:  # first access only: read the file once, later accesses reuse the cached text
+            self._source_code = self._source_path.read_text(encoding="utf-8")  # errors from read_text (e.g. missing file) propagate to the caller
+        return self._source_code
+    def __repr__(self) -> str:  # returns the whole file text rather than a constructor-style repr
+        return self.source_code
+    def __str__(self) -> str:  # so print(obj) writes the file text
+        return self.source_code  # NOTE(review): DM/adaboost.py in this diff defines an identical class; a shared helper may be intended
+# Build the path to sources/bagging.py relative to this module's own directory
+_source_file = Path(__file__).parent / "sources" / "bagging.py"
+bagging = SourceCodeModule("DM.bagging", _source_file)  # the object DM/__init__.py re-exports as `bagging`; nothing is read from disk until it is printed

bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/adaboost.py ADDED Viewed

@@ -0,0 +1,69 @@
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
+from sklearn.metrics import accuracy_score, classification_report
+# Load the iris dataset that ships with scikit-learn and split it into train/test sets
+iris = load_iris()
+X = iris.data  # feature matrix
+y = iris.target  # integer class labels
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42, stratify=y  # hold out 30%; stratify=y keeps class proportions in both parts; fixed seed for repeatability
+)
+# ==========================================
+# Bagging Classifier
+# ==========================================
+print("=" * 50)
+print("BAGGING CLASSIFIER")
+print("=" * 50)
+bagging_model = BaggingClassifier(
+    estimator=DecisionTreeClassifier(random_state=42),  # `estimator=` is the keyword that replaced `base_estimator` in scikit-learn 1.2
+    n_estimators=10,  # ensemble of 10 trees
+    random_state=42,
+)
+bagging_model.fit(X_train, y_train)
+y_pred_bagging = bagging_model.predict(X_test)  # kept for the comparison section at the end
+print(f"\nAccuracy: {accuracy_score(y_test, y_pred_bagging):.4f}")
+print(f"\nFirst 10 Predictions: {y_pred_bagging[:10]}")
+print(f"First 10 Actual:      {y_test[:10]}")
+print(f"\nClassification Report:\n{classification_report(y_test, y_pred_bagging, target_names=iris.target_names)}")
+# ==========================================
+# AdaBoost Classifier
+# ==========================================
+print("=" * 50)
+print("ADABOOST CLASSIFIER")
+print("=" * 50)
+adaboost_model = AdaBoostClassifier(
+    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),  # depth-1 trees (decision stumps) as the weak learner
+    n_estimators=50,  # upper bound on boosting rounds
+    learning_rate=1.0,
+    random_state=42,
+)
+adaboost_model.fit(X_train, y_train)
+y_pred_adaboost = adaboost_model.predict(X_test)  # kept for the comparison section at the end
+print(f"\nAccuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}")
+print(f"\nFirst 10 Predictions: {y_pred_adaboost[:10]}")
+print(f"First 10 Actual:      {y_test[:10]}")
+print(f"\nClassification Report:\n{classification_report(y_test, y_pred_adaboost, target_names=iris.target_names)}")
+# ==========================================
+# Comparison (re-prints the two test accuracies side by side)
+# ==========================================
+print("=" * 50)
+print("COMPARISON")
+print("=" * 50)
+print(f"Bagging Accuracy:  {accuracy_score(y_test, y_pred_bagging):.4f}")  # same value as printed in the bagging section, recomputed
+print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}")  # same value as printed in the AdaBoost section, recomputed

bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/bagging.py ADDED Viewed

@@ -0,0 +1,173 @@
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from collections import Counter
+iris = load_iris()  # iris dataset bundled with scikit-learn
+X = iris.data  # feature matrix
+y = iris.target  # integer class labels
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42, stratify=y  # hold out 30%; stratify=y keeps class proportions in both parts; fixed seed for repeatability
+)
+def gini(y):  # Gini impurity of a label array: 1 - sum over classes of p_k**2
+    counts = np.bincount(y)  # occurrences of each label value; np.bincount requires non-negative integers
+    probs = counts / len(y)  # class frequencies; TreeNode.fit only calls this on non-empty subsets
+    return 1 - np.sum(probs**2)  # 0 when every label is the same, larger when labels are mixed
+class TreeNode:  # one node of a binary decision tree; fit() grows the subtree below it, predict() routes rows through it
+    def __init__(self, depth=0, max_depth=3):
+        self.depth = depth  # depth of this node; children are created with depth + 1
+        self.max_depth = max_depth  # nodes at this depth become leaves
+        self.feature = None  # column index of the chosen split (internal nodes only)
+        self.threshold = None  # rows with feature value <= threshold go to the left child
+        self.left = None
+        self.right = None
+        self.value = None  # predicted label; set only on leaves
+    def fit(self, X, y):  # assumes X is a 2-D NumPy array and y a NumPy array of integer labels (boolean-mask indexing, gini uses bincount)
+        if len(set(y)) == 1:  # pure node: every sample has the same label
+            self.value = y[0]
+            return
+        if self.depth >= self.max_depth or len(y) <= 2:  # depth limit reached or too few samples: majority-label leaf
+            self.value = Counter(y).most_common(1)[0][0]
+            return
+        n_samples, n_features = X.shape
+        best_gini = 1.0  # starting bound; a candidate split must score strictly below it to be accepted
+        for feature in range(n_features):
+            thresholds = np.unique(X[:, feature])  # every distinct value of this feature is tried as a threshold
+            for t in thresholds:
+                left_mask = X[:, feature] <= t
+                right_mask = X[:, feature] > t
+                if sum(left_mask) == 0 or sum(right_mask) == 0:  # skip splits that leave one side empty
+                    continue
+                g = (sum(left_mask)/n_samples)*gini(y[left_mask]) + \
+                    (sum(right_mask)/n_samples)*gini(y[right_mask])  # size-weighted Gini impurity of the two children
+                if g < best_gini:  # strict '<': on ties the earliest feature/threshold found is kept
+                    best_gini = g
+                    self.feature = feature
+                    self.threshold = t
+        if self.feature is None:  # no usable split was found (e.g. each feature has a single distinct value): majority-label leaf
+            self.value = Counter(y).most_common(1)[0][0]
+            return
+        left_mask = X[:, self.feature] <= self.threshold  # recompute the masks for the winning split
+        right_mask = X[:, self.feature] > self.threshold
+        self.left = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
+        self.left.fit(X[left_mask], y[left_mask])  # recurse on each side's samples
+        self.right = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
+        self.right.fit(X[right_mask], y[right_mask])
+    def predict(self, X):  # returns one predicted label per row of X
+        if self.value is not None:  # leaf; 'is not None' so that label 0 still counts as a leaf value
+            return np.array([self.value]*len(X))
+        left_mask = X[:, self.feature] <= self.threshold  # same routing rule as in fit
+        right_mask = X[:, self.feature] > self.threshold
+        y_pred = np.empty(X.shape[0], dtype=int)  # integer output buffer, filled below by the two children
+        if sum(left_mask) > 0:  # only descend into a child that receives at least one row
+            y_pred[left_mask] = self.left.predict(X[left_mask])
+        if sum(right_mask) > 0:
+            y_pred[right_mask] = self.right.predict(X[right_mask])
+        return y_pred
+np.random.seed(42)  # seeds NumPy's global RNG so the bootstrap samples below are repeatable
+n_estimators = 10  # number of trees in the ensemble
+models = []
+for i in range(n_estimators):
+    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)  # bootstrap sample: same size as the training set, drawn with replacement
+    X_sample = X_train[indices]
+    y_sample = y_train[indices]
+    tree = TreeNode(max_depth=3)
+    tree.fit(X_sample, y_sample)
+    models.append(tree)
+all_preds = np.array([model.predict(X_test) for model in models])  # one row per tree, one column per test sample
+final_preds = []
+for i in range(len(X_test)):
+    votes = Counter(all_preds[:, i])  # tally the trees' predictions for test sample i
+    final_preds.append(votes.most_common(1)[0][0])  # majority vote; most_common orders equal counts by first appearance
+final_preds = np.array(final_preds)
+accuracy = np.sum(final_preds == y_test) / len(y_test)  # fraction of test samples predicted correctly
+print("First 10 Predictions:", final_preds[:10])
+print("Accuracy:", accuracy)
+"""Fully libraries"""
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import BaggingClassifier
+from sklearn.metrics import accuracy_score
+iris = load_iris()  # reloads iris and rebinds X, y and the split names used earlier in this file
+X = iris.data
+y = iris.target
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y,
+    test_size=0.3,  # hold out 30% for testing
+    random_state=42,  # fixed seed for repeatability
+    stratify=y  # keep class proportions in both parts
+)
+base_classifier = DecisionTreeClassifier(random_state=42)
+bagging_model = BaggingClassifier(
+    estimator=base_classifier,  # `estimator=` is the keyword that replaced `base_estimator` in scikit-learn 1.2
+    n_estimators=10,  # ensemble of 10 trees
+    random_state=42
+)
+bagging_model.fit(X_train, y_train)
+y_pred = bagging_model.predict(X_test)
+accuracy = accuracy_score(y_test, y_pred)  # fraction of test samples predicted correctly
+print("Bagging Classifier Accuracy:", accuracy)
+"""Only DT as library function"""
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+from collections import Counter
+import numpy as np
+iris = load_iris()  # reloads iris and rebinds X, y and the split names used earlier in this file
+X = iris.data
+y = iris.target
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42, stratify=y  # hold out 30%; stratify=y keeps class proportions in both parts; fixed seed for repeatability
+)
+np.random.seed(42)  # seeds NumPy's global RNG so the bootstrap samples below are repeatable
+n_estimators = 10  # number of trees in the ensemble
+estimators = []
+for i in range(n_estimators):
+    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)  # bootstrap sample: same size as the training set, drawn with replacement
+    X_sample = X_train[indices]
+    y_sample = y_train[indices]
+    tree = DecisionTreeClassifier(random_state=42)  # every tree gets the same seed; the bootstrap sample is what differs between them
+    tree.fit(X_sample, y_sample)
+    estimators.append(tree)
+all_preds = np.array([tree.predict(X_test) for tree in estimators])  # one row per tree, one column per test sample
+final_preds = []
+for i in range(len(X_test)):
+    votes = Counter(all_preds[:, i])  # tally the trees' predictions for test sample i
+    final_preds.append(votes.most_common(1)[0][0])  # majority vote; most_common orders equal counts by first appearance
+final_preds = np.array(final_preds)
+accuracy = accuracy_score(y_test, final_preds)  # fraction of test samples predicted correctly
+print("First 10 Predictions:", final_preds[:10])
+print("Manual Bagging Accuracy:", accuracy)