bm-preprocessing 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/PKG-INFO +2 -1
  2. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/USAGE.md +26 -1
  3. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/pyproject.toml +2 -1
  4. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/__init__.py +5 -1
  5. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/bagging.py +30 -0
  6. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/id3.py +30 -0
  7. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/id3_test.py +30 -0
  8. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/bagging.py +173 -0
  9. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/id3.py +134 -0
  10. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/id3_test.py +148 -0
  11. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/tennis.csv +15 -0
  12. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/.gitignore +0 -0
  13. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/README.md +0 -0
  14. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/all.py +0 -0
  15. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  16. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hash.py +0 -0
  17. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  18. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  19. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  20. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  21. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  22. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  23. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  24. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  25. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  26. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  27. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  28. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/all.py +0 -0
  29. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  30. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
7
7
  Requires-Dist: graphviz>=0.20.3
8
8
  Requires-Dist: matplotlib>=3.7.5
9
9
  Requires-Dist: pandas>=2.0.3
10
+ Requires-Dist: scikit-learn>=1.3.2
10
11
  Requires-Dist: twine>=6.1.0
11
12
  Description-Content-Type: text/markdown
12
13
 
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, preprocessing
18
+ from bm_preprocessing.DM import apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -24,6 +24,9 @@ print(all)
24
24
  print("\n=== DM Apriori Module ===")
25
25
  print(apriori)
26
26
 
27
+ print("\n=== DM Bagging Module ===")
28
+ print(bagging)
29
+
27
30
  print("\n=== DM Hash Module ===")
28
31
  print(hash)
29
32
 
@@ -33,6 +36,12 @@ print(hunts)
33
36
  print("\n=== DM Hunts Test Module ===")
34
37
  print(hunts_test)
35
38
 
39
+ print("\n=== DM ID3 Module ===")
40
+ print(id3)
41
+
42
+ print("\n=== DM ID3 Test Module ===")
43
+ print(id3_test)
44
+
36
45
  print("\n=== DM Preprocessing Module ===")
37
46
  print(preprocessing)
38
47
  ```
@@ -61,11 +70,21 @@ Then in the Python REPL:
61
70
  >>> print(apriori)
62
71
  # Prints entire DM/apriori.py source code
63
72
 
73
+ >>> from bm_preprocessing.DM import bagging
74
+ >>> print(bagging)
75
+ # Prints entire DM/bagging.py source code
76
+
64
77
  >>> from bm_preprocessing.DM import hunts, hunts_test
65
78
  >>> print(hunts)
66
79
  # Prints entire DM/hunts.py source code
67
80
  >>> print(hunts_test)
68
81
  # Prints entire DM/hunts_test.py source code
82
+
83
+ >>> from bm_preprocessing.DM import id3, id3_test
84
+ >>> print(id3)
85
+ # Prints entire DM/id3.py source code
86
+ >>> print(id3_test)
87
+ # Prints entire DM/id3_test.py source code
69
88
  ```
70
89
 
71
90
  ---
@@ -75,9 +94,12 @@ Then in the Python REPL:
75
94
  ```bash
76
95
  python -c "from bm_preprocessing.IR import all; print(all)"
77
96
  python -c "from bm_preprocessing.DM import apriori; print(apriori)"
97
+ python -c "from bm_preprocessing.DM import bagging; print(bagging)"
78
98
  python -c "from bm_preprocessing.DM import hash; print(hash)"
79
99
  python -c "from bm_preprocessing.DM import hunts; print(hunts)"
80
100
  python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
101
+ python -c "from bm_preprocessing.DM import id3; print(id3)"
102
+ python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
81
103
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
82
104
  ```
83
105
 
@@ -90,7 +112,10 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
90
112
  | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
91
113
  | `from bm_preprocessing.DM import all` | Data Mining algorithms |
92
114
  | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
115
+ | `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
93
116
  | `from bm_preprocessing.DM import hash` | Hash-based mining |
94
117
  | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
95
118
  | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
119
+ | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
120
+ | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
96
121
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.3.0"
7
+ version = "0.5.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,6 +13,7 @@ dependencies = [
13
13
  "graphviz>=0.20.3",
14
14
  "matplotlib>=3.7.5",
15
15
  "pandas>=2.0.3",
16
+ "scikit-learn>=1.3.2",
16
17
  "twine>=6.1.0",
17
18
  ]
18
19
 
@@ -2,9 +2,13 @@
2
2
 
3
3
  from .all import all
4
4
  from .apriori import apriori
5
+ from .bagging import bagging
5
6
  from .hash import hash
6
7
  from .hunts import hunts
7
8
  from .hunts_test import hunts_test
9
+ from .id3 import id3
10
+ from .id3_test import id3_test
8
11
  from .preprocessing import preprocessing
9
12
 
10
- __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "preprocessing"]
13
+ __all__ = ["all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
14
+
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/bagging.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "bagging.py"
30
+ bagging = SourceCodeModule("DM.bagging", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3.py"
30
+ id3 = SourceCodeModule("DM.id3", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3_test.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3_test.py"
30
+ id3_test = SourceCodeModule("DM.id3_test", _source_file)
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.model_selection import train_test_split
4
+ from collections import Counter
5
+
6
# Demo data: the iris dataset bundled with scikit-learn.
# NOTE: this runs at import time.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 split; stratify=y keeps the per-class proportions equal in both
# halves, and random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
13
+
14
def gini(y):
    """Gini impurity of integer class labels `y`: 1 - sum_k p_k**2."""
    proportions = np.bincount(y) / len(y)
    return 1 - np.sum(proportions * proportions)
18
+
19
class TreeNode:
    """One node of a depth-limited binary decision tree split on Gini impurity.

    A leaf stores its class label in `value`; an internal node stores a
    (feature, threshold) pair and left/right child subtrees.
    """

    def __init__(self, depth=0, max_depth=3):
        self.depth = depth          # distance from the root
        self.max_depth = max_depth  # growth limit inherited by children
        self.feature = None         # column index of the chosen split
        self.threshold = None       # rows with feature <= threshold go left
        self.left = None
        self.right = None
        self.value = None           # class label when this node is a leaf

    def fit(self, X, y):
        """Grow the subtree rooted at this node from training data (X, y)."""
        # Pure partition: nothing left to split.
        if len(set(y)) == 1:
            self.value = y[0]
            return
        # Depth limit reached or tiny partition: majority-vote leaf.
        if self.depth >= self.max_depth or len(y) <= 2:
            self.value = Counter(y).most_common(1)[0][0]
            return

        n_samples, n_features = X.shape
        best_score = 1.0
        # Exhaustive search: every observed value of every feature is a
        # candidate threshold; keep the split with the lowest weighted Gini.
        for col in range(n_features):
            for candidate in np.unique(X[:, col]):
                go_left = X[:, col] <= candidate
                go_right = X[:, col] > candidate
                if sum(go_left) == 0 or sum(go_right) == 0:
                    continue
                score = (sum(go_left) / n_samples) * gini(y[go_left]) \
                    + (sum(go_right) / n_samples) * gini(y[go_right])
                if score < best_score:
                    best_score = score
                    self.feature = col
                    self.threshold = candidate

        # No usable split exists (e.g. every feature is constant): make a leaf.
        if self.feature is None:
            self.value = Counter(y).most_common(1)[0][0]
            return

        go_left = X[:, self.feature] <= self.threshold
        go_right = X[:, self.feature] > self.threshold
        self.left = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.fit(X[go_left], y[go_left])
        self.right = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.fit(X[go_right], y[go_right])

    def predict(self, X):
        """Return one predicted class label per row of X."""
        if self.value is not None:
            return np.array([self.value] * len(X))
        go_left = X[:, self.feature] <= self.threshold
        go_right = X[:, self.feature] > self.threshold
        predictions = np.empty(X.shape[0], dtype=int)
        if sum(go_left) > 0:
            predictions[go_left] = self.left.predict(X[go_left])
        if sum(go_right) > 0:
            predictions[go_right] = self.right.predict(X[go_right])
        return predictions
72
+
73
# --- Section 1: manual bagging with the hand-written TreeNode ---------------

# Fixed seed so the bootstrap samples (and therefore the ensemble) are
# reproducible run-to-run.
np.random.seed(42)
n_estimators = 10
models = []

for i in range(n_estimators):
    # Bootstrap sample: draw len(X_train) row indices with replacement.
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample = X_train[indices]
    y_sample = y_train[indices]
    tree = TreeNode(max_depth=3)
    tree.fit(X_sample, y_sample)
    models.append(tree)

# Shape (n_estimators, n_test): one row of predictions per fitted tree.
all_preds = np.array([model.predict(X_test) for model in models])
final_preds = []

# Majority vote across the ensemble, one test sample at a time.
for i in range(len(X_test)):
    votes = Counter(all_preds[:, i])
    final_preds.append(votes.most_common(1)[0][0])

final_preds = np.array(final_preds)
accuracy = np.sum(final_preds == y_test) / len(y_test)

print("First 10 Predictions:", final_preds[:10])
print("Accuracy:", accuracy)

# Section marker (bare string expression, no runtime effect): the same task
# done entirely with scikit-learn follows.
"""Fully libraries"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

# Same 70/30 stratified, seeded split as in section 1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

base_classifier = DecisionTreeClassifier(random_state=42)

# Bagging ensemble of 10 decision trees, all seeded for reproducibility.
bagging_model = BaggingClassifier(
    estimator=base_classifier,
    n_estimators=10,
    random_state=42
)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Bagging Classifier Accuracy:", accuracy)

# Section marker: manual bootstrap + voting, but with sklearn's tree as the
# base learner.
"""Only DT as library function"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import numpy as np

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Re-seed so this section's bootstrap draws are reproducible on their own.
np.random.seed(42)
n_estimators = 10
estimators = []

for i in range(n_estimators):
    # Bootstrap sample, as in section 1.
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample = X_train[indices]
    y_sample = y_train[indices]

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_sample, y_sample)
    estimators.append(tree)

all_preds = np.array([tree.predict(X_test) for tree in estimators])

# Per-sample majority vote over the 10 trees.
final_preds = []
for i in range(len(X_test)):
    votes = Counter(all_preds[:, i])
    final_preds.append(votes.most_common(1)[0][0])
final_preds = np.array(final_preds)

accuracy = accuracy_score(y_test, final_preds)
print("First 10 Predictions:", final_preds[:10])
print("Manual Bagging Accuracy:", accuracy)
@@ -0,0 +1,134 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+
5
+
6
# Load and process data
# NOTE: executed at import time — importing this module reads data.csv from
# the package directory.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)
# Normalize "Annual Income": strip the trailing "K" suffix and any spaces,
# then scale the thousands figure up to an absolute integer amount.
df["Annual Income"] = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
    * 1000
)
17
+
18
+
19
+ # Entropy calculation
20
+ def entropy(df, target_column):
21
+ counts = df[target_column].value_counts()
22
+ probs = counts / len(df)
23
+ return -sum(probs * np.log2(probs))
24
+
25
+
26
def information_gain(df, feature, target_column):
    """Entropy of `df` minus the weighted entropy after splitting on `feature`."""
    parent_entropy = entropy(df, target_column)
    n = len(df)

    # Entropy of each partition, weighted by its share of the rows.
    weighted_child_entropy = 0
    for feature_value in df[feature].unique():
        partition = df[df[feature] == feature_value]
        weighted_child_entropy += (len(partition) / n) * entropy(partition, target_column)

    return parent_entropy - weighted_child_entropy
38
+
39
+
40
def best_feature(df, feature_columns, target_column):
    """Feature from `feature_columns` with the highest information gain.

    Ties break in favor of the earliest feature in `feature_columns`.
    """
    return max(
        feature_columns,
        key=lambda feature: information_gain(df, feature, target_column),
    )
47
+
48
+
49
class ID3Node:
    """Node of an ID3 decision tree.

    Internal nodes carry `feature` (plus, for numeric splits, a readable
    `value` description) and a `children` mapping of branch value -> subtree.
    Leaves carry a class `label` instead.
    """

    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature
        self.value = value
        self.label = label
        self.children = {}

    def is_leaf(self):
        """True when this node predicts a class directly."""
        return self.label is not None
59
+
60
+
61
# ID3 algorithm
def id3(df, target_column, feature_columns):
    """Recursively build an ID3 decision tree over `df`.

    Categorical features branch on every observed value; numeric features get
    a binary split at the column median. Returns the root `ID3Node`.
    """
    # Pure partition: every row already has the same class.
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # No features left to split on: majority-class leaf.
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)
    # Majority class of the current partition — fallback label for any empty
    # split side below.
    majority_label = df[target_column].mode()[0]
    # Each feature is consumed once per path.
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        left_df = df[df[feature] <= median_value]
        right_df = df[df[feature] > median_value]

        node.value = f"{feature} <= {median_value}"

        # BUG FIX: the previous version recursed into empty partitions.
        # When a numeric feature's maximum equals its median (e.g. an
        # all-equal column), right_df is empty and the recursion eventually
        # crashed with IndexError on `mode()[0]` of an empty frame. An empty
        # side now becomes a majority-class leaf instead.
        node.children["<= " + str(median_value)] = (
            id3(left_df, target_column, remaining_features)
            if not left_df.empty
            else ID3Node(label=majority_label)
        )
        node.children["> " + str(median_value)] = (
            id3(right_df, target_column, remaining_features)
            if not right_df.empty
            else ID3Node(label=majority_label)
        )
    else:
        # One branch per observed category; these subsets are never empty.
        for val in df[feature].unique():
            subset = df[df[feature] == val]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node
96
+
97
+
98
# Print tree function
def print_id3_tree(node, indent=""):
    """Pretty-print the subtree rooted at `node`, one line per node."""
    if node.is_leaf():
        print(f"{indent}Leaf: {node.label}")
        return

    # Numeric splits carry a readable description in `value`; categorical
    # splits are identified by the feature name alone.
    header = (
        f"{indent}[Numeric Split] {node.value}"
        if node.value
        else f"{indent}[Categorical Split] {node.feature}"
    )
    print(header)

    for branch_value, subtree in node.children.items():
        print(f"{indent}--> {branch_value}:")
        print_id3_tree(subtree, indent + " ")
112
+
113
+
114
def main():
    """Build and print ID3 trees for the bundled data.csv and tennis.csv."""
    # Candidate split features: every column except the target ("Default id")
    # and the row identifier ("Tid").
    feature_columns = [col for col in df.columns if col not in ["Default id", "Tid"]]

    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    print("=== ID3 Algorithm - Decision Tree (data.csv) ===\n")
    print_id3_tree(tree_root)

    # Tennis dataset
    # Second demo: the play-tennis CSV shipped next to this file.
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)

    # Everything except the "Play" target is a feature.
    tennis_features = [col for col in tennis_df.columns if col != "Play"]
    tennis_tree = id3(tennis_df, target_column="Play", feature_columns=tennis_features)

    print("\n\n=== ID3 Algorithm - Decision Tree (tennis.csv) ===\n")
    print_id3_tree(tennis_tree)


if __name__ == "__main__":
    main()
@@ -0,0 +1,148 @@
1
"""ID3 decision tree with graphviz visualization.

Duplicates the algorithm from id3.py but renders the trees to PNG files
instead of printing them. NOTE(review): importing this module reads data.csv
from the package directory, and main() shells out to graphviz and opens the
rendered images (`view=True`).
"""

import pandas as pd
import numpy as np
import graphviz
import os


# Load and process data
# Normalize "Annual Income": strip the "K" suffix and spaces, then scale the
# thousands figure up to an absolute integer amount.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)
df["Annual Income"] = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
    * 1000
)


# Entropy calculation
# Shannon entropy (base 2) of the class distribution in `target_column`.
def entropy(df, target_column):
    counts = df[target_column].value_counts()
    probs = counts / len(df)
    return -sum(probs * np.log2(probs))


# Information gain calculation
# Entropy of `df` minus the weighted entropy of its partitions on `feature`.
def information_gain(df, feature, target_column):
    total_entropy = entropy(df, target_column)

    values = df[feature].unique()

    weighted_entropy = 0
    for value in values:
        subset = df[df[feature] == value]
        weighted_entropy += (len(subset) / len(df)) * entropy(subset, target_column)

    return total_entropy - weighted_entropy


# Best feature selection
# Returns the feature with the highest information gain (ties break toward
# the earliest feature in `feature_columns`).
def best_feature(df, feature_columns, target_column):
    gains = {
        feature: information_gain(df, feature, target_column)
        for feature in feature_columns
    }
    return max(gains, key=gains.get)


# Node class
# Internal nodes carry `feature` (and a readable `value` for numeric splits)
# plus a `children` mapping; leaves carry a class `label`.
class ID3Node:
    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature
        self.value = value
        self.children = {}
        self.label = label

    def is_leaf(self):
        # A node is a leaf exactly when it predicts a class directly.
        return self.label is not None


# ID3 algorithm
# Categorical features branch on every observed value; numeric features get
# a binary split at the column median. Each feature is consumed once per path.
def id3(df, target_column, feature_columns):
    # If the target column is pure, return a leaf node
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # If no features left, return leaf with majority class
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        left_df = df[df[feature] <= median_value]
        right_df = df[df[feature] > median_value]

        node.value = f"{feature} <= {median_value}"

        # NOTE(review): if every value of a numeric feature equals its median,
        # right_df is empty and the recursion can fail on `mode()[0]` of an
        # empty frame — same issue as in id3.py; confirm against the data.
        remaining_features = [col for col in feature_columns if col != feature]
        node.children["<= " + str(median_value)] = id3(
            left_df, target_column, remaining_features
        )
        node.children["> " + str(median_value)] = id3(
            right_df, target_column, remaining_features
        )
    else:
        unique_vals = df[feature].unique()
        for val in unique_vals:
            subset = df[df[feature] == val]
            remaining_features = [col for col in feature_columns if col != feature]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node


# Tree visualization function using graphviz
# Walks the tree, adding one graphviz node per tree node (ellipse for leaves,
# box for splits) and one labelled edge per branch. Returns the Digraph.
def visualize_id3_tree(node, parent_name="Root", graph=None):
    if graph is None:
        graph = graphviz.Digraph(format="png", engine="dot")

    if node.is_leaf():
        graph.node(parent_name, label=str(node.label), shape="ellipse")
    else:
        if node.value:
            label = node.value
        else:
            label = str(node.feature)
        graph.node(parent_name, label=label, shape="box")

    # Child names embed the full path, keeping graphviz node ids unique.
    # (Leaves have no children, so this loop is a no-op for them.)
    for val, child in node.children.items():
        child_name = f"{parent_name}_{val}"
        graph.edge(parent_name, child_name, label=str(val))
        visualize_id3_tree(child, child_name, graph)

    return graph


def main():
    """Build both trees and render them to PNG via graphviz."""
    # Candidate features: everything except the target and the row id.
    feature_columns = [col for col in df.columns if col not in ["Default id", "Tid"]]

    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    # Visualize the tree using graphviz
    # render() writes <output_path>.png, opens it (view=True) and deletes the
    # intermediate DOT source (cleanup=True).
    graph = visualize_id3_tree(tree_root)
    output_path = os.path.join(os.path.dirname(__file__), "id3_decision_tree")
    graph.render(output_path, view=True, cleanup=True)
    print(f"Decision tree rendered and saved as '{output_path}.png'")

    # Tennis dataset
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)

    tennis_features = [col for col in tennis_df.columns if col != "Play"]
    tennis_tree = id3(tennis_df, target_column="Play", feature_columns=tennis_features)

    graph_tennis = visualize_id3_tree(tennis_tree)
    tennis_output_path = os.path.join(
        os.path.dirname(__file__), "id3_tennis_decision_tree"
    )
    graph_tennis.render(tennis_output_path, view=True, cleanup=True)
    print(f"Tennis decision tree rendered and saved as '{tennis_output_path}.png'")


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
1
+ Outlook,Temperature,Humidity,Wind,Play
2
+ Sunny,Hot,High,Weak,No
3
+ Sunny,Hot,High,Strong,No
4
+ Overcast,Hot,High,Weak,Yes
5
+ Rain,Mild,High,Weak,Yes
6
+ Rain,Cool,Normal,Weak,Yes
7
+ Rain,Cool,Normal,Strong,No
8
+ Overcast,Cool,Normal,Strong,Yes
9
+ Sunny,Mild,High,Weak,No
10
+ Sunny,Cool,Normal,Weak,Yes
11
+ Rain,Mild,Normal,Weak,Yes
12
+ Sunny,Mild,Normal,Strong,Yes
13
+ Overcast,Mild,High,Strong,Yes
14
+ Overcast,Hot,Normal,Weak,Yes
15
+ Rain,Mild,High,Strong,No