bm-preprocessing 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/PKG-INFO +2 -1
  2. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/USAGE.md +19 -1
  3. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/pyproject.toml +2 -1
  4. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/__init__.py +3 -1
  5. bm_preprocessing-0.6.0/src/bm_preprocessing/DM/adaboost.py +30 -0
  6. bm_preprocessing-0.6.0/src/bm_preprocessing/DM/bagging.py +30 -0
  7. bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/adaboost.py +69 -0
  8. bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/bagging.py +173 -0
  9. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/.gitignore +0 -0
  10. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/README.md +0 -0
  11. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/all.py +0 -0
  12. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  13. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hash.py +0 -0
  14. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  15. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  16. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/id3.py +0 -0
  17. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
  18. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  19. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  20. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  21. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  22. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  23. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  24. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  25. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
  26. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
  27. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  28. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
  29. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  30. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/all.py +0 -0
  31. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  32. {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
7
7
  Requires-Dist: graphviz>=0.20.3
8
8
  Requires-Dist: matplotlib>=3.7.5
9
9
  Requires-Dist: pandas>=2.0.3
10
+ Requires-Dist: scikit-learn>=1.3.2
10
11
  Requires-Dist: twine>=6.1.0
11
12
  Description-Content-Type: text/markdown
12
13
 
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, id3, id3_test, preprocessing
18
+ from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -24,6 +24,12 @@ print(all)
24
24
  print("\n=== DM Apriori Module ===")
25
25
  print(apriori)
26
26
 
27
+ print("\n=== DM AdaBoost Module ===")
28
+ print(adaboost)
29
+
30
+ print("\n=== DM Bagging Module ===")
31
+ print(bagging)
32
+
27
33
  print("\n=== DM Hash Module ===")
28
34
  print(hash)
29
35
 
@@ -67,6 +73,14 @@ Then in the Python REPL:
67
73
  >>> print(apriori)
68
74
  # Prints entire DM/apriori.py source code
69
75
 
76
+ >>> from bm_preprocessing.DM import adaboost
77
+ >>> print(adaboost)
78
+ # Prints entire DM/adaboost.py source code
79
+
80
+ >>> from bm_preprocessing.DM import bagging
81
+ >>> print(bagging)
82
+ # Prints entire DM/bagging.py source code
83
+
70
84
  >>> from bm_preprocessing.DM import hunts, hunts_test
71
85
  >>> print(hunts)
72
86
  # Prints entire DM/hunts.py source code
@@ -87,6 +101,8 @@ Then in the Python REPL:
87
101
  ```bash
88
102
  python -c "from bm_preprocessing.IR import all; print(all)"
89
103
  python -c "from bm_preprocessing.DM import apriori; print(apriori)"
104
+ python -c "from bm_preprocessing.DM import adaboost; print(adaboost)"
105
+ python -c "from bm_preprocessing.DM import bagging; print(bagging)"
90
106
  python -c "from bm_preprocessing.DM import hash; print(hash)"
91
107
  python -c "from bm_preprocessing.DM import hunts; print(hunts)"
92
108
  python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
@@ -104,6 +120,8 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
104
120
  | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
105
121
  | `from bm_preprocessing.DM import all` | Data Mining algorithms |
106
122
  | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
123
+ | `from bm_preprocessing.DM import adaboost` | Bagging & AdaBoost ensemble classifiers |
124
+ | `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
107
125
  | `from bm_preprocessing.DM import hash` | Hash-based mining |
108
126
  | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
109
127
  | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.4.0"
7
+ version = "0.6.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,6 +13,7 @@ dependencies = [
13
13
  "graphviz>=0.20.3",
14
14
  "matplotlib>=3.7.5",
15
15
  "pandas>=2.0.3",
16
+ "scikit-learn>=1.3.2",
16
17
  "twine>=6.1.0",
17
18
  ]
18
19
 
@@ -1,7 +1,9 @@
1
1
  """DM subpackage - Data Mining source code."""
2
2
 
3
+ from .adaboost import adaboost
3
4
  from .all import all
4
5
  from .apriori import apriori
6
+ from .bagging import bagging
5
7
  from .hash import hash
6
8
  from .hunts import hunts
7
9
  from .hunts_test import hunts_test
@@ -9,5 +11,5 @@ from .id3 import id3
9
11
  from .id3_test import id3_test
10
12
  from .preprocessing import preprocessing
11
13
 
12
- __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
14
+ __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
13
15
 
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/adaboost.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
class SourceCodeModule:
    """Printable handle on a source file shipped with the package.

    ``str()`` and ``repr()`` of an instance both return the full text of the
    file, so ``print(obj)`` shows the source code. The file is read on the
    first access and the text is cached afterwards.
    """

    def __init__(self, name: str, source_path: "Path | str"):
        """Remember where the source lives without reading it yet.

        Args:
            name: Label for this source; stored as ``self.name``.
            source_path: Location of the file to display. Strings and other
                path-like values are accepted and converted to ``Path``.
        """
        self.name = name
        # Coerce so that a plain string path works as well as a Path object;
        # read_text() in source_code is only available on Path.
        self._source_path = Path(source_path)
        self._source_code = None  # filled in by the first source_code access

    @property
    def source_code(self) -> str:
        """Return the file's text, reading it (as UTF-8) on first access only."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding="utf-8")
        return self._source_code

    def __repr__(self) -> str:
        # Returns the raw source rather than a constructor-style repr, so
        # anything that shows repr(obj) shows the code as well.
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "adaboost.py"
adaboost = SourceCodeModule("DM.adaboost", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/bagging.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
class SourceCodeModule:
    """Printable handle on a source file shipped with the package.

    ``str()`` and ``repr()`` of an instance both return the full text of the
    file, so ``print(obj)`` shows the source code. The file is read on the
    first access and the text is cached afterwards.
    """

    def __init__(self, name: str, source_path: "Path | str"):
        """Remember where the source lives without reading it yet.

        Args:
            name: Label for this source; stored as ``self.name``.
            source_path: Location of the file to display. Strings and other
                path-like values are accepted and converted to ``Path``.
        """
        self.name = name
        # Coerce so that a plain string path works as well as a Path object;
        # read_text() in source_code is only available on Path.
        self._source_path = Path(source_path)
        self._source_code = None  # filled in by the first source_code access

    @property
    def source_code(self) -> str:
        """Return the file's text, reading it (as UTF-8) on first access only."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding="utf-8")
        return self._source_code

    def __repr__(self) -> str:
        # Returns the raw source rather than a constructor-style repr, so
        # anything that shows repr(obj) shows the code as well.
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "bagging.py"
bagging = SourceCodeModule("DM.bagging", _source_file)
@@ -0,0 +1,69 @@
1
+ from sklearn.datasets import load_iris
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.tree import DecisionTreeClassifier
4
+ from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
5
+ from sklearn.metrics import accuracy_score, classification_report
6
+
7
# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing; stratify=y is passed so the split
# is stratified on the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


def print_banner(title):
    """Print *title* between two 50-character ``=`` rules."""
    print("=" * 50)
    print(title)
    print("=" * 50)


def fit_and_report(title, model):
    """Fit *model* on the training split and print its test-set report.

    Args:
        title: Heading printed in the banner above the report.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A ``(predictions, accuracy)`` tuple for the test split, so the caller
        can reuse the accuracy instead of computing it again.
    """
    print_banner(title)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"\nFirst 10 Predictions: {predictions[:10]}")
    print(f"First 10 Actual: {y_test[:10]}")
    print(f"\nClassification Report:\n{classification_report(y_test, predictions, target_names=iris.target_names)}")
    return predictions, accuracy


# ==========================================
# Bagging Classifier
# ==========================================

bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=10,
    random_state=42,
)
y_pred_bagging, bagging_accuracy = fit_and_report("BAGGING CLASSIFIER", bagging_model)

# ==========================================
# AdaBoost Classifier
# ==========================================

# max_depth=1 makes every base estimator a single-split tree.
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
y_pred_adaboost, adaboost_accuracy = fit_and_report("ADABOOST CLASSIFIER", adaboost_model)

# ==========================================
# Comparison
# ==========================================

print_banner("COMPARISON")
print(f"Bagging Accuracy: {bagging_accuracy:.4f}")
print(f"AdaBoost Accuracy: {adaboost_accuracy:.4f}")
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.model_selection import train_test_split
4
+ from collections import Counter
5
+
6
# Iris features and labels, with 30% held out for testing (fixed seed;
# stratify=y is passed so the split is stratified on the labels).
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
13
+
14
def gini(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a label array.

    Args:
        y: 1-D array of non-negative integer class labels (the form
            ``np.bincount`` accepts).

    Returns:
        The impurity; 0 when every label in ``y`` is the same.
    """
    counts = np.bincount(y)
    probs = counts / len(y)
    return 1 - np.sum(probs**2)


class TreeNode:
    """One node of a binary decision tree grown by minimising Gini impurity.

    A fitted node is either a leaf (``value`` is set) or an internal node
    (``feature``/``threshold`` are set and ``left``/``right`` hold the
    children). Rows with ``X[:, feature] <= threshold`` go to ``left``, rows
    with ``X[:, feature] > threshold`` go to ``right``.
    """

    def __init__(self, depth=0, max_depth=3):
        """Create an unfitted node.

        Args:
            depth: Depth of this node in the tree (0 for the root).
            max_depth: Depth at which nodes stop splitting and become leaves.
        """
        self.depth = depth
        self.max_depth = max_depth
        self.feature = None    # column index used for the split
        self.threshold = None  # split value for that column
        self.left = None       # child for rows <= threshold
        self.right = None      # child for rows > threshold
        self.value = None      # predicted label when this node is a leaf

    def fit(self, X, y):
        """Grow the subtree rooted at this node from the given samples.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: 1-D array of integer labels, one per row of ``X``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            # Without this the majority vote below fails with an opaque
            # IndexError on the empty most_common() result.
            raise ValueError("cannot fit a tree node on an empty training set")

        # Pure node: nothing left to separate.
        if len(set(y)) == 1:
            self.value = y[0]
            return
        # Depth limit reached or too few samples: predict the majority label.
        if self.depth >= self.max_depth or len(y) <= 2:
            self.value = Counter(y).most_common(1)[0][0]
            return

        n_samples, n_features = X.shape
        best_gini = 1.0
        for feature in range(n_features):
            column = X[:, feature]  # hoisted: the same for every threshold
            for t in np.unique(column):
                left_mask = column <= t
                right_mask = column > t
                # count_nonzero counts in native code; the builtin sum()
                # would iterate the mask element by element in Python.
                n_left = np.count_nonzero(left_mask)
                n_right = np.count_nonzero(right_mask)
                if n_left == 0 or n_right == 0:
                    continue
                # Impurity of the two sides, weighted by their share of samples.
                g = (n_left / n_samples) * gini(y[left_mask]) + \
                    (n_right / n_samples) * gini(y[right_mask])
                # Strict '<' keeps the first best split found on ties.
                if g < best_gini:
                    best_gini = g
                    self.feature = feature
                    self.threshold = t

        # No threshold separated the samples (e.g. all rows identical).
        if self.feature is None:
            self.value = Counter(y).most_common(1)[0][0]
            return

        left_mask = X[:, self.feature] <= self.threshold
        right_mask = X[:, self.feature] > self.threshold
        self.left = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.fit(X[left_mask], y[left_mask])
        self.right = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.fit(X[right_mask], y[right_mask])

    def predict(self, X):
        """Return the predicted label for every row of ``X``.

        Args:
            X: 2-D array with the same column layout used in ``fit``.

        Returns:
            1-D array with one label per row of ``X``.
        """
        if self.value is not None:
            # np.full keeps the label dtype even when X has zero rows.
            return np.full(len(X), self.value)

        left_mask = X[:, self.feature] <= self.threshold
        right_mask = X[:, self.feature] > self.threshold
        y_pred = np.empty(X.shape[0], dtype=int)
        # Only descend into a child that actually receives rows.
        if np.any(left_mask):
            y_pred[left_mask] = self.left.predict(X[left_mask])
        if np.any(right_mask):
            y_pred[right_mask] = self.right.predict(X[right_mask])
        return y_pred
72
+
73
# Seed NumPy's global generator so the bootstrap draws are repeatable.
np.random.seed(42)
n_estimators = 10
models = []

# Fit one depth-3 tree per bootstrap sample: len(X_train) row indices drawn
# with replacement from the training set.
for _ in range(n_estimators):
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample, y_sample = X_train[indices], y_train[indices]
    tree = TreeNode(max_depth=3)
    tree.fit(X_sample, y_sample)
    models.append(tree)

# Row k of all_preds holds model k's predictions for the whole test set, so
# each row of the transpose holds every model's vote for one test sample.
all_preds = np.array([model.predict(X_test) for model in models])
final_preds = np.array(
    [Counter(sample_votes).most_common(1)[0][0] for sample_votes in all_preds.T]
)

# Fraction of test samples whose majority vote matches the true label.
accuracy = np.sum(final_preds == y_test) / len(y_test)

print("First 10 Predictions:", final_preds[:10])
print("Accuracy:", accuracy)
97
+
98
+ """Fully libraries"""
99
+
100
+ from sklearn.datasets import load_iris
101
+ from sklearn.model_selection import train_test_split
102
+ from sklearn.tree import DecisionTreeClassifier
103
+ from sklearn.ensemble import BaggingClassifier
104
+ from sklearn.metrics import accuracy_score
105
+
106
# Reload the data so this section runs on its own.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Ten decision trees combined by scikit-learn's BaggingClassifier.
base_classifier = DecisionTreeClassifier(random_state=42)
bagging_model = BaggingClassifier(
    estimator=base_classifier, n_estimators=10, random_state=42
)

# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = bagging_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Bagging Classifier Accuracy:", accuracy)
132
+
133
+ """Only DT as library function"""
134
+
135
+ from sklearn.datasets import load_iris
136
+ from sklearn.model_selection import train_test_split
137
+ from sklearn.tree import DecisionTreeClassifier
138
+ from sklearn.metrics import accuracy_score
139
+ from collections import Counter
140
+ import numpy as np
141
+
142
# Reload the data so this section runs on its own.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Seed NumPy's global generator so the bootstrap draws are repeatable.
np.random.seed(42)
n_estimators = 10
estimators = []

# Hand-written bagging loop: each scikit-learn tree is fitted on its own
# bootstrap sample (row indices drawn with replacement).
for _ in range(n_estimators):
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample, y_sample = X_train[indices], y_train[indices]

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_sample, y_sample)
    estimators.append(tree)

# One row of predictions per tree; each row of the transpose is therefore the
# set of votes cast for a single test sample.
all_preds = np.array([tree.predict(X_test) for tree in estimators])
final_preds = np.array(
    [Counter(sample_votes).most_common(1)[0][0] for sample_votes in all_preds.T]
)

accuracy = accuracy_score(y_test, final_preds)
print("First 10 Predictions:", final_preds[:10])
print("Manual Bagging Accuracy:", accuracy)