bm-preprocessing 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/PKG-INFO +2 -1
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/USAGE.md +19 -1
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/pyproject.toml +2 -1
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/__init__.py +3 -1
- bm_preprocessing-0.6.0/src/bm_preprocessing/DM/adaboost.py +30 -0
- bm_preprocessing-0.6.0/src/bm_preprocessing/DM/bagging.py +30 -0
- bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/adaboost.py +69 -0
- bm_preprocessing-0.6.0/src/bm_preprocessing/DM/sources/bagging.py +173 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/.gitignore +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/README.md +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/all.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/apriori.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hash.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hunts.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/id3.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/__init__.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/all.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
- {bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/__init__.py +0 -0
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bm-preprocessing
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: A package to preprocess text data
|
|
5
5
|
Requires-Python: >=3.8
|
|
6
6
|
Requires-Dist: build>=1.2.2.post1
|
|
7
7
|
Requires-Dist: graphviz>=0.20.3
|
|
8
8
|
Requires-Dist: matplotlib>=3.7.5
|
|
9
9
|
Requires-Dist: pandas>=2.0.3
|
|
10
|
+
Requires-Dist: scikit-learn>=1.3.2
|
|
10
11
|
Requires-Dist: twine>=6.1.0
|
|
11
12
|
Description-Content-Type: text/markdown
|
|
12
13
|
|
|
@@ -15,7 +15,7 @@ Create a file `example.py`:
|
|
|
15
15
|
```python
|
|
16
16
|
# Import modules
|
|
17
17
|
from bm_preprocessing.IR import all
|
|
18
|
-
from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, id3, id3_test, preprocessing
|
|
18
|
+
from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
|
|
19
19
|
|
|
20
20
|
# Print the source code
|
|
21
21
|
print("=== IR All Module ===")
|
|
@@ -24,6 +24,12 @@ print(all)
|
|
|
24
24
|
print("\n=== DM Apriori Module ===")
|
|
25
25
|
print(apriori)
|
|
26
26
|
|
|
27
|
+
print("\n=== DM AdaBoost Module ===")
|
|
28
|
+
print(adaboost)
|
|
29
|
+
|
|
30
|
+
print("\n=== DM Bagging Module ===")
|
|
31
|
+
print(bagging)
|
|
32
|
+
|
|
27
33
|
print("\n=== DM Hash Module ===")
|
|
28
34
|
print(hash)
|
|
29
35
|
|
|
@@ -67,6 +73,14 @@ Then in the Python REPL:
|
|
|
67
73
|
>>> print(apriori)
|
|
68
74
|
# Prints entire DM/apriori.py source code
|
|
69
75
|
|
|
76
|
+
>>> from bm_preprocessing.DM import adaboost
|
|
77
|
+
>>> print(adaboost)
|
|
78
|
+
# Prints entire DM/adaboost.py source code
|
|
79
|
+
|
|
80
|
+
>>> from bm_preprocessing.DM import bagging
|
|
81
|
+
>>> print(bagging)
|
|
82
|
+
# Prints entire DM/bagging.py source code
|
|
83
|
+
|
|
70
84
|
>>> from bm_preprocessing.DM import hunts, hunts_test
|
|
71
85
|
>>> print(hunts)
|
|
72
86
|
# Prints entire DM/hunts.py source code
|
|
@@ -87,6 +101,8 @@ Then in the Python REPL:
|
|
|
87
101
|
```bash
|
|
88
102
|
python -c "from bm_preprocessing.IR import all; print(all)"
|
|
89
103
|
python -c "from bm_preprocessing.DM import apriori; print(apriori)"
|
|
104
|
+
python -c "from bm_preprocessing.DM import adaboost; print(adaboost)"
|
|
105
|
+
python -c "from bm_preprocessing.DM import bagging; print(bagging)"
|
|
90
106
|
python -c "from bm_preprocessing.DM import hash; print(hash)"
|
|
91
107
|
python -c "from bm_preprocessing.DM import hunts; print(hunts)"
|
|
92
108
|
python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
|
|
@@ -104,6 +120,8 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
|
|
|
104
120
|
| `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
|
|
105
121
|
| `from bm_preprocessing.DM import all` | Data Mining algorithms |
|
|
106
122
|
| `from bm_preprocessing.DM import apriori` | Apriori algorithm |
|
|
123
|
+
| `from bm_preprocessing.DM import adaboost` | Bagging & AdaBoost ensemble classifiers |
|
|
124
|
+
| `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
|
|
107
125
|
| `from bm_preprocessing.DM import hash` | Hash-based mining |
|
|
108
126
|
| `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
|
|
109
127
|
| `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "bm-preprocessing"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "A package to preprocess text data"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -13,6 +13,7 @@ dependencies = [
|
|
|
13
13
|
"graphviz>=0.20.3",
|
|
14
14
|
"matplotlib>=3.7.5",
|
|
15
15
|
"pandas>=2.0.3",
|
|
16
|
+
"scikit-learn>=1.3.2",
|
|
16
17
|
"twine>=6.1.0",
|
|
17
18
|
]
|
|
18
19
|
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
"""DM subpackage - Data Mining source code."""
|
|
2
2
|
|
|
3
|
+
from .adaboost import adaboost
|
|
3
4
|
from .all import all
|
|
4
5
|
from .apriori import apriori
|
|
6
|
+
from .bagging import bagging
|
|
5
7
|
from .hash import hash
|
|
6
8
|
from .hunts import hunts
|
|
7
9
|
from .hunts_test import hunts_test
|
|
@@ -9,5 +11,5 @@ from .id3 import id3
|
|
|
9
11
|
from .id3_test import id3_test
|
|
10
12
|
from .preprocessing import preprocessing
|
|
11
13
|
|
|
12
|
-
__all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
|
|
14
|
+
__all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
|
|
13
15
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Source code loader for DM/adaboost.py"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SourceCodeModule:
|
|
7
|
+
"""A class that displays source code when printed."""
|
|
8
|
+
|
|
9
|
+
def __init__(self, name: str, source_path: Path):
|
|
10
|
+
self.name = name
|
|
11
|
+
self._source_path = source_path
|
|
12
|
+
self._source_code = None
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def source_code(self) -> str:
|
|
16
|
+
"""Lazily load source code."""
|
|
17
|
+
if self._source_code is None:
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
|
+
return self._source_code
|
|
20
|
+
|
|
21
|
+
def __repr__(self) -> str:
|
|
22
|
+
return self.source_code
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return self.source_code
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Get the path to the source file
|
|
29
|
+
_source_file = Path(__file__).parent / "sources" / "adaboost.py"
|
|
30
|
+
adaboost = SourceCodeModule("DM.adaboost", _source_file)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Source code loader for DM/bagging.py"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SourceCodeModule:
|
|
7
|
+
"""A class that displays source code when printed."""
|
|
8
|
+
|
|
9
|
+
def __init__(self, name: str, source_path: Path):
|
|
10
|
+
self.name = name
|
|
11
|
+
self._source_path = source_path
|
|
12
|
+
self._source_code = None
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def source_code(self) -> str:
|
|
16
|
+
"""Lazily load source code."""
|
|
17
|
+
if self._source_code is None:
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
|
+
return self._source_code
|
|
20
|
+
|
|
21
|
+
def __repr__(self) -> str:
|
|
22
|
+
return self.source_code
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return self.source_code
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Get the path to the source file
|
|
29
|
+
_source_file = Path(__file__).parent / "sources" / "bagging.py"
|
|
30
|
+
bagging = SourceCodeModule("DM.bagging", _source_file)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from sklearn.datasets import load_iris
|
|
2
|
+
from sklearn.model_selection import train_test_split
|
|
3
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
4
|
+
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
|
|
5
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
6
|
+
|
|
7
|
+
# Load dataset
|
|
8
|
+
iris = load_iris()
|
|
9
|
+
X = iris.data
|
|
10
|
+
y = iris.target
|
|
11
|
+
|
|
12
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
13
|
+
X, y, test_size=0.3, random_state=42, stratify=y
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# ==========================================
|
|
17
|
+
# Bagging Classifier
|
|
18
|
+
# ==========================================
|
|
19
|
+
|
|
20
|
+
print("=" * 50)
|
|
21
|
+
print("BAGGING CLASSIFIER")
|
|
22
|
+
print("=" * 50)
|
|
23
|
+
|
|
24
|
+
bagging_model = BaggingClassifier(
|
|
25
|
+
estimator=DecisionTreeClassifier(random_state=42),
|
|
26
|
+
n_estimators=10,
|
|
27
|
+
random_state=42,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
bagging_model.fit(X_train, y_train)
|
|
31
|
+
y_pred_bagging = bagging_model.predict(X_test)
|
|
32
|
+
|
|
33
|
+
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_bagging):.4f}")
|
|
34
|
+
print(f"\nFirst 10 Predictions: {y_pred_bagging[:10]}")
|
|
35
|
+
print(f"First 10 Actual: {y_test[:10]}")
|
|
36
|
+
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_bagging, target_names=iris.target_names)}")
|
|
37
|
+
|
|
38
|
+
# ==========================================
|
|
39
|
+
# AdaBoost Classifier
|
|
40
|
+
# ==========================================
|
|
41
|
+
|
|
42
|
+
print("=" * 50)
|
|
43
|
+
print("ADABOOST CLASSIFIER")
|
|
44
|
+
print("=" * 50)
|
|
45
|
+
|
|
46
|
+
adaboost_model = AdaBoostClassifier(
|
|
47
|
+
estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
|
|
48
|
+
n_estimators=50,
|
|
49
|
+
learning_rate=1.0,
|
|
50
|
+
random_state=42,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
adaboost_model.fit(X_train, y_train)
|
|
54
|
+
y_pred_adaboost = adaboost_model.predict(X_test)
|
|
55
|
+
|
|
56
|
+
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}")
|
|
57
|
+
print(f"\nFirst 10 Predictions: {y_pred_adaboost[:10]}")
|
|
58
|
+
print(f"First 10 Actual: {y_test[:10]}")
|
|
59
|
+
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_adaboost, target_names=iris.target_names)}")
|
|
60
|
+
|
|
61
|
+
# ==========================================
|
|
62
|
+
# Comparison
|
|
63
|
+
# ==========================================
|
|
64
|
+
|
|
65
|
+
print("=" * 50)
|
|
66
|
+
print("COMPARISON")
|
|
67
|
+
print("=" * 50)
|
|
68
|
+
print(f"Bagging Accuracy: {accuracy_score(y_test, y_pred_bagging):.4f}")
|
|
69
|
+
print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_adaboost):.4f}")
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.datasets import load_iris
|
|
3
|
+
from sklearn.model_selection import train_test_split
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
iris = load_iris()
|
|
7
|
+
X = iris.data
|
|
8
|
+
y = iris.target
|
|
9
|
+
|
|
10
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
11
|
+
X, y, test_size=0.3, random_state=42, stratify=y
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
def gini(y):
|
|
15
|
+
counts = np.bincount(y)
|
|
16
|
+
probs = counts / len(y)
|
|
17
|
+
return 1 - np.sum(probs**2)
|
|
18
|
+
|
|
19
|
+
class TreeNode:
|
|
20
|
+
def __init__(self, depth=0, max_depth=3):
|
|
21
|
+
self.depth = depth
|
|
22
|
+
self.max_depth = max_depth
|
|
23
|
+
self.feature = None
|
|
24
|
+
self.threshold = None
|
|
25
|
+
self.left = None
|
|
26
|
+
self.right = None
|
|
27
|
+
self.value = None
|
|
28
|
+
|
|
29
|
+
def fit(self, X, y):
|
|
30
|
+
if len(set(y)) == 1:
|
|
31
|
+
self.value = y[0]
|
|
32
|
+
return
|
|
33
|
+
if self.depth >= self.max_depth or len(y) <= 2:
|
|
34
|
+
self.value = Counter(y).most_common(1)[0][0]
|
|
35
|
+
return
|
|
36
|
+
n_samples, n_features = X.shape
|
|
37
|
+
best_gini = 1.0
|
|
38
|
+
for feature in range(n_features):
|
|
39
|
+
thresholds = np.unique(X[:, feature])
|
|
40
|
+
for t in thresholds:
|
|
41
|
+
left_mask = X[:, feature] <= t
|
|
42
|
+
right_mask = X[:, feature] > t
|
|
43
|
+
if sum(left_mask) == 0 or sum(right_mask) == 0:
|
|
44
|
+
continue
|
|
45
|
+
g = (sum(left_mask)/n_samples)*gini(y[left_mask]) + \
|
|
46
|
+
(sum(right_mask)/n_samples)*gini(y[right_mask])
|
|
47
|
+
if g < best_gini:
|
|
48
|
+
best_gini = g
|
|
49
|
+
self.feature = feature
|
|
50
|
+
self.threshold = t
|
|
51
|
+
if self.feature is None:
|
|
52
|
+
self.value = Counter(y).most_common(1)[0][0]
|
|
53
|
+
return
|
|
54
|
+
left_mask = X[:, self.feature] <= self.threshold
|
|
55
|
+
right_mask = X[:, self.feature] > self.threshold
|
|
56
|
+
self.left = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
|
|
57
|
+
self.left.fit(X[left_mask], y[left_mask])
|
|
58
|
+
self.right = TreeNode(depth=self.depth+1, max_depth=self.max_depth)
|
|
59
|
+
self.right.fit(X[right_mask], y[right_mask])
|
|
60
|
+
|
|
61
|
+
def predict(self, X):
|
|
62
|
+
if self.value is not None:
|
|
63
|
+
return np.array([self.value]*len(X))
|
|
64
|
+
left_mask = X[:, self.feature] <= self.threshold
|
|
65
|
+
right_mask = X[:, self.feature] > self.threshold
|
|
66
|
+
y_pred = np.empty(X.shape[0], dtype=int)
|
|
67
|
+
if sum(left_mask) > 0:
|
|
68
|
+
y_pred[left_mask] = self.left.predict(X[left_mask])
|
|
69
|
+
if sum(right_mask) > 0:
|
|
70
|
+
y_pred[right_mask] = self.right.predict(X[right_mask])
|
|
71
|
+
return y_pred
|
|
72
|
+
|
|
73
|
+
np.random.seed(42)
|
|
74
|
+
n_estimators = 10
|
|
75
|
+
models = []
|
|
76
|
+
|
|
77
|
+
for i in range(n_estimators):
|
|
78
|
+
indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
|
|
79
|
+
X_sample = X_train[indices]
|
|
80
|
+
y_sample = y_train[indices]
|
|
81
|
+
tree = TreeNode(max_depth=3)
|
|
82
|
+
tree.fit(X_sample, y_sample)
|
|
83
|
+
models.append(tree)
|
|
84
|
+
|
|
85
|
+
all_preds = np.array([model.predict(X_test) for model in models])
|
|
86
|
+
final_preds = []
|
|
87
|
+
|
|
88
|
+
for i in range(len(X_test)):
|
|
89
|
+
votes = Counter(all_preds[:, i])
|
|
90
|
+
final_preds.append(votes.most_common(1)[0][0])
|
|
91
|
+
|
|
92
|
+
final_preds = np.array(final_preds)
|
|
93
|
+
accuracy = np.sum(final_preds == y_test) / len(y_test)
|
|
94
|
+
|
|
95
|
+
print("First 10 Predictions:", final_preds[:10])
|
|
96
|
+
print("Accuracy:", accuracy)
|
|
97
|
+
|
|
98
|
+
"""Fully libraries"""
|
|
99
|
+
|
|
100
|
+
from sklearn.datasets import load_iris
|
|
101
|
+
from sklearn.model_selection import train_test_split
|
|
102
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
103
|
+
from sklearn.ensemble import BaggingClassifier
|
|
104
|
+
from sklearn.metrics import accuracy_score
|
|
105
|
+
|
|
106
|
+
iris = load_iris()
|
|
107
|
+
X = iris.data
|
|
108
|
+
y = iris.target
|
|
109
|
+
|
|
110
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
111
|
+
X, y,
|
|
112
|
+
test_size=0.3,
|
|
113
|
+
random_state=42,
|
|
114
|
+
stratify=y
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
base_classifier = DecisionTreeClassifier(random_state=42)
|
|
118
|
+
|
|
119
|
+
bagging_model = BaggingClassifier(
|
|
120
|
+
estimator=base_classifier,
|
|
121
|
+
n_estimators=10,
|
|
122
|
+
random_state=42
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
bagging_model.fit(X_train, y_train)
|
|
126
|
+
|
|
127
|
+
y_pred = bagging_model.predict(X_test)
|
|
128
|
+
|
|
129
|
+
accuracy = accuracy_score(y_test, y_pred)
|
|
130
|
+
|
|
131
|
+
print("Bagging Classifier Accuracy:", accuracy)
|
|
132
|
+
|
|
133
|
+
"""Only DT as library function"""
|
|
134
|
+
|
|
135
|
+
from sklearn.datasets import load_iris
|
|
136
|
+
from sklearn.model_selection import train_test_split
|
|
137
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
138
|
+
from sklearn.metrics import accuracy_score
|
|
139
|
+
from collections import Counter
|
|
140
|
+
import numpy as np
|
|
141
|
+
|
|
142
|
+
iris = load_iris()
|
|
143
|
+
X = iris.data
|
|
144
|
+
y = iris.target
|
|
145
|
+
|
|
146
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
147
|
+
X, y, test_size=0.3, random_state=42, stratify=y
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
np.random.seed(42)
|
|
151
|
+
n_estimators = 10
|
|
152
|
+
estimators = []
|
|
153
|
+
|
|
154
|
+
for i in range(n_estimators):
|
|
155
|
+
indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
|
|
156
|
+
X_sample = X_train[indices]
|
|
157
|
+
y_sample = y_train[indices]
|
|
158
|
+
|
|
159
|
+
tree = DecisionTreeClassifier(random_state=42)
|
|
160
|
+
tree.fit(X_sample, y_sample)
|
|
161
|
+
estimators.append(tree)
|
|
162
|
+
|
|
163
|
+
all_preds = np.array([tree.predict(X_test) for tree in estimators])
|
|
164
|
+
|
|
165
|
+
final_preds = []
|
|
166
|
+
for i in range(len(X_test)):
|
|
167
|
+
votes = Counter(all_preds[:, i])
|
|
168
|
+
final_preds.append(votes.most_common(1)[0][0])
|
|
169
|
+
final_preds = np.array(final_preds)
|
|
170
|
+
|
|
171
|
+
accuracy = accuracy_score(y_test, final_preds)
|
|
172
|
+
print("First 10 Predictions:", final_preds[:10])
|
|
173
|
+
print("Manual Bagging Accuracy:", accuracy)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/apriori.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/hunts_test.py
RENAMED
|
File without changes
|
|
File without changes
|
{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/id3_test.py
RENAMED
|
File without changes
|
{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/preprocessing.py
RENAMED
|
File without changes
|
{bm_preprocessing-0.4.0 → bm_preprocessing-0.6.0}/src/bm_preprocessing/DM/sources/tennis.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|