bm-preprocessing 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. bm_preprocessing-0.9.0/.gitignore +221 -0
  2. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/PKG-INFO +1 -1
  3. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/USAGE.md +8 -2
  4. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/pyproject.toml +1 -1
  5. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/__init__.py +3 -1
  6. bm_preprocessing-0.9.0/src/bm_preprocessing/DM/all_vis.py +30 -0
  7. bm_preprocessing-0.9.0/src/bm_preprocessing/DM/lib_doc.py +30 -0
  8. bm_preprocessing-0.9.0/src/bm_preprocessing/DM/sources/all.py +308 -0
  9. bm_preprocessing-0.9.0/src/bm_preprocessing/DM/sources/all_vis.py +400 -0
  10. bm_preprocessing-0.9.0/src/bm_preprocessing/DM/sources/lib_doc.py +223 -0
  11. bm_preprocessing-0.7.0/.gitignore +0 -24
  12. bm_preprocessing-0.7.0/src/bm_preprocessing/DM/sources/all.py +0 -159
  13. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/README.md +0 -0
  14. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/adaboost.py +0 -0
  15. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/all.py +0 -0
  16. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  17. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/bagging.py +0 -0
  18. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/hash.py +0 -0
  19. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  20. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  21. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/id3.py +0 -0
  22. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
  23. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/metrics.py +0 -0
  24. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  25. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/adaboost.py +0 -0
  26. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  27. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/bagging.py +0 -0
  28. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  29. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  30. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  31. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  32. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
  33. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
  34. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/metrics.py +0 -0
  35. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  36. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
  37. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  38. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/IR/all.py +0 -0
  39. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  40. {bm_preprocessing-0.7.0 → bm_preprocessing-0.9.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -0,0 +1,221 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ .vscode/
203
+
204
+ # Ruff stuff:
205
+ .ruff_cache/
206
+
207
+ # PyPI configuration file
208
+ .pypirc
209
+
210
+ # Marimo
211
+ marimo/_static/
212
+ marimo/_lsp/
213
+ __marimo__/
214
+
215
+ # Streamlit
216
+ .streamlit/secrets.toml
217
+
218
+ *.png
219
+ *.jpg
220
+ *.pdf
221
+ *.jpeg
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.7.0
3
+ Version: 0.9.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
@@ -15,7 +15,8 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, metrics, preprocessing
18
+ from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, lib_doc, metrics, preprocessing
19
+ from bm_preprocessing.DM import all, all_vis
19
20
 
20
21
  # Print the source code
21
22
  print("=== IR All Module ===")
@@ -107,6 +108,8 @@ Then in the Python REPL:
107
108
 
108
109
  ```bash
109
110
  python -c "from bm_preprocessing.IR import all; print(all)"
111
+ python -c "from bm_preprocessing.DM import all; print(all)"
112
+ python -c "from bm_preprocessing.DM import all_vis; print(all_vis)"
110
113
  python -c "from bm_preprocessing.DM import apriori; print(apriori)"
111
114
  python -c "from bm_preprocessing.DM import adaboost; print(adaboost)"
112
115
  python -c "from bm_preprocessing.DM import bagging; print(bagging)"
@@ -116,6 +119,7 @@ python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
116
119
  python -c "from bm_preprocessing.DM import id3; print(id3)"
117
120
  python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
118
121
  python -c "from bm_preprocessing.DM import metrics; print(metrics)"
122
+ python -c "from bm_preprocessing.DM import lib_doc; print(lib_doc)"
119
123
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
120
124
  ```
121
125
 
@@ -126,7 +130,8 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
126
130
  | Import | Description |
127
131
  |--------|-------------|
128
132
  | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
129
- | `from bm_preprocessing.DM import all` | Data Mining algorithms |
133
+ | `from bm_preprocessing.DM import all` | All DM algorithms (Hunt's, ID3, Bagging, AdaBoost, metrics) |
134
+ | `from bm_preprocessing.DM import all_vis` | All DM algorithms + graphviz & full visualization |
130
135
  | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
131
136
  | `from bm_preprocessing.DM import adaboost` | Bagging & AdaBoost ensemble classifiers |
132
137
  | `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
@@ -136,4 +141,5 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
136
141
  | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
137
142
  | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
138
143
  | `from bm_preprocessing.DM import metrics` | Classification metrics & curves |
144
+ | `from bm_preprocessing.DM import lib_doc` | Pandas/NumPy/Sklearn/DM/IR cheat sheet |
139
145
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.7.0"
7
+ version = "0.9.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -2,6 +2,7 @@
2
2
 
3
3
  from .adaboost import adaboost
4
4
  from .all import all
5
+ from .all_vis import all_vis
5
6
  from .apriori import apriori
6
7
  from .bagging import bagging
7
8
  from .hash import hash
@@ -9,8 +10,9 @@ from .hunts import hunts
9
10
  from .hunts_test import hunts_test
10
11
  from .id3 import id3
11
12
  from .id3_test import id3_test
13
+ from .lib_doc import lib_doc
12
14
  from .metrics import metrics
13
15
  from .preprocessing import preprocessing
14
16
 
15
- __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "metrics", "preprocessing"]
17
+ __all__ = ["adaboost", "all", "all_vis", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "lib_doc", "metrics", "preprocessing"]
16
18
 
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/all_vis.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
class SourceCodeModule:
    """Wrapper whose string forms are the text of a source file.

    The file is read the first time the text is needed; later accesses reuse
    the text that was read.
    """

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None  # stays None until the first read

    @property
    def source_code(self) -> str:
        """Return the file's text (decoded as UTF-8), reading it only once."""
        cached = self._source_code
        if cached is not None:
            return cached
        cached = self._source_path.read_text(encoding="utf-8")
        self._source_code = cached
        return cached

    def __str__(self) -> str:
        return self.source_code

    def __repr__(self) -> str:
        # Same text as ``str()``: the source itself, not a constructor-style repr.
        return self.source_code


# The wrapped file is sources/all_vis.py, next to this loader module.
_source_file = Path(__file__).parent / "sources" / "all_vis.py"
all_vis = SourceCodeModule("DM.all_vis", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/lib_doc.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
class SourceCodeModule:
    """Object that renders as the contents of a source file.

    Both ``str()`` and ``repr()`` return the file's text, which is loaded on
    first use and kept for subsequent calls.
    """

    def __init__(self, name: str, source_path: Path):
        self._source_code = None  # cache slot, filled by ``source_code``
        self._source_path = source_path
        self.name = name

    @property
    def source_code(self) -> str:
        """Text of the wrapped file, decoded as UTF-8 and cached after the first read."""
        if self._source_code is None:
            text = self._source_path.read_text(encoding="utf-8")
            self._source_code = text
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# The wrapped file is sources/lib_doc.py, next to this loader module.
_source_file = Path(__file__).parent / "sources" / "lib_doc.py"
lib_doc = SourceCodeModule("DM.lib_doc", _source_file)
@@ -0,0 +1,308 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import random
4
+ import os
5
+ from collections import Counter
6
+ from sklearn.datasets import load_iris
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.tree import DecisionTreeClassifier
9
+ from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
10
+ from sklearn.metrics import (
11
+ accuracy_score, confusion_matrix, f1_score,
12
+ precision_score, recall_score, classification_report,
13
+ )
14
+ import matplotlib.pyplot as plt
15
+
16
+
17
+ # ===========================================================
18
+ # 1. DATA LOADING & PREPROCESSING
19
+ # ===========================================================
20
+
# --- CSV dataset used by the Hunt's and ID3 sections ---
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)

# Remove every "K" and space from the income text, parse what is left as an
# integer and scale it by 1000.
df["Annual Income"] = 1000 * (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
)

# --- Iris dataset used by the Bagging, AdaBoost and metrics sections ---
iris = load_iris()
X = iris.data
y = iris.target
# 30% of the samples are held out for testing; the split is stratified on the
# labels and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.3
)

print("=" * 60, "DATA PREPROCESSING", "=" * 60, sep="\n")
print(f"CSV Dataset shape: {df.shape}")
print(f"CSV Columns: {df.columns.tolist()}")
print(f"Missing values:\n{df.isnull().sum()}")
print(
    f"\nIris Dataset: {X.shape[0]} samples, {X.shape[1]} features, "
    f"{len(iris.target_names)} classes"
)
print(f"Train/Test split: {len(X_train)}/{len(X_test)}")
+
49
+
50
+ # ===========================================================
51
+ # 2. HUNT'S ALGORITHM (Random Feature Selection)
52
+ # ===========================================================
53
+
class HuntsNode:
    """Node of a decision tree built by ``hunts_build_tree``.

    A node is a leaf when ``label`` is set. Otherwise ``feature`` names the
    column the node splits on and ``children`` maps each branch key to the
    sub-tree for that branch. ``median_value`` is set only for numeric splits.
    """

    def __init__(self, feature=None, median_value=None, label=None):
        self.feature = feature
        self.median_value = median_value
        self.children = {}
        self.label = label

    def is_leaf(self):
        """Return True when this node carries a class label."""
        return self.label is not None


def hunts_build_tree(df, target_column, feature_columns):
    """Recursively build a decision tree, choosing each split feature at random.

    Args:
        df: Non-empty pandas DataFrame holding the target and feature columns.
        target_column: Name of the column with the class labels.
        feature_columns: List of column names still available for splitting.

    Returns:
        The root ``HuntsNode`` of the (sub-)tree for ``df``.

    Recursion stops with a leaf labelled by the most frequent target value
    when all rows share one label or no features remain. Numeric features are
    split in two at their median; any other feature gets one branch per
    distinct value. A feature is removed from the candidates once chosen.
    """
    target = df[target_column]
    if len(target.unique()) == 1 or not feature_columns:
        return HuntsNode(label=target.mode()[0])

    feature = random.choice(feature_columns)
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        at_or_below = df[df[feature] <= median_value]
        above = df[df[feature] > median_value]
        if len(at_or_below) == 0 or len(above) == 0:
            # The median does not separate the rows (e.g. it equals the
            # maximum). Recursing into an empty partition would end in
            # ``mode()[0]`` on an empty Series, so skip this feature instead.
            return hunts_build_tree(df, target_column, remaining_features)

        node = HuntsNode(feature=feature, median_value=median_value)
        node.children["<= " + str(median_value)] = hunts_build_tree(
            at_or_below, target_column, remaining_features
        )
        node.children["> " + str(median_value)] = hunts_build_tree(
            above, target_column, remaining_features
        )
        return node

    node = HuntsNode(feature=feature)
    for val in df[feature].unique():
        subset = df[df[feature] == val]
        if len(subset) == 0:
            # A missing value never compares equal to itself, so it selects
            # no rows; there is nothing to build a branch from.
            continue
        node.children[val] = hunts_build_tree(
            subset, target_column, remaining_features
        )
    return node
+
91
+
def print_tree(node, indent=""):
    """Print the tree rooted at ``node``, one line per node and per branch.

    Leaves print their label. Internal nodes print the split (numeric when
    ``median_value`` is set, categorical otherwise), then each branch key
    followed by its sub-tree at a deeper indent.
    """
    if node.is_leaf():
        print(f"{indent}Leaf: {node.label}")
        return

    if node.median_value is None:
        heading = f"{indent}[Categorical Split] {node.feature}"
    else:
        heading = f"{indent}[Numeric Split] {node.feature} <= {node.median_value}"
    print(heading)

    deeper = indent + " "
    for branch in node.children:
        print(f"{indent}--> {branch}:")
        print_tree(node.children[branch], deeper)
+
104
+
print("\n" + "=" * 60, "HUNT'S ALGORITHM - Decision Tree", "=" * 60, sep="\n")

# Every column except the target ("Default id") and "Tid" is a candidate feature.
hunts_features = [name for name in df.columns if name not in ("Default id", "Tid")]
hunts_tree = hunts_build_tree(df, "Default id", hunts_features)
print_tree(hunts_tree)
112
+
113
+
114
+ # ===========================================================
115
+ # 3. ID3 ALGORITHM (Entropy-based Feature Selection)
116
+ # ===========================================================
117
+
# Entropy: H(S) = -Σ p(x) * log2(p(x))
def entropy(df, target_column):
    """Return the base-2 Shannon entropy of ``target_column`` in ``df``.

    Each class probability is its ``value_counts()`` count divided by the
    number of rows in ``df``.
    """
    counts = df[target_column].value_counts()
    probs = counts / len(df)
    return -sum(probs * np.log2(probs))


# Information Gain: IG(S, A) = H(S) - Σ (|Sv|/|S|) * H(Sv)
def information_gain(df, feature, target_column):
    """Return the information gain of splitting ``df`` on ``feature``.

    The partition scored here is the one ``id3`` actually builds: numeric
    features are split in two at their median, any other feature gets one
    subset per distinct value. Scoring a numeric feature per distinct value
    would give an all-unique column the maximum gain no matter how poorly its
    median split separates the classes.

    Args:
        df: pandas DataFrame holding ``feature`` and ``target_column``.
        feature: Name of the candidate split column.
        target_column: Name of the column with the class labels.

    Returns:
        Entropy of ``df`` minus the size-weighted entropy of the subsets.
    """
    total_entropy = entropy(df, target_column)
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to partition; also avoids dividing by zero below.
        return total_entropy

    column = df[feature]
    if pd.api.types.is_numeric_dtype(column):
        median_value = column.median()
        partitions = [df[column <= median_value], df[column > median_value]]
    else:
        partitions = [df[column == value] for value in column.unique()]

    weighted_entropy = 0
    for subset in partitions:
        if len(subset) == 0:
            continue  # an empty subset has zero weight
        weighted_entropy += (len(subset) / total_rows) * entropy(subset, target_column)
    return total_entropy - weighted_entropy
+
134
+
def best_feature(df, feature_columns, target_column):
    """Return the entry of ``feature_columns`` with the highest information gain.

    When several features share the highest gain, the one listed first wins.
    """
    gain_by_feature = {}
    for name in feature_columns:
        gain_by_feature[name] = information_gain(df, name, target_column)
    return max(gain_by_feature, key=lambda name: gain_by_feature[name])
+
139
+
class ID3Node:
    """Node of a decision tree built by ``id3``.

    ``label`` is set on leaves. On internal nodes ``feature`` is the split
    column, ``value`` is an optional description of the split, and
    ``children`` maps branch keys to sub-trees.
    """

    def __init__(self, feature=None, value=None, label=None):
        self.label = label
        self.children = {}
        self.value = value
        self.feature = feature

    def is_leaf(self):
        """Return True when the node has a label."""
        return not (self.label is None)
+
150
+
def id3(df, target_column, feature_columns):
    """Recursively build a decision tree, splitting on the highest-gain feature.

    Args:
        df: Non-empty pandas DataFrame holding the target and feature columns.
        target_column: Name of the column with the class labels.
        feature_columns: List of column names still available for splitting.

    Returns:
        The root ``ID3Node`` of the (sub-)tree for ``df``.

    Recursion stops with a leaf labelled by the most frequent target value
    when all rows share one label or no features remain. Numeric features are
    split in two at their median; any other feature gets one branch per
    distinct value. A feature is removed from the candidates once chosen.
    """
    target = df[target_column]
    if len(target.unique()) == 1 or not feature_columns:
        return ID3Node(label=target.mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        at_or_below = df[df[feature] <= median_value]
        above = df[df[feature] > median_value]
        if len(at_or_below) == 0 or len(above) == 0:
            # The median does not separate the rows (e.g. it equals the
            # maximum). Recursing into an empty partition would end in
            # ``mode()[0]`` on an empty Series, so skip this feature instead.
            return id3(df, target_column, remaining_features)

        node = ID3Node(feature=feature)
        node.value = f"{feature} <= {median_value}"
        node.children["<= " + str(median_value)] = id3(
            at_or_below, target_column, remaining_features
        )
        node.children["> " + str(median_value)] = id3(
            above, target_column, remaining_features
        )
        return node

    node = ID3Node(feature=feature)
    for val in df[feature].unique():
        subset = df[df[feature] == val]
        if len(subset) == 0:
            # A missing value never compares equal to itself, so it selects
            # no rows; there is nothing to build a branch from.
            continue
        node.children[val] = id3(subset, target_column, remaining_features)
    return node
+
177
+
def print_id3_tree(node, indent=""):
    """Print the ID3 tree rooted at ``node``, one line per node and per branch.

    Leaves print their label. Internal nodes print ``node.value`` as a numeric
    split when it is truthy and the feature name as a categorical split
    otherwise, then each branch key followed by its sub-tree at a deeper indent.
    """
    if node.is_leaf():
        print(f"{indent}Leaf: {node.label}")
        return

    if not node.value:
        heading = f"{indent}[Categorical Split] {node.feature}"
    else:
        heading = f"{indent}[Numeric Split] {node.value}"
    print(heading)

    deeper = indent + " "
    for branch in node.children:
        print(f"{indent}--> {branch}:")
        print_id3_tree(node.children[branch], deeper)
+
190
+
print("\n" + "=" * 60, "ID3 ALGORITHM - Decision Tree (data.csv)", "=" * 60, sep="\n")

# Every column except the target ("Default id") and "Tid" is a candidate feature.
id3_features = [name for name in df.columns if name not in ("Default id", "Tid")]
id3_tree = id3(df, "Default id", id3_features)
print_id3_tree(id3_tree)

# Tennis dataset: only processed when tennis.csv sits next to this file.
tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
if os.path.exists(tennis_path):
    tennis_df = pd.read_csv(tennis_path)
    tennis_features = [name for name in tennis_df.columns if name != "Play"]
    # The tree is built before the banner is printed.
    tennis_tree = id3(tennis_df, "Play", tennis_features)
    print(
        "\n" + "=" * 60,
        "ID3 ALGORITHM - Decision Tree (tennis.csv)",
        "=" * 60,
        sep="\n",
    )
    print_id3_tree(tennis_tree)
209
+
210
+
211
+ # ===========================================================
212
+ # 4. BAGGING CLASSIFIER
213
+ # ===========================================================
214
+
print("\n" + "=" * 60, "BAGGING CLASSIFIER (Iris)", "=" * 60, sep="\n")

# Bagging ensemble of 10 decision trees; both the ensemble and its base tree
# use a fixed seed.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=10,
    random_state=42,
)
bagging_model.fit(X_train, y_train)

# Predictions on the held-out split; reused by the later sections.
y_pred_bag = bagging_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_bag):.4f}")
print(f"First 10 Predictions: {y_pred_bag[:10]}")
227
+
228
+
229
+ # ===========================================================
230
+ # 5. ADABOOST CLASSIFIER
231
+ # ===========================================================
232
+
print("\n" + "=" * 60, "ADABOOST CLASSIFIER (Iris)", "=" * 60, sep="\n")

# AdaBoost over 50 depth-1 trees with learning rate 1.0; seeds are fixed.
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
adaboost_model.fit(X_train, y_train)

# Predictions on the held-out split; reused by the later sections.
y_pred_ada = adaboost_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_ada):.4f}")
print(f"First 10 Predictions: {y_pred_ada[:10]}")
245
+
246
+
247
+ # ===========================================================
248
+ # 6. METRICS & COMPARISON
249
+ # ===========================================================
250
+
def _print_metrics(y_pred):
    """Print the metric report for ``y_pred`` against ``y_test``.

    Prints accuracy, weighted precision/recall/F1, the confusion matrix and
    the per-class classification report, in that order.
    """
    # Accuracy = (TP + TN) / (TP + TN + FP + FN)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    # Precision = TP / (TP + FP)
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    # Recall = TP / (TP + FN)
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    # F1 = 2 * (Precision * Recall) / (Precision + Recall)
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"\n{classification_report(y_test, y_pred, target_names=iris.target_names)}")


print("\n" + "=" * 60, "METRICS - BAGGING", "=" * 60, sep="\n")
_print_metrics(y_pred_bag)

print("=" * 60, "METRICS - ADABOOST", "=" * 60, sep="\n")
_print_metrics(y_pred_ada)

print("=" * 60, "COMPARISON", "=" * 60, sep="\n")
print(f"Bagging Accuracy: {accuracy_score(y_test, y_pred_bag):.4f}")
print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_ada):.4f}")
280
+
281
+
282
+ # ===========================================================
283
+ # 7. VISUALIZATION (Confusion Matrix Plot - matplotlib)
284
+ # ===========================================================
285
+
# One figure with two side-by-side panels: Bagging on the left, AdaBoost on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (y_pred, title) in zip(
    axes, [(y_pred_bag, "Bagging"), (y_pred_ada, "AdaBoost")]
):
    cm = confusion_matrix(y_test, y_pred)
    im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    ax.set_title(f"{title} - Confusion Matrix", fontsize=12, fontweight="bold")
    plt.colorbar(im, ax=ax)

    # One tick per class on both axes, labelled with the class names.
    ticks = np.arange(len(iris.target_names))
    ax.set_xticks(ticks)
    ax.set_xticklabels(iris.target_names, rotation=45, ha="right")
    ax.set_yticks(ticks)
    ax.set_yticklabels(iris.target_names)

    # Write each count into its cell: white text on cells above half the
    # largest count, black text on the rest.
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j,
                i,
                format(cm[i, j], "d"),
                ha="center",
                va="center",
                color="black" if not cm[i, j] > thresh else "white",
            )

    ax.set_ylabel("True Label")
    ax.set_xlabel("Predicted Label")

plt.tight_layout()
plt.savefig("all_confusion_matrices.png", dpi=150)
plt.show()
print("\nConfusion matrix plots saved as 'all_confusion_matrices.png'")