bm-preprocessing 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/PKG-INFO +2 -1
  2. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/USAGE.md +26 -1
  3. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/pyproject.toml +2 -1
  4. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/__init__.py +5 -1
  5. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/bagging.py +30 -0
  6. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/id3.py +30 -0
  7. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/id3_test.py +30 -0
  8. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/bagging.py +173 -0
  9. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/id3.py +134 -0
  10. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/id3_test.py +148 -0
  11. bm_preprocessing-0.5.0/src/bm_preprocessing/DM/sources/tennis.csv +15 -0
  12. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/.gitignore +0 -0
  13. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/README.md +0 -0
  14. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/all.py +0 -0
  15. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  16. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hash.py +0 -0
  17. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  18. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  19. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  20. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  21. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  22. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  23. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  24. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  25. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  26. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  27. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  28. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/all.py +0 -0
  29. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  30. {bm_preprocessing-0.3.0 → bm_preprocessing-0.5.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
7
7
  Requires-Dist: graphviz>=0.20.3
8
8
  Requires-Dist: matplotlib>=3.7.5
9
9
  Requires-Dist: pandas>=2.0.3
10
+ Requires-Dist: scikit-learn>=1.3.2
10
11
  Requires-Dist: twine>=6.1.0
11
12
  Description-Content-Type: text/markdown
12
13
 
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, preprocessing
18
+ from bm_preprocessing.DM import apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -24,6 +24,9 @@ print(all)
24
24
  print("\n=== DM Apriori Module ===")
25
25
  print(apriori)
26
26
 
27
+ print("\n=== DM Bagging Module ===")
28
+ print(bagging)
29
+
27
30
  print("\n=== DM Hash Module ===")
28
31
  print(hash)
29
32
 
@@ -33,6 +36,12 @@ print(hunts)
33
36
  print("\n=== DM Hunts Test Module ===")
34
37
  print(hunts_test)
35
38
 
39
+ print("\n=== DM ID3 Module ===")
40
+ print(id3)
41
+
42
+ print("\n=== DM ID3 Test Module ===")
43
+ print(id3_test)
44
+
36
45
  print("\n=== DM Preprocessing Module ===")
37
46
  print(preprocessing)
38
47
  ```
@@ -61,11 +70,21 @@ Then in the Python REPL:
61
70
  >>> print(apriori)
62
71
  # Prints entire DM/apriori.py source code
63
72
 
73
+ >>> from bm_preprocessing.DM import bagging
74
+ >>> print(bagging)
75
+ # Prints entire DM/bagging.py source code
76
+
64
77
  >>> from bm_preprocessing.DM import hunts, hunts_test
65
78
  >>> print(hunts)
66
79
  # Prints entire DM/hunts.py source code
67
80
  >>> print(hunts_test)
68
81
  # Prints entire DM/hunts_test.py source code
82
+
83
+ >>> from bm_preprocessing.DM import id3, id3_test
84
+ >>> print(id3)
85
+ # Prints entire DM/id3.py source code
86
+ >>> print(id3_test)
87
+ # Prints entire DM/id3_test.py source code
69
88
  ```
70
89
 
71
90
  ---
@@ -75,9 +94,12 @@ Then in the Python REPL:
75
94
  ```bash
76
95
  python -c "from bm_preprocessing.IR import all; print(all)"
77
96
  python -c "from bm_preprocessing.DM import apriori; print(apriori)"
97
+ python -c "from bm_preprocessing.DM import bagging; print(bagging)"
78
98
  python -c "from bm_preprocessing.DM import hash; print(hash)"
79
99
  python -c "from bm_preprocessing.DM import hunts; print(hunts)"
80
100
  python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
101
+ python -c "from bm_preprocessing.DM import id3; print(id3)"
102
+ python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
81
103
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
82
104
  ```
83
105
 
@@ -90,7 +112,10 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
90
112
  | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
91
113
  | `from bm_preprocessing.DM import all` | Data Mining algorithms |
92
114
  | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
115
+ | `from bm_preprocessing.DM import bagging` | Bagging ensemble classifier |
93
116
  | `from bm_preprocessing.DM import hash` | Hash-based mining |
94
117
  | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
95
118
  | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
119
+ | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
120
+ | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
96
121
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.3.0"
7
+ version = "0.5.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,6 +13,7 @@ dependencies = [
13
13
  "graphviz>=0.20.3",
14
14
  "matplotlib>=3.7.5",
15
15
  "pandas>=2.0.3",
16
+ "scikit-learn>=1.3.2",
16
17
  "twine>=6.1.0",
17
18
  ]
18
19
 
@@ -2,9 +2,13 @@
2
2
 
3
3
  from .all import all
4
4
  from .apriori import apriori
5
+ from .bagging import bagging
5
6
  from .hash import hash
6
7
  from .hunts import hunts
7
8
  from .hunts_test import hunts_test
9
+ from .id3 import id3
10
+ from .id3_test import id3_test
8
11
  from .preprocessing import preprocessing
9
12
 
10
- __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "preprocessing"]
13
+ __all__ = ["all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
14
+
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/bagging.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "bagging.py"
30
+ bagging = SourceCodeModule("DM.bagging", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3.py"
30
+ id3 = SourceCodeModule("DM.id3", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3_test.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3_test.py"
30
+ id3_test = SourceCodeModule("DM.id3_test", _source_file)
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.model_selection import train_test_split
4
+ from collections import Counter
5
+
6
# Demo data: the iris dataset bundled with scikit-learn.
# NOTE: this runs at import time.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 split; stratify=y keeps the per-class proportions equal in both
# halves, and random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
13
+
14
def gini(y):
    """Gini impurity of integer class labels `y`: 1 - sum_k p_k**2."""
    proportions = np.bincount(y) / len(y)
    return 1 - np.sum(proportions * proportions)
18
+
19
class TreeNode:
    """One node of a depth-limited binary decision tree split on Gini impurity.

    A leaf stores its class label in `value`; an internal node stores a
    (feature, threshold) pair and left/right child subtrees.
    """

    def __init__(self, depth=0, max_depth=3):
        self.depth = depth          # distance from the root
        self.max_depth = max_depth  # growth limit inherited by children
        self.feature = None         # column index of the chosen split
        self.threshold = None       # rows with feature <= threshold go left
        self.left = None
        self.right = None
        self.value = None           # class label when this node is a leaf

    def fit(self, X, y):
        """Grow the subtree rooted at this node from training data (X, y)."""
        # Pure partition: nothing left to split.
        if len(set(y)) == 1:
            self.value = y[0]
            return
        # Depth limit reached or tiny partition: majority-vote leaf.
        if self.depth >= self.max_depth or len(y) <= 2:
            self.value = Counter(y).most_common(1)[0][0]
            return

        n_samples, n_features = X.shape
        best_score = 1.0
        # Exhaustive search: every observed value of every feature is a
        # candidate threshold; keep the split with the lowest weighted Gini.
        for col in range(n_features):
            for candidate in np.unique(X[:, col]):
                go_left = X[:, col] <= candidate
                go_right = X[:, col] > candidate
                if sum(go_left) == 0 or sum(go_right) == 0:
                    continue
                score = (sum(go_left) / n_samples) * gini(y[go_left]) \
                    + (sum(go_right) / n_samples) * gini(y[go_right])
                if score < best_score:
                    best_score = score
                    self.feature = col
                    self.threshold = candidate

        # No usable split exists (e.g. every feature is constant): make a leaf.
        if self.feature is None:
            self.value = Counter(y).most_common(1)[0][0]
            return

        go_left = X[:, self.feature] <= self.threshold
        go_right = X[:, self.feature] > self.threshold
        self.left = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.fit(X[go_left], y[go_left])
        self.right = TreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.fit(X[go_right], y[go_right])

    def predict(self, X):
        """Return one predicted class label per row of X."""
        if self.value is not None:
            return np.array([self.value] * len(X))
        go_left = X[:, self.feature] <= self.threshold
        go_right = X[:, self.feature] > self.threshold
        predictions = np.empty(X.shape[0], dtype=int)
        if sum(go_left) > 0:
            predictions[go_left] = self.left.predict(X[go_left])
        if sum(go_right) > 0:
            predictions[go_right] = self.right.predict(X[go_right])
        return predictions
72
+
73
# --- Section 1: manual bagging with the hand-written TreeNode ---------------

# Fixed seed so the bootstrap samples (and therefore the ensemble) are
# reproducible run-to-run.
np.random.seed(42)
n_estimators = 10
models = []

for i in range(n_estimators):
    # Bootstrap sample: draw len(X_train) row indices with replacement.
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample = X_train[indices]
    y_sample = y_train[indices]
    tree = TreeNode(max_depth=3)
    tree.fit(X_sample, y_sample)
    models.append(tree)

# Shape (n_estimators, n_test): one row of predictions per fitted tree.
all_preds = np.array([model.predict(X_test) for model in models])
final_preds = []

# Majority vote across the ensemble, one test sample at a time.
for i in range(len(X_test)):
    votes = Counter(all_preds[:, i])
    final_preds.append(votes.most_common(1)[0][0])

final_preds = np.array(final_preds)
accuracy = np.sum(final_preds == y_test) / len(y_test)

print("First 10 Predictions:", final_preds[:10])
print("Accuracy:", accuracy)

# Section marker (bare string expression, no runtime effect): the same task
# done entirely with scikit-learn follows.
"""Fully libraries"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

# Same 70/30 stratified, seeded split as in section 1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

base_classifier = DecisionTreeClassifier(random_state=42)

# Bagging ensemble of 10 decision trees, all seeded for reproducibility.
bagging_model = BaggingClassifier(
    estimator=base_classifier,
    n_estimators=10,
    random_state=42
)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Bagging Classifier Accuracy:", accuracy)

# Section marker: manual bootstrap + voting, but with sklearn's tree as the
# base learner.
"""Only DT as library function"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import numpy as np

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Re-seed so this section's bootstrap draws are reproducible on their own.
np.random.seed(42)
n_estimators = 10
estimators = []

for i in range(n_estimators):
    # Bootstrap sample, as in section 1.
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_sample = X_train[indices]
    y_sample = y_train[indices]

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_sample, y_sample)
    estimators.append(tree)

all_preds = np.array([tree.predict(X_test) for tree in estimators])

# Per-sample majority vote over the 10 trees.
final_preds = []
for i in range(len(X_test)):
    votes = Counter(all_preds[:, i])
    final_preds.append(votes.most_common(1)[0][0])
final_preds = np.array(final_preds)

accuracy = accuracy_score(y_test, final_preds)
print("First 10 Predictions:", final_preds[:10])
print("Manual Bagging Accuracy:", accuracy)
@@ -0,0 +1,134 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+
5
+
6
# Load and process data
# NOTE: executed at import time — importing this module reads data.csv from
# the package directory.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)
# Normalize "Annual Income": strip the trailing "K" suffix and any spaces,
# then scale the thousands figure up to an absolute integer amount.
df["Annual Income"] = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
    * 1000
)
17
+
18
+
19
+ # Entropy calculation
20
+ def entropy(df, target_column):
21
+ counts = df[target_column].value_counts()
22
+ probs = counts / len(df)
23
+ return -sum(probs * np.log2(probs))
24
+
25
+
26
def information_gain(df, feature, target_column):
    """Entropy of `df` minus the weighted entropy after splitting on `feature`."""
    parent_entropy = entropy(df, target_column)
    n = len(df)

    # Entropy of each partition, weighted by its share of the rows.
    weighted_child_entropy = 0
    for feature_value in df[feature].unique():
        partition = df[df[feature] == feature_value]
        weighted_child_entropy += (len(partition) / n) * entropy(partition, target_column)

    return parent_entropy - weighted_child_entropy
38
+
39
+
40
def best_feature(df, feature_columns, target_column):
    """Feature from `feature_columns` with the highest information gain.

    Ties break in favor of the earliest feature in `feature_columns`.
    """
    return max(
        feature_columns,
        key=lambda feature: information_gain(df, feature, target_column),
    )
47
+
48
+
49
class ID3Node:
    """Node of an ID3 decision tree.

    Internal nodes carry `feature` (plus, for numeric splits, a readable
    `value` description) and a `children` mapping of branch value -> subtree.
    Leaves carry a class `label` instead.
    """

    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature
        self.value = value
        self.label = label
        self.children = {}

    def is_leaf(self):
        """True when this node predicts a class directly."""
        return self.label is not None
59
+
60
+
61
# ID3 algorithm
def id3(df, target_column, feature_columns):
    """Recursively build an ID3 decision tree over `df`.

    Categorical features branch on every observed value; numeric features get
    a binary split at the column median. Returns the root `ID3Node`.
    """
    # Pure partition: every row already has the same class.
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # No features left to split on: majority-class leaf.
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)
    # Majority class of the current partition — fallback label for any empty
    # split side below.
    majority_label = df[target_column].mode()[0]
    # Each feature is consumed once per path.
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        left_df = df[df[feature] <= median_value]
        right_df = df[df[feature] > median_value]

        node.value = f"{feature} <= {median_value}"

        # BUG FIX: the previous version recursed into empty partitions.
        # When a numeric feature's maximum equals its median (e.g. an
        # all-equal column), right_df is empty and the recursion eventually
        # crashed with IndexError on `mode()[0]` of an empty frame. An empty
        # side now becomes a majority-class leaf instead.
        node.children["<= " + str(median_value)] = (
            id3(left_df, target_column, remaining_features)
            if not left_df.empty
            else ID3Node(label=majority_label)
        )
        node.children["> " + str(median_value)] = (
            id3(right_df, target_column, remaining_features)
            if not right_df.empty
            else ID3Node(label=majority_label)
        )
    else:
        # One branch per observed category; these subsets are never empty.
        for val in df[feature].unique():
            subset = df[df[feature] == val]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node
96
+
97
+
98
# Print tree function
def print_id3_tree(node, indent=""):
    """Pretty-print the subtree rooted at `node`, one line per node."""
    if node.is_leaf():
        print(f"{indent}Leaf: {node.label}")
        return

    # Numeric splits carry a readable description in `value`; categorical
    # splits are identified by the feature name alone.
    header = (
        f"{indent}[Numeric Split] {node.value}"
        if node.value
        else f"{indent}[Categorical Split] {node.feature}"
    )
    print(header)

    for branch_value, subtree in node.children.items():
        print(f"{indent}--> {branch_value}:")
        print_id3_tree(subtree, indent + " ")
112
+
113
+
114
def main():
    """Build and print ID3 trees for the bundled data.csv and tennis.csv."""
    # Candidate split features: every column except the target ("Default id")
    # and the row identifier ("Tid").
    feature_columns = [col for col in df.columns if col not in ["Default id", "Tid"]]

    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    print("=== ID3 Algorithm - Decision Tree (data.csv) ===\n")
    print_id3_tree(tree_root)

    # Tennis dataset
    # Second demo: the play-tennis CSV shipped next to this file.
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)

    # Everything except the "Play" target is a feature.
    tennis_features = [col for col in tennis_df.columns if col != "Play"]
    tennis_tree = id3(tennis_df, target_column="Play", feature_columns=tennis_features)

    print("\n\n=== ID3 Algorithm - Decision Tree (tennis.csv) ===\n")
    print_id3_tree(tennis_tree)


if __name__ == "__main__":
    main()
@@ -0,0 +1,148 @@
1
"""ID3 decision tree with graphviz visualization.

Duplicates the algorithm from id3.py but renders the trees to PNG files
instead of printing them. NOTE(review): importing this module reads data.csv
from the package directory, and main() shells out to graphviz and opens the
rendered images (`view=True`).
"""

import pandas as pd
import numpy as np
import graphviz
import os


# Load and process data
# Normalize "Annual Income": strip the "K" suffix and spaces, then scale the
# thousands figure up to an absolute integer amount.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)
df["Annual Income"] = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
    * 1000
)


# Entropy calculation
# Shannon entropy (base 2) of the class distribution in `target_column`.
def entropy(df, target_column):
    counts = df[target_column].value_counts()
    probs = counts / len(df)
    return -sum(probs * np.log2(probs))


# Information gain calculation
# Entropy of `df` minus the weighted entropy of its partitions on `feature`.
def information_gain(df, feature, target_column):
    total_entropy = entropy(df, target_column)

    values = df[feature].unique()

    weighted_entropy = 0
    for value in values:
        subset = df[df[feature] == value]
        weighted_entropy += (len(subset) / len(df)) * entropy(subset, target_column)

    return total_entropy - weighted_entropy


# Best feature selection
# Returns the feature with the highest information gain (ties break toward
# the earliest feature in `feature_columns`).
def best_feature(df, feature_columns, target_column):
    gains = {
        feature: information_gain(df, feature, target_column)
        for feature in feature_columns
    }
    return max(gains, key=gains.get)


# Node class
# Internal nodes carry `feature` (and a readable `value` for numeric splits)
# plus a `children` mapping; leaves carry a class `label`.
class ID3Node:
    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature
        self.value = value
        self.children = {}
        self.label = label

    def is_leaf(self):
        # A node is a leaf exactly when it predicts a class directly.
        return self.label is not None


# ID3 algorithm
# Categorical features branch on every observed value; numeric features get
# a binary split at the column median. Each feature is consumed once per path.
def id3(df, target_column, feature_columns):
    # If the target column is pure, return a leaf node
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # If no features left, return leaf with majority class
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        left_df = df[df[feature] <= median_value]
        right_df = df[df[feature] > median_value]

        node.value = f"{feature} <= {median_value}"

        # NOTE(review): if every value of a numeric feature equals its median,
        # right_df is empty and the recursion can fail on `mode()[0]` of an
        # empty frame — same issue as in id3.py; confirm against the data.
        remaining_features = [col for col in feature_columns if col != feature]
        node.children["<= " + str(median_value)] = id3(
            left_df, target_column, remaining_features
        )
        node.children["> " + str(median_value)] = id3(
            right_df, target_column, remaining_features
        )
    else:
        unique_vals = df[feature].unique()
        for val in unique_vals:
            subset = df[df[feature] == val]
            remaining_features = [col for col in feature_columns if col != feature]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node


# Tree visualization function using graphviz
# Walks the tree, adding one graphviz node per tree node (ellipse for leaves,
# box for splits) and one labelled edge per branch. Returns the Digraph.
def visualize_id3_tree(node, parent_name="Root", graph=None):
    if graph is None:
        graph = graphviz.Digraph(format="png", engine="dot")

    if node.is_leaf():
        graph.node(parent_name, label=str(node.label), shape="ellipse")
    else:
        if node.value:
            label = node.value
        else:
            label = str(node.feature)
        graph.node(parent_name, label=label, shape="box")

    # Child names embed the full path, keeping graphviz node ids unique.
    # (Leaves have no children, so this loop is a no-op for them.)
    for val, child in node.children.items():
        child_name = f"{parent_name}_{val}"
        graph.edge(parent_name, child_name, label=str(val))
        visualize_id3_tree(child, child_name, graph)

    return graph


def main():
    """Build both trees and render them to PNG via graphviz."""
    # Candidate features: everything except the target and the row id.
    feature_columns = [col for col in df.columns if col not in ["Default id", "Tid"]]

    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    # Visualize the tree using graphviz
    # render() writes <output_path>.png, opens it (view=True) and deletes the
    # intermediate DOT source (cleanup=True).
    graph = visualize_id3_tree(tree_root)
    output_path = os.path.join(os.path.dirname(__file__), "id3_decision_tree")
    graph.render(output_path, view=True, cleanup=True)
    print(f"Decision tree rendered and saved as '{output_path}.png'")

    # Tennis dataset
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)

    tennis_features = [col for col in tennis_df.columns if col != "Play"]
    tennis_tree = id3(tennis_df, target_column="Play", feature_columns=tennis_features)

    graph_tennis = visualize_id3_tree(tennis_tree)
    tennis_output_path = os.path.join(
        os.path.dirname(__file__), "id3_tennis_decision_tree"
    )
    graph_tennis.render(tennis_output_path, view=True, cleanup=True)
    print(f"Tennis decision tree rendered and saved as '{tennis_output_path}.png'")


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
1
+ Outlook,Temperature,Humidity,Wind,Play
2
+ Sunny,Hot,High,Weak,No
3
+ Sunny,Hot,High,Strong,No
4
+ Overcast,Hot,High,Weak,Yes
5
+ Rain,Mild,High,Weak,Yes
6
+ Rain,Cool,Normal,Weak,Yes
7
+ Rain,Cool,Normal,Strong,No
8
+ Overcast,Cool,Normal,Strong,Yes
9
+ Sunny,Mild,High,Weak,No
10
+ Sunny,Cool,Normal,Weak,Yes
11
+ Rain,Mild,Normal,Weak,Yes
12
+ Sunny,Mild,Normal,Strong,Yes
13
+ Overcast,Mild,High,Strong,Yes
14
+ Overcast,Hot,Normal,Weak,Yes
15
+ Rain,Mild,High,Strong,No