id3-classification 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: id3-classification
3
+ Version: 0.2.0
4
+ Summary: Reusable ID3 Decision Tree Classifier
5
+ Author: nanashi
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Dynamic: license-file
13
+
14
+ # ID3 Classifier
15
+
16
+ Reusable ID3 Decision Tree Algorithm.
@@ -0,0 +1,7 @@
1
+ id3_classification-0.2.0.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ myid3/__init__.py,sha256=ICY5EHtGpg6LpP9ds8E2cuOuW5BWaRte1he9kEMKzCk,30
3
+ myid3/id3.py,sha256=QLOmYQ4HH2fWy0yN0W8oE03Tc6PjbQ3uzG43lz29row,3350
4
+ id3_classification-0.2.0.dist-info/METADATA,sha256=TIqxGOY0yOBaWXXF3hvQvx8yPmBzvy-agqiTDEM07sI,361
5
+ id3_classification-0.2.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
6
+ id3_classification-0.2.0.dist-info/top_level.txt,sha256=rABkUX8bSJrfUweW7BUmElF9KsRSAkSLbAuHYOsUSf0,6
7
+ id3_classification-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
File without changes
@@ -0,0 +1 @@
1
+ myid3
myid3/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .id3 import ID3Classifier
myid3/id3.py ADDED
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+ import math
3
+
4
+
5
class ID3Classifier:
    """ID3 decision tree classifier for categorical features.

    Learns a tree by recursively choosing the feature with the highest
    information gain. The learned tree is a nested dict of the form
    ``{feature: {value: subtree_or_label}}``; a leaf is a bare label.

    Expects a pandas-like DataFrame (column indexing, ``.mode()``,
    ``.iterrows()``) with discrete/categorical columns.
    """

    def __init__(self):
        # Learned tree (nested dict / leaf label); None until fit() is called.
        self.tree = None
        # Name of the target (label) column; set by fit().
        self.target = None
        # Feature column names used for training; set by fit().
        self.features = None

    # ===========================
    # ENTROPY
    # ===========================
    def entropy(self, data):
        """Return the Shannon entropy (in bits) of the label sequence *data*.

        Parameters
        ----------
        data : array-like
            Sequence of class labels.

        Returns
        -------
        float
            Entropy in bits; 0 for a pure (single-class) or empty sequence.
        """
        _, counts = np.unique(data, return_counts=True)
        total = counts.sum()  # hoisted: was recomputed once per class
        ent = 0.0
        for count in counts:
            # counts from np.unique are always >= 1, so prob > 0 and
            # log2 is safe without an explicit guard.
            prob = count / total
            ent -= prob * math.log2(prob)
        return ent

    # ===========================
    # INFORMATION GAIN
    # ===========================
    def information_gain(self, df, feature):
        """Return the information gain of splitting *df* on *feature*.

        Gain = H(target) - sum_v P(feature=v) * H(target | feature=v).

        Parameters
        ----------
        df : DataFrame
            Data containing both *feature* and the target column.
        feature : str
            Column name to evaluate as a split.

        Returns
        -------
        float
            Reduction in entropy of ``self.target`` achieved by the split.
        """
        total_entropy = self.entropy(df[self.target])

        values, counts = np.unique(df[feature], return_counts=True)
        total = counts.sum()  # hoisted out of the loop

        weighted_entropy = 0.0
        for value, count in zip(values, counts):
            subset = df[df[feature] == value]
            weighted_entropy += (count / total) * self.entropy(subset[self.target])

        return total_entropy - weighted_entropy

    # ===========================
    # ID3 TREE BUILDER
    # ===========================
    def _id3(self, data, features):
        """Recursively build a subtree for *data* using *features*.

        Returns either a leaf label (pure node, or majority label when no
        features remain) or a nested dict ``{best_feature: {value: subtree}}``.
        """
        # Pure node: all rows share one label.
        labels = np.unique(data[self.target])
        if len(labels) == 1:
            return labels[0]

        # No features left: fall back to the majority label.
        if len(features) == 0:
            return data[self.target].mode()[0]

        # Split on the feature with maximal information gain.
        gains = [self.information_gain(data, f) for f in features]
        best = features[int(np.argmax(gains))]

        tree = {best: {}}
        remaining = [f for f in features if f != best]

        for value in np.unique(data[best]):
            subset = data[data[best] == value]

            if len(subset) == 0:
                # Defensive: np.unique over data[best] should never yield an
                # empty subset, but keep the majority-label fallback.
                tree[best][value] = data[self.target].mode()[0]
            else:
                tree[best][value] = self._id3(subset, remaining)

        return tree

    # ===========================
    # FIT MODEL
    # ===========================
    def fit(self, df, features, target):
        """Train the tree on *df*.

        Parameters
        ----------
        df : DataFrame
            Training data.
        features : list of str
            Feature column names to consider for splits.
        target : str
            Name of the label column.

        Returns
        -------
        ID3Classifier
            ``self``, to allow call chaining (backward compatible: the
            original returned None, which callers ignored).
        """
        self.target = target
        self.features = features
        self.tree = self._id3(df, features)
        return self

    # ===========================
    # PRINT TREE
    # ===========================
    def print_tree(self, tree=None, indent=""):
        """Pretty-print *tree* (defaults to the fitted tree) to stdout."""
        if tree is None:
            tree = self.tree

        # Leaf: print the label and stop.
        if not isinstance(tree, dict):
            print(indent + "→", tree)
            return

        for key in tree:
            print(indent + key)
            for value in tree[key]:
                print(indent + " ├─", value)
                self.print_tree(tree[key][value], indent + " │ ")

    # ===========================
    # SINGLE PREDICT
    # ===========================
    def _predict(self, tree, sample):
        """Classify one *sample* (mapping feature -> value) with *tree*.

        Returns the predicted label, or None when the sample has a feature
        value that was never seen at this node during training.
        """
        if not isinstance(tree, dict):
            return tree  # reached a leaf

        root = list(tree.keys())[0]
        value = sample[root]

        if value not in tree[root]:
            return None  # unseen feature value

        return self._predict(tree[root][value], sample)

    # ===========================
    # MULTIPLE PREDICT
    # ===========================
    def predict(self, df):
        """Return an array of predicted labels, one per row of *df*.

        Rows with unseen feature values yield None.
        """
        return np.array([self._predict(self.tree, row) for _, row in df.iterrows()])