id3-classification 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: id3-classification
3
+ Version: 0.2.0
4
+ Summary: Reusable ID3 Decision Tree Classifier
5
+ Author: nanashi
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Dynamic: license-file
13
+
14
+ # ID3 Classifier
15
+
16
+ Reusable ID3 Decision Tree Algorithm.
@@ -0,0 +1,3 @@
1
+ # ID3 Classifier
2
+
3
+ Reusable ID3 Decision Tree Algorithm.
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+
7
+ name = "id3-classification" # MUST be unique worldwide
8
+ version = "0.2.0"
9
+
10
+ description = "Reusable ID3 Decision Tree Classifier"
11
+
12
+ readme = "README.md"
13
+
14
+ requires-python = ">=3.8"
15
+
16
+ authors = [
17
+ {name="nanashi"}
18
+ ]
19
+
20
+ license = {text="MIT"}
21
+
22
+ dependencies = [
23
+ "numpy",
24
+ "pandas"
25
+ ]
26
+
27
+ [tool.setuptools.packages.find]
28
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: id3-classification
3
+ Version: 0.2.0
4
+ Summary: Reusable ID3 Decision Tree Classifier
5
+ Author: nanashi
6
+ License: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: numpy
11
+ Requires-Dist: pandas
12
+ Dynamic: license-file
13
+
14
+ # ID3 Classifier
15
+
16
+ Reusable ID3 Decision Tree Algorithm.
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/id3_classification.egg-info/PKG-INFO
5
+ src/id3_classification.egg-info/SOURCES.txt
6
+ src/id3_classification.egg-info/dependency_links.txt
7
+ src/id3_classification.egg-info/requires.txt
8
+ src/id3_classification.egg-info/top_level.txt
9
+ src/myid3/__init__.py
10
+ src/myid3/id3.py
@@ -0,0 +1 @@
1
+ from .id3 import ID3Classifier
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+ import math
3
+
4
+
5
class ID3Classifier:
    """ID3 decision tree classifier for categorical features.

    Builds a nested-dict tree by recursively choosing the feature with the
    highest information gain. Expects a pandas DataFrame whose feature and
    target columns hold hashable (categorical) values.
    """

    def __init__(self):
        self.tree = None      # nested dict {feature: {value: subtree-or-label}}
        self.target = None    # name of the target column, set by fit()
        self.features = None  # feature column names passed to fit()

    # ===========================
    # ENTROPY
    # ===========================
    def entropy(self, data):
        """Return the Shannon entropy (base 2) of the label sequence *data*."""
        _, counts = np.unique(data, return_counts=True)

        # Hoisted: the total was previously re-summed on every loop iteration.
        total = counts.sum()

        ent = 0.0
        for count in counts:
            # counts from np.unique are always >= 1, so prob > 0 is guaranteed.
            prob = count / total
            ent -= prob * math.log2(prob)

        return ent

    # ===========================
    # INFORMATION GAIN
    # ===========================
    def information_gain(self, df, feature):
        """Return the information gain of splitting DataFrame *df* on *feature*."""
        total_entropy = self.entropy(df[self.target])

        values, counts = np.unique(df[feature], return_counts=True)
        total = counts.sum()  # hoisted out of the loop

        weighted_entropy = 0.0
        for value, count in zip(values, counts):
            subset = df[df[feature] == value]
            weighted_entropy += (count / total) * self.entropy(subset[self.target])

        return total_entropy - weighted_entropy

    # ===========================
    # ID3 TREE BUILDER
    # ===========================
    def _id3(self, data, features):
        """Recursively build the decision tree for *data* over *features*.

        Returns either a class label (leaf) or a nested dict
        {best_feature: {value: subtree}}.
        """
        labels = np.unique(data[self.target])

        # Pure node: only one class remains.
        if len(labels) == 1:
            return labels[0]

        # No features left: fall back to the majority class.
        if len(features) == 0:
            return data[self.target].mode()[0]

        gains = [self.information_gain(data, f) for f in features]
        best = features[np.argmax(gains)]

        tree = {best: {}}
        remaining = [f for f in features if f != best]

        for value in np.unique(data[best]):
            # The subset can never be empty here — *value* is drawn from
            # data[best] itself — so the old empty-subset branch was dead code.
            subset = data[data[best] == value]
            tree[best][value] = self._id3(subset, remaining)

        return tree

    # ===========================
    # FIT MODEL
    # ===========================
    def fit(self, df, features, target):
        """Fit the tree on DataFrame *df*, using *features* to predict *target*."""
        self.target = target
        self.features = features
        self.tree = self._id3(df, features)

    # ===========================
    # PRINT TREE
    # ===========================
    def print_tree(self, tree=None, indent=""):
        """Pretty-print *tree* (defaults to the fitted tree) to stdout."""
        if tree is None:
            tree = self.tree

        # Leaf: print the predicted label.
        if not isinstance(tree, dict):
            print(indent + "→", tree)
            return

        for key in tree:
            print(indent + key)
            for value in tree[key]:
                print(indent + " ├─", value)
                self.print_tree(tree[key][value], indent + " │ ")

    # ===========================
    # SINGLE PREDICT
    # ===========================
    def _predict(self, tree, sample):
        """Classify one *sample* (mapping of feature -> value).

        Returns the predicted label, or None when the sample holds a feature
        value never seen during training (no branch to follow).
        """
        if not isinstance(tree, dict):
            return tree

        root = next(iter(tree))
        value = sample[root]

        if value not in tree[root]:
            return None  # unseen category

        return self._predict(tree[root][value], sample)

    # ===========================
    # MULTIPLE PREDICT
    # ===========================
    def predict(self, df):
        """Return a numpy array with one prediction per row of DataFrame *df*."""
        return np.array([self._predict(self.tree, row) for _, row in df.iterrows()])