id3-classification 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- id3_classification-0.2.0/LICENSE +0 -0
- id3_classification-0.2.0/PKG-INFO +16 -0
- id3_classification-0.2.0/README.md +3 -0
- id3_classification-0.2.0/pyproject.toml +28 -0
- id3_classification-0.2.0/setup.cfg +4 -0
- id3_classification-0.2.0/src/id3_classification.egg-info/PKG-INFO +16 -0
- id3_classification-0.2.0/src/id3_classification.egg-info/SOURCES.txt +10 -0
- id3_classification-0.2.0/src/id3_classification.egg-info/dependency_links.txt +1 -0
- id3_classification-0.2.0/src/id3_classification.egg-info/requires.txt +2 -0
- id3_classification-0.2.0/src/id3_classification.egg-info/top_level.txt +1 -0
- id3_classification-0.2.0/src/myid3/__init__.py +1 -0
- id3_classification-0.2.0/src/myid3/id3.py +163 -0
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: id3-classification
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Reusable ID3 Decision Tree Classifier
|
|
5
|
+
Author: nanashi
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# ID3 Classifier
|
|
15
|
+
|
|
16
|
+
Reusable ID3 Decision Tree Algorithm.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
|
|
7
|
+
name = "id3-classification" # MUST be unique worldwide
|
|
8
|
+
version = "0.2.0"
|
|
9
|
+
|
|
10
|
+
description = "Reusable ID3 Decision Tree Classifier"
|
|
11
|
+
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
|
|
14
|
+
requires-python = ">=3.8"
|
|
15
|
+
|
|
16
|
+
authors = [
|
|
17
|
+
{name="nanashi"}
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
license = {text="MIT"}
|
|
21
|
+
|
|
22
|
+
dependencies = [
|
|
23
|
+
"numpy",
|
|
24
|
+
"pandas"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: id3-classification
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Reusable ID3 Decision Tree Classifier
|
|
5
|
+
Author: nanashi
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# ID3 Classifier
|
|
15
|
+
|
|
16
|
+
Reusable ID3 Decision Tree Algorithm.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/id3_classification.egg-info/PKG-INFO
|
|
5
|
+
src/id3_classification.egg-info/SOURCES.txt
|
|
6
|
+
src/id3_classification.egg-info/dependency_links.txt
|
|
7
|
+
src/id3_classification.egg-info/requires.txt
|
|
8
|
+
src/id3_classification.egg-info/top_level.txt
|
|
9
|
+
src/myid3/__init__.py
|
|
10
|
+
src/myid3/id3.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
myid3
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .id3 import ID3Classifier
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import math
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ID3Classifier:
    """ID3 decision-tree classifier for categorical features.

    Train with :meth:`fit`, then classify with :meth:`predict`.
    The learned tree is a nested dict of the form
    ``{feature: {value: subtree-or-leaf-label}}``; a leaf is any
    non-dict value (a target label).
    """

    def __init__(self):
        # Learned nested-dict tree; None until fit() has been called.
        self.tree = None
        # Name of the target column in the training DataFrame.
        self.target = None
        # Feature column names supplied to fit().
        self.features = None

    # ===========================
    # ENTROPY
    # ===========================
    def entropy(self, data):
        """Return the Shannon entropy (base 2) of the labels in ``data``.

        Parameters
        ----------
        data : array-like
            Sequence of (hashable) class labels.

        Returns
        -------
        float
            Entropy in bits; 0.0 for empty input (by convention).
        """
        _, counts = np.unique(data, return_counts=True)
        total = counts.sum()
        # Guard empty input; the original would divide by zero here.
        if total == 0:
            return 0.0
        ent = 0.0
        for count in counts:
            # `total` hoisted out of the loop — the original recomputed
            # sum(counts) on every iteration.
            prob = count / total
            if prob > 0:
                ent -= prob * math.log2(prob)
        return ent

    # ===========================
    # INFORMATION GAIN
    # ===========================
    def information_gain(self, df, feature):
        """Return the information gain of splitting ``df`` on ``feature``.

        IG = H(target) - sum_v P(feature = v) * H(target | feature = v).

        Parameters
        ----------
        df : pandas.DataFrame
            Data containing both ``feature`` and the target column.
        feature : str
            Candidate split column.

        Returns
        -------
        float
            Reduction in target entropy achieved by the split.
        """
        total_entropy = self.entropy(df[self.target])
        values, counts = np.unique(df[feature], return_counts=True)
        total = counts.sum()  # hoisted; original summed per iteration
        weighted_entropy = 0.0
        for value, count in zip(values, counts):
            subset = df[df[feature] == value]
            weighted_entropy += (count / total) * self.entropy(subset[self.target])
        return total_entropy - weighted_entropy

    # ===========================
    # ID3 TREE BUILDER
    # ===========================
    def _id3(self, data, features):
        """Recursively build a (sub)tree for ``data`` over ``features``.

        Returns either a leaf label or a ``{feature: {value: subtree}}``
        dict. Ties in information gain are broken by np.argmax, i.e. the
        earliest feature in ``features`` wins.
        """
        labels = np.unique(data[self.target])
        # Pure node: every remaining row carries the same label.
        if len(labels) == 1:
            return labels[0]
        # No features left to split on: fall back to the majority label.
        if len(features) == 0:
            return data[self.target].mode()[0]

        # Greedy ID3 choice: split on the highest-information-gain feature.
        igs = [self.information_gain(data, f) for f in features]
        best = features[np.argmax(igs)]
        tree = {best: {}}
        remaining = [f for f in features if f != best]

        for value in np.unique(data[best]):
            subset = data[data[best] == value]
            if len(subset) == 0:
                # Defensive only — values come from data[best] itself, so the
                # subset cannot actually be empty; keep the majority-label
                # fallback for safety.
                tree[best][value] = data[self.target].mode()[0]
            else:
                tree[best][value] = self._id3(subset, remaining)
        return tree

    # ===========================
    # FIT MODEL
    # ===========================
    def fit(self, df, features, target):
        """Learn a decision tree from ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data.
        features : list[str]
            Categorical feature columns to split on.
        target : str
            Name of the label column.
        """
        self.target = target
        self.features = features
        self.tree = self._id3(df, features)

    # ===========================
    # PRINT TREE
    # ===========================
    def print_tree(self, tree=None, indent=""):
        """Pretty-print the learned tree (or ``tree``) to stdout.

        Leaves are prefixed with an arrow; branch values with box-drawing
        glyphs. ``indent`` accumulates across recursive calls.
        """
        if tree is None:
            tree = self.tree

        # Leaf node: print the label and stop.
        if not isinstance(tree, dict):
            print(indent + "→", tree)
            return

        for key in tree:
            print(indent + key)
            for value in tree[key]:
                print(indent + " ├─", value)
                self.print_tree(tree[key][value], indent + " │ ")

    # ===========================
    # SINGLE PREDICT
    # ===========================
    def _predict(self, tree, sample):
        """Classify one ``sample`` (mapping of feature -> value) with ``tree``.

        Returns the leaf label, or ``None`` when the sample carries a
        feature value the tree never saw during training.
        """
        if not isinstance(tree, dict):
            return tree

        root = list(tree.keys())[0]
        value = sample[root]

        # Unseen category: no branch to follow, signal with None.
        if value not in tree[root]:
            return None

        return self._predict(tree[root][value], sample)

    # ===========================
    # MULTIPLE PREDICT
    # ===========================
    def predict(self, df):
        """Classify every row of ``df``.

        Returns
        -------
        numpy.ndarray
            One predicted label per row (``None`` for unseen feature
            values), in row order.
        """
        preds = [self._predict(self.tree, row) for _, row in df.iterrows()]
        return np.array(preds)
|