c5tree 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- c5tree-0.1.0/PKG-INFO +201 -0
- c5tree-0.1.0/README.md +167 -0
- c5tree-0.1.0/c5tree/__init__.py +22 -0
- c5tree-0.1.0/c5tree/_classifier.py +469 -0
- c5tree-0.1.0/c5tree/_pruner.py +104 -0
- c5tree-0.1.0/c5tree/_splitter.py +206 -0
- c5tree-0.1.0/c5tree/_tree.py +48 -0
- c5tree-0.1.0/c5tree.egg-info/PKG-INFO +201 -0
- c5tree-0.1.0/c5tree.egg-info/SOURCES.txt +13 -0
- c5tree-0.1.0/c5tree.egg-info/dependency_links.txt +1 -0
- c5tree-0.1.0/c5tree.egg-info/requires.txt +14 -0
- c5tree-0.1.0/c5tree.egg-info/top_level.txt +1 -0
- c5tree-0.1.0/pyproject.toml +59 -0
- c5tree-0.1.0/setup.cfg +4 -0
- c5tree-0.1.0/tests/test_c5classifier.py +266 -0
c5tree-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: c5tree
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: C5.0 Decision Tree Classifier — a pure-Python, scikit-learn-compatible implementation
|
|
5
|
+
License: GPL-3.0-or-later
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/c5tree
|
|
7
|
+
Project-URL: Repository, https://github.com/yourusername/c5tree
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/yourusername/c5tree/issues
|
|
9
|
+
Keywords: machine learning,decision tree,C5.0,scikit-learn,classification
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: scikit-learn>=1.3
|
|
24
|
+
Requires-Dist: scipy>=1.10
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
28
|
+
Requires-Dist: pandas>=1.5; extra == "dev"
|
|
29
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
30
|
+
Provides-Extra: docs
|
|
31
|
+
Requires-Dist: sphinx; extra == "docs"
|
|
32
|
+
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
|
33
|
+
Requires-Dist: numpydoc; extra == "docs"
|
|
34
|
+
|
|
35
|
+
# c5tree 🌳
|
|
36
|
+
|
|
37
|
+
**C5.0 Decision Tree Classifier for Python** — a pure-Python, scikit-learn-compatible implementation of Ross Quinlan's C5.0 algorithm.
|
|
38
|
+
|
|
39
|
+
[![PyPI](https://img.shields.io/pypi/v/c5tree.svg)](https://pypi.org/project/c5tree/)
|
|
40
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/c5tree.svg)](https://pypi.org/project/c5tree/)
|
|
41
|
+
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Why C5.0?
|
|
46
|
+
|
|
47
|
+
scikit-learn's `DecisionTreeClassifier` uses CART (binary splits, Gini/entropy). C5.0 offers several advantages:
|
|
48
|
+
|
|
49
|
+
| Feature | CART (sklearn) | C5.0 (c5tree) |
|
|
50
|
+
|---|---|---|
|
|
51
|
+
| Split criterion | Gini / Entropy | **Gain Ratio** |
|
|
52
|
+
| Categorical splits | Binary only | **Multi-way** |
|
|
53
|
+
| Missing values | Requires imputation | **Native support** |
|
|
54
|
+
| Pruning | Cost-complexity | **Pessimistic Error Pruning** |
|
|
55
|
+
| Tree size | Often larger | **Smaller, more interpretable** |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install c5tree
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Quick Start
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from c5tree import C5Classifier
|
|
71
|
+
from sklearn.datasets import load_iris
|
|
72
|
+
from sklearn.model_selection import train_test_split
|
|
73
|
+
|
|
74
|
+
X, y = load_iris(return_X_y=True)
|
|
75
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
76
|
+
|
|
77
|
+
clf = C5Classifier(pruning=True, cf=0.25)
|
|
78
|
+
clf.fit(X_train, y_train)
|
|
79
|
+
|
|
80
|
+
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")
|
|
81
|
+
print(f"Tree depth: {clf.get_depth()}")
|
|
82
|
+
print(f"Leaves: {clf.get_n_leaves()}")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Key Features
|
|
88
|
+
|
|
89
|
+
### Gain Ratio Splitting
|
|
90
|
+
Corrects ID3's bias toward features with many distinct values by normalising information gain by split information.
|
|
91
|
+
|
|
92
|
+
### Native Missing Value Handling
|
|
93
|
+
No imputation needed. Missing instances are distributed fractionally across branches, weighted by the proportion of known instances going each way.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import numpy as np
|
|
97
|
+
|
|
98
|
+
X_with_missing = X.copy().astype(float)
|
|
99
|
+
X_with_missing[0, 2] = np.nan # inject a missing value
|
|
100
|
+
|
|
101
|
+
clf.fit(X_with_missing, y) # works out of the box
|
|
102
|
+
clf.predict(X_with_missing) # also works
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Categorical Feature Support
|
|
106
|
+
|
|
107
|
+
Pass a pandas DataFrame and `c5tree` automatically detects object/category columns and applies multi-way splits:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
import pandas as pd
|
|
111
|
+
|
|
112
|
+
df = pd.DataFrame({
|
|
113
|
+
"outlook": ["sunny", "overcast", "rainy", "sunny", "rainy"],
|
|
114
|
+
"humidity": [85, 65, 70, 95, 80],
|
|
115
|
+
"play": [0, 1, 1, 0, 1],
|
|
116
|
+
})
|
|
117
|
+
X = df[["outlook", "humidity"]]
|
|
118
|
+
y = df["play"]
|
|
119
|
+
|
|
120
|
+
clf = C5Classifier(pruning=False).fit(X, y)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Pessimistic Error Pruning
|
|
124
|
+
After the tree is grown, subtrees are replaced by leaves if the pessimistic error estimate of the subtree is no better than a single leaf. Controlled by the `cf` parameter.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# cf=0.25 → default, moderate pruning
|
|
128
|
+
# cf=0.05 → aggressive pruning, very small tree
|
|
129
|
+
# cf=0.50 → light pruning, larger tree
|
|
130
|
+
clf = C5Classifier(pruning=True, cf=0.05)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Human-Readable Tree
|
|
134
|
+
```python
|
|
135
|
+
print(clf.text_report())
|
|
136
|
+
# [feature_2 <= 1.9000]
|
|
137
|
+
# left:
|
|
138
|
+
# → Predict: 0 [0: 1.00, 1: 0.00, 2: 0.00] (n=40.0)
|
|
139
|
+
# right:
|
|
140
|
+
# [feature_3 <= 1.7500]
|
|
141
|
+
# ...
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Full sklearn Compatibility
|
|
145
|
+
Works in Pipelines, `GridSearchCV`, `cross_val_score`, and `clone`:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from sklearn.pipeline import Pipeline
|
|
149
|
+
from sklearn.preprocessing import StandardScaler
|
|
150
|
+
from sklearn.model_selection import GridSearchCV
|
|
151
|
+
|
|
152
|
+
pipe = Pipeline([
|
|
153
|
+
("scaler", StandardScaler()),
|
|
154
|
+
("clf", C5Classifier()),
|
|
155
|
+
])
|
|
156
|
+
|
|
157
|
+
param_grid = {"clf__cf": [0.05, 0.25, 0.50], "clf__max_depth": [None, 5, 10]}
|
|
158
|
+
search = GridSearchCV(pipe, param_grid, cv=5)
|
|
159
|
+
search.fit(X_train, y_train)
|
|
160
|
+
print(search.best_params_)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Parameters
|
|
166
|
+
|
|
167
|
+
| Parameter | Type | Default | Description |
|
|
168
|
+
|---|---|---|---|
|
|
169
|
+
| `max_depth` | int or None | None | Maximum tree depth |
|
|
170
|
+
| `min_samples_split` | int | 2 | Minimum samples to split a node |
|
|
171
|
+
| `min_samples_leaf` | int | 1 | Minimum samples at a leaf |
|
|
172
|
+
| `pruning` | bool | True | Enable pessimistic error pruning |
|
|
173
|
+
| `cf` | float | 0.25 | Confidence factor for pruning (0.05–0.50) |
|
|
174
|
+
| `min_gain_ratio` | float | 0.0 | Minimum gain ratio to make a split |
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Running Tests
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
pip install "c5tree[dev]"
|
|
182
|
+
pytest tests/ -v --cov=c5tree
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Background
|
|
188
|
+
|
|
189
|
+
C5.0 is the successor to C4.5 and ID3, developed by Ross Quinlan. The algorithm was open-sourced in 2011 under the GPL licence. This package is a clean pure-Python reimplementation, making C5.0 accessible to the Python data-science ecosystem for the first time as a proper scikit-learn-compatible estimator.
|
|
190
|
+
|
|
191
|
+
**Reference:** Quinlan, J.R. (1993). *C4.5: Programs for Machine Learning*. Morgan Kaufmann.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Contributing
|
|
196
|
+
|
|
197
|
+
Contributions are very welcome! Please open an issue before submitting a PR.
|
|
198
|
+
|
|
199
|
+
## Licence
|
|
200
|
+
|
|
201
|
+
GNU General Public License v3.0 — see [LICENSE](LICENSE).
|
c5tree-0.1.0/README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# c5tree 🌳
|
|
2
|
+
|
|
3
|
+
**C5.0 Decision Tree Classifier for Python** — a pure-Python, scikit-learn-compatible implementation of Ross Quinlan's C5.0 algorithm.
|
|
4
|
+
|
|
5
|
+
[![PyPI](https://img.shields.io/pypi/v/c5tree.svg)](https://pypi.org/project/c5tree/)
|
|
6
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/c5tree.svg)](https://pypi.org/project/c5tree/)
|
|
7
|
+
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Why C5.0?
|
|
12
|
+
|
|
13
|
+
scikit-learn's `DecisionTreeClassifier` uses CART (binary splits, Gini/entropy). C5.0 offers several advantages:
|
|
14
|
+
|
|
15
|
+
| Feature | CART (sklearn) | C5.0 (c5tree) |
|
|
16
|
+
|---|---|---|
|
|
17
|
+
| Split criterion | Gini / Entropy | **Gain Ratio** |
|
|
18
|
+
| Categorical splits | Binary only | **Multi-way** |
|
|
19
|
+
| Missing values | Requires imputation | **Native support** |
|
|
20
|
+
| Pruning | Cost-complexity | **Pessimistic Error Pruning** |
|
|
21
|
+
| Tree size | Often larger | **Smaller, more interpretable** |
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install c5tree
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from c5tree import C5Classifier
|
|
37
|
+
from sklearn.datasets import load_iris
|
|
38
|
+
from sklearn.model_selection import train_test_split
|
|
39
|
+
|
|
40
|
+
X, y = load_iris(return_X_y=True)
|
|
41
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
42
|
+
|
|
43
|
+
clf = C5Classifier(pruning=True, cf=0.25)
|
|
44
|
+
clf.fit(X_train, y_train)
|
|
45
|
+
|
|
46
|
+
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")
|
|
47
|
+
print(f"Tree depth: {clf.get_depth()}")
|
|
48
|
+
print(f"Leaves: {clf.get_n_leaves()}")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Key Features
|
|
54
|
+
|
|
55
|
+
### Gain Ratio Splitting
|
|
56
|
+
Corrects ID3's bias toward features with many distinct values by normalising information gain by split information.
|
|
57
|
+
|
|
58
|
+
### Native Missing Value Handling
|
|
59
|
+
No imputation needed. Missing instances are distributed fractionally across branches, weighted by the proportion of known instances going each way.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import numpy as np
|
|
63
|
+
|
|
64
|
+
X_with_missing = X.copy().astype(float)
|
|
65
|
+
X_with_missing[0, 2] = np.nan # inject a missing value
|
|
66
|
+
|
|
67
|
+
clf.fit(X_with_missing, y) # works out of the box
|
|
68
|
+
clf.predict(X_with_missing) # also works
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Categorical Feature Support
|
|
72
|
+
|
|
73
|
+
Pass a pandas DataFrame and `c5tree` automatically detects object/category columns and applies multi-way splits:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import pandas as pd
|
|
77
|
+
|
|
78
|
+
df = pd.DataFrame({
|
|
79
|
+
"outlook": ["sunny", "overcast", "rainy", "sunny", "rainy"],
|
|
80
|
+
"humidity": [85, 65, 70, 95, 80],
|
|
81
|
+
"play": [0, 1, 1, 0, 1],
|
|
82
|
+
})
|
|
83
|
+
X = df[["outlook", "humidity"]]
|
|
84
|
+
y = df["play"]
|
|
85
|
+
|
|
86
|
+
clf = C5Classifier(pruning=False).fit(X, y)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Pessimistic Error Pruning
|
|
90
|
+
After the tree is grown, subtrees are replaced by leaves if the pessimistic error estimate of the subtree is no better than a single leaf. Controlled by the `cf` parameter.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# cf=0.25 → default, moderate pruning
|
|
94
|
+
# cf=0.05 → aggressive pruning, very small tree
|
|
95
|
+
# cf=0.50 → light pruning, larger tree
|
|
96
|
+
clf = C5Classifier(pruning=True, cf=0.05)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Human-Readable Tree
|
|
100
|
+
```python
|
|
101
|
+
print(clf.text_report())
|
|
102
|
+
# [feature_2 <= 1.9000]
|
|
103
|
+
# left:
|
|
104
|
+
# → Predict: 0 [0: 1.00, 1: 0.00, 2: 0.00] (n=40.0)
|
|
105
|
+
# right:
|
|
106
|
+
# [feature_3 <= 1.7500]
|
|
107
|
+
# ...
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Full sklearn Compatibility
|
|
111
|
+
Works in Pipelines, `GridSearchCV`, `cross_val_score`, and `clone`:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from sklearn.pipeline import Pipeline
|
|
115
|
+
from sklearn.preprocessing import StandardScaler
|
|
116
|
+
from sklearn.model_selection import GridSearchCV
|
|
117
|
+
|
|
118
|
+
pipe = Pipeline([
|
|
119
|
+
("scaler", StandardScaler()),
|
|
120
|
+
("clf", C5Classifier()),
|
|
121
|
+
])
|
|
122
|
+
|
|
123
|
+
param_grid = {"clf__cf": [0.05, 0.25, 0.50], "clf__max_depth": [None, 5, 10]}
|
|
124
|
+
search = GridSearchCV(pipe, param_grid, cv=5)
|
|
125
|
+
search.fit(X_train, y_train)
|
|
126
|
+
print(search.best_params_)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Parameters
|
|
132
|
+
|
|
133
|
+
| Parameter | Type | Default | Description |
|
|
134
|
+
|---|---|---|---|
|
|
135
|
+
| `max_depth` | int or None | None | Maximum tree depth |
|
|
136
|
+
| `min_samples_split` | int | 2 | Minimum samples to split a node |
|
|
137
|
+
| `min_samples_leaf` | int | 1 | Minimum samples at a leaf |
|
|
138
|
+
| `pruning` | bool | True | Enable pessimistic error pruning |
|
|
139
|
+
| `cf` | float | 0.25 | Confidence factor for pruning (0.05–0.50) |
|
|
140
|
+
| `min_gain_ratio` | float | 0.0 | Minimum gain ratio to make a split |
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Running Tests
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
pip install "c5tree[dev]"
|
|
148
|
+
pytest tests/ -v --cov=c5tree
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Background
|
|
154
|
+
|
|
155
|
+
C5.0 is the successor to C4.5 and ID3, developed by Ross Quinlan. The algorithm was open-sourced in 2011 under the GPL licence. This package is a clean pure-Python reimplementation, making C5.0 accessible to the Python data-science ecosystem for the first time as a proper scikit-learn-compatible estimator.
|
|
156
|
+
|
|
157
|
+
**Reference:** Quinlan, J.R. (1993). *C4.5: Programs for Machine Learning*. Morgan Kaufmann.
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Contributing
|
|
162
|
+
|
|
163
|
+
Contributions are very welcome! Please open an issue before submitting a PR.
|
|
164
|
+
|
|
165
|
+
## Licence
|
|
166
|
+
|
|
167
|
+
GNU General Public License v3.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
c5tree — C5.0 Decision Tree Classifier for Python
|
|
3
|
+
===================================================
|
|
4
|
+
|
|
5
|
+
A pure-Python, scikit-learn-compatible implementation of Ross Quinlan's
|
|
6
|
+
C5.0 decision tree algorithm.
|
|
7
|
+
|
|
8
|
+
Quick start
|
|
9
|
+
-----------
|
|
10
|
+
>>> from c5tree import C5Classifier
|
|
11
|
+
>>> from sklearn.datasets import load_iris
|
|
12
|
+
>>> X, y = load_iris(return_X_y=True)
|
|
13
|
+
>>> clf = C5Classifier(pruning=True, cf=0.25).fit(X, y)
|
|
14
|
+
>>> clf.predict(X[:3])
|
|
15
|
+
array([0, 0, 0])
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from ._classifier import C5Classifier
|
|
19
|
+
|
|
20
|
+
__all__ = ["C5Classifier"]
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
__author__ = "c5tree contributors"
|