ezr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ezr-0.1.0/LICENSE +24 -0
- ezr-0.1.0/PKG-INFO +13 -0
- ezr-0.1.0/README.md +23 -0
- ezr-0.1.0/ezr.egg-info/PKG-INFO +13 -0
- ezr-0.1.0/ezr.egg-info/SOURCES.txt +9 -0
- ezr-0.1.0/ezr.egg-info/dependency_links.txt +1 -0
- ezr-0.1.0/ezr.egg-info/entry_points.txt +2 -0
- ezr-0.1.0/ezr.egg-info/top_level.txt +1 -0
- ezr-0.1.0/ezr.py +509 -0
- ezr-0.1.0/setup.cfg +4 -0
- ezr-0.1.0/setup.py +24 -0
ezr-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
BSD 2-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024, Tim Menzies
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
16
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
17
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
18
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
19
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
20
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
21
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
22
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
23
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
24
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
ezr-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ezr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semi-supervised explanations for incremental multi-objective optimization
|
|
5
|
+
Home-page: https://github.com/timm/ezr
|
|
6
|
+
Author: Tim Menzies
|
|
7
|
+
Author-email: timm@ieee.org
|
|
8
|
+
License: BSD2
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
License-File: LICENSE
|
ezr-0.1.0/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# ezr.py
|
|
2
|
+
|
|
3
|
+
Explanation system for semi-supervised multi-objective optimization.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Install
|
|
7
|
+
|
|
8
|
+
Download ezr.py.
|
|
9
|
+
|
|
10
|
+
Test:
|
|
11
|
+
|
|
12
|
+
./ezr.py -h
|
|
13
|
+
|
|
14
|
+
## Run
|
|
15
|
+
|
|
16
|
+
Find some csv data where the first row names the columns
|
|
17
|
+
|
|
18
|
+
- Names starting with an uppercase letter denote numerics (all others are symbolic)
|
|
19
|
+
- Names ending in "+" or "-" are goals to be maximized or minimized (respectively).
|
|
20
|
+
- Names ending in "!" show the klass column (there can only be one).
|
|
21
|
+
|
|
22
|
+
For examples, see the [/data](https://github.com/timm/ezr/tree/main/data)
|
|
23
|
+
directory.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ezr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semi-supervised explanations for incremental multi-objective optimization
|
|
5
|
+
Home-page: https://github.com/timm/ezr
|
|
6
|
+
Author: Tim Menzies
|
|
7
|
+
Author-email: timm@ieee.org
|
|
8
|
+
License: BSD2
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
License-File: LICENSE
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ezr
|
ezr-0.1.0/ezr.py
ADDED
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# MARK: help
|
|
3
|
+
"""
|
|
4
|
+
ez.py: Active learning, find best/rest seen so far in a Bayes classifier
|
|
5
|
+
(c) 2024 Tim Menzies <timm@ieee.org>, BSD-2 license
|
|
6
|
+
|
|
7
|
+
OPTIONS:
|
|
8
|
+
-s --seed random number seed = 1234567891
|
|
9
|
+
-g --go start up action = help
|
|
10
|
+
-f --file data file = ../data/auto93.csv
|
|
11
|
+
|
|
12
|
+
Discretize:
|
|
13
|
+
-B --Bins max number of bins = 16
|
|
14
|
+
|
|
15
|
+
Classify:
|
|
16
|
+
-k --k low frequency kludge = 1
|
|
17
|
+
-m --m low frequency kludge = 2
|
|
18
|
+
|
|
19
|
+
Optimize:
|
|
20
|
+
-n --budget0 init evals = 4
|
|
21
|
+
-N --Budget max evals = 16
|
|
22
|
+
-b --best ratio of top = .5
|
|
23
|
+
-T --Top keep top todos = .8
|
|
24
|
+
|
|
25
|
+
Explain:
|
|
26
|
+
-l --leaf leaf size = 2 """
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations # <1> ## types
|
|
29
|
+
import sys
|
|
30
|
+
sys.dont_write_bytecode = True
|
|
31
|
+
from collections import Counter
|
|
32
|
+
import re,ast,copy,json,math,random
|
|
33
|
+
from typing import Any,Iterable,Callable
|
|
34
|
+
from fileinput import FileInput as file_or_stdin
|
|
35
|
+
|
|
36
|
+
# ----------------------------------------------------------------------------------------
|
|
37
|
+
# MARK: inits
|
|
38
|
+
|
|
39
|
+
# Some globals
|
|
40
|
+
# Sentinel magnitudes: `big` stands in for "infinity" in range bounds and
# `tiny` (its reciprocal) is added to denominators to avoid division by zero.
big = 1E32
tiny = 1/big
|
|
42
|
+
|
|
43
|
+
# Special type annotations
|
|
44
|
+
class Row:
    # Annotation-only placeholder used in type hints: a list of cell values.
    has: list[Any]
|
|
45
|
+
class Rows:
    # Annotation-only placeholder used in type hints: a list of Row.
    has: list[Row]
|
|
46
|
+
class Classes:
    # Annotation-only placeholder used in type hints:
    # a dictionary, one key for each class
    has: dict[str, Rows]
|
|
47
|
+
|
|
48
|
+
# Simple base object: defines simple initialization and pretty print.
|
|
49
|
+
|
|
50
|
+
class OBJ:
    """Simple base object: keyword-driven initialization and pretty print."""

    def __init__(i, **d):
        # Every keyword argument becomes an instance attribute.
        i.__dict__.update(d)

    def __repr__(i) -> str:
        # Class name followed by the instance attributes as rendered by `show`.
        return i.__class__.__name__ + show(i.__dict__)
|
|
53
|
+
|
|
54
|
+
def settings(s: str) -> dict:
    """Extract option defaults from a help string.

    Each `--name ... = value` occurrence in `s` contributes one entry mapping
    `name` to `coerce(value)`; `value` is the first run of non-space
    characters after the `=`.
    """
    return {m[1]: coerce(m[2]) for m in re.finditer(r"--(\w+)[^=]*=\s*(\S+)", s)}
|
|
56
|
+
|
|
57
|
+
# ----------------------------------------------------------------------------------------
|
|
58
|
+
# ## Classes
|
|
59
|
+
|
|
60
|
+
# MARK: BIN
|
|
61
|
+
# Stores in `ys` the klass symbols seen between `lo` and `hi`.
|
|
62
|
+
#
|
|
63
|
+
# [1] `merge()` combines two BINs, if they are too small or they have similar distributions.
|
|
64
|
+
# [2] `selects()` returns true when a BIN matches a row.
|
|
65
|
+
# [3] `BIN.score()` reports how often we see `goals` symbols more than other symbols.
|
|
66
|
+
#
|
|
67
|
+
# To build decision trees, split Rows on the best scoring bin, then recurse on each half.
|
|
68
|
+
|
|
69
|
+
#ZZZ add in __repr__
|
|
70
|
+
class BIN(OBJ):
    """Counts, in `ys`, the class symbols seen for column `at` between `lo` and `hi`."""

    id = 0  # class-wide counter; every new BIN takes the next value

    def __init__(i, at: int, txt: str, lo: float, hi: float = None, ys: Counter = None):
        """Create a bin for column index `at` (named `txt`) starting at `lo`.

        `hi` defaults to `lo`; `ys` defaults to a fresh, empty Counter.
        """
        i.at, i.txt, i.lo = at, txt, lo
        # Compare against None rather than using `hi or lo`: an upper bound of
        # 0 is falsy and used to be silently replaced by `lo`.
        i.hi = lo if hi is None else hi
        i.ys = ys or Counter()
        i.id = BIN.id = BIN.id + 1

    def add(i, x: float, y: Any):
        """Widen `lo..hi` to include `x` and count one more sighting of class `y`."""
        i.lo = min(x, i.lo)
        i.hi = max(x, i.hi)
        i.ys[y] += 1

    def __repr__(i):
        """Render the bin as a condition on its column name."""
        if i.lo == i.hi: return f"{i.txt}={i.hi}"
        if i.lo == -big: return f"{i.txt} < {i.hi}"
        if i.hi == big: return f"{i.txt} >= {i.lo}"
        return f"{i.lo} <= {i.txt} < {i.hi}"

    def merge(i, j: BIN, small: float) -> BIN:  # or None if nothing merged ----[1]
        """Return the union of `i` and `j`, or None when they should stay apart.

        Only bins on the same column are considered. The union is returned
        when either part holds fewer than `small` items, or when the union's
        entropy does not exceed the size-weighted entropy of the parts.
        """
        if i.at == j.at:
            k = BIN(i.at, i.txt, min(i.lo, j.lo), hi=max(i.hi, j.hi), ys=i.ys + j.ys)
            ei, ni = entropy(i.ys)
            ej, nj = entropy(j.ys)
            ek, nk = entropy(k.ys)
            if ni < small or nj < small: return k     # merge if bins too small
            if ek <= (ni*ei + nj*ej)/nk: return k     # merge if parts are more complex

    def selects(i, row: Row) -> bool:  # ---------------------------------------[2]
        """True when `row`'s value in this column is "?" or falls inside the bin."""
        x = row[i.at]
        return x == "?" or i.lo == x == i.hi or i.lo <= x < i.hi

    def selectsRejects(i, classes: Classes) -> tuple[Classes, Classes]:
        """Split every class's rows into those this bin selects and those it rejects.

        Both returned dictionaries carry every key of `classes`.
        """
        yes = {k: [] for k in classes}
        no = {k: [] for k in classes}
        for k, rows in classes.items():
            for row in rows:
                (yes if i.selects(row) else no)[k].append(row)
        return yes, no
|
|
106
|
+
|
|
107
|
+
# MARK: COL
|
|
108
|
+
# is an abstract class above NUM and SYM.
|
|
109
|
+
#
|
|
110
|
+
# - `bins()` reports how col values are spread over a list of BINs.
|
|
111
|
+
|
|
112
|
+
class COL(OBJ):
    """Abstract column summary above NUM and SYM.

    Subclasses supply `bin(x)` and `binsComplete(bins, small=...)`.
    """

    def __init__(i, at: int = 0, txt: str = " "):
        # `n` counts the items seen; `at` is the column index; `txt` its name.
        i.n, i.at, i.txt = 0, at, txt

    def bins(i, classes: Classes, small=None) -> list[BIN]:
        """Report how this column's values are spread over a list of BINs.

        `classes` maps a class label to its rows. "?" cells are skipped.
        `small` is passed to `binsComplete`; when falsy it defaults to the
        total number of rows divided by `the.Bins`.
        """
        out = {}

        def send2bin(x, y):
            # One BIN per distinct key returned by the subclass's `bin()`.
            k = i.bin(x)
            if k not in out: out[k] = BIN(i.at, i.txt, x)
            out[k].add(x, y)

        for y, lst in classes.items():
            for row in lst:
                if row[i.at] != "?":
                    send2bin(row[i.at], y)
        return i.binsComplete(
            sorted(out.values(), key=lambda z: z.lo),
            small=small or (sum(len(lst) for lst in classes.values())/the.Bins))
|
|
124
|
+
|
|
125
|
+
# MARK: SYM
|
|
126
|
+
# summarizes a stream of symbols.
|
|
127
|
+
#
|
|
128
|
+
# - the `div()`ersity of a SYM summary is the `entropy`;
|
|
129
|
+
# - the `mid()`dle of a SYM summary is the mode value;
|
|
130
|
+
# - `like()` returns the likelihood of a value belongs in a SYM distribution;
|
|
131
|
+
# - `bin()` and `binsComplete()` are used for generating BINs (for SYMs there is not much to do with BINs)
|
|
132
|
+
|
|
133
|
+
class SYM(COL):
    """Summarizes a stream of symbols as per-value counts kept in `has`."""

    def __init__(i, **kw):
        super().__init__(**kw)
        i.has = {}

    def add(i, x: Any):
        """Count one more sighting of `x`; "?" cells are ignored."""
        if x != "?":
            i.n += 1
            i.has[x] = i.has.get(x, 0) + 1

    def bin(i, x: Any) -> Any:
        """A symbol is its own bin key."""
        return x

    def binsComplete(i, bins: list[BIN], **_) -> list[BIN]:
        """Symbolic bins need no post-processing; extra keywords are ignored."""
        return bins

    def div(i) -> float:
        """Diversity: the entropy of the counts.

        `entropy` returns an `(entropy, total)` pair; only the entropy is
        reported so the result is a float, as declared and as `NUM.div` does.
        """
        return entropy(i.has)[0]

    def mid(i) -> Any:
        """Middle: the mode, i.e. the most frequently seen value."""
        return max(i.has, key=i.has.get)

    def like(i, x: Any, prior: float) -> float:
        """Likelihood of `x` under these counts, smoothed by `the.m` and `prior`."""
        return (i.has.get(x, 0) + the.m*prior) / (i.n + the.m)
|
|
148
|
+
|
|
149
|
+
# MARK: NUM
|
|
150
|
+
# summarizes a stream of numbers.
|
|
151
|
+
#
|
|
152
|
+
# - the `div()`ersity of a NUM summary is the standard deviation;
|
|
153
|
+
# - the `mid()`dle of a NUM summary is the mean value;
|
|
154
|
+
# - `like()` returns the likelihood of a value belongs in a NUM distribution;
|
|
155
|
+
# - `bin(n)` places `n` in one equal width bin (spread from `lo` to `hi`)
|
|
156
|
+
# - `binsComplete(bins)` tries to merge numeric bins
|
|
157
|
+
# - `d2h(n)` reports how far `n` is from `heaven` (which is 0 when minimizing, 1 otherwise)
|
|
158
|
+
# - `norm(n)` maps `n` into 0..1 (min..max)
|
|
159
|
+
|
|
160
|
+
class NUM(COL):
    """Summarizes a stream of numbers (count, mean, spread, `lo`, `hi`)."""

    def __init__(i, **kw):
        super().__init__(**kw)
        i.mu, i.m2, i.lo, i.hi = 0, 0, big, -big
        # Names ending in "-" are minimized (heaven is 0); all others use 1.
        i.heaven = 0 if i.txt[-1] == "-" else 1

    def add(i, x: Any):
        """Fold `x` into the running mean, squared-deviation sum and range; skip "?"."""
        if x != "?":
            i.n += 1
            d = x - i.mu
            i.mu += d/i.n
            i.m2 += d * (x - i.mu)
            i.lo = min(x, i.lo)
            i.hi = max(x, i.hi)

    def bin(i, x: float) -> int:
        """Index of the equal-width bin (0 .. the.Bins-1) holding `x`."""
        return min(the.Bins - 1, int(the.Bins * i.norm(x)))

    def binsComplete(i, bins: list[BIN], small=2) -> list[BIN]:
        """Merge adjacent bins where `BIN.merge` allows, then close the gaps.

        After merging, the first bin starts at `-big`, the last ends at `big`,
        and each bin starts where the previous one ends.
        """
        bins = merges(bins, merge=lambda x, y: x.merge(y, small))
        if not bins: return bins  # nothing to stretch, e.g. a column of only "?"
        bins[0].lo = -big
        bins[-1].hi = big
        for j in range(1, len(bins)): bins[j].lo = bins[j-1].hi
        return bins

    def d2h(i, x: float) -> float:
        """Distance of the normalized `x` from this column's `heaven`."""
        return abs(i.norm(x) - i.heaven)

    def norm(i, x: float) -> float:
        """Map `x` into 0..1 (lo..hi); "?" is returned unchanged."""
        return x if x == "?" else (x - i.lo) / (i.hi - i.lo + tiny)

    def div(i) -> float:
        """Diversity: the sample standard deviation (0 with fewer than 2 items)."""
        return 0 if i.n < 2 else (i.m2 / (i.n - 1))**.5

    def mid(i) -> float:
        """Middle: the mean."""
        return i.mu

    def like(i, x: float, _) -> float:
        """Likelihood of `x` under a normal curve with this mean and spread.

        The second argument is unused. The result is capped at 1, and `tiny`
        keeps the variance and denominator away from zero.
        """
        v = i.div()**2 + tiny
        nom = math.e**(-1*(x - i.mu)**2/(2*v)) + tiny
        denom = (2*math.pi*v)**.5
        return min(1, nom/(denom + tiny))
|
|
196
|
+
|
|
197
|
+
# MARK: COLS
|
|
198
|
+
# is a factory for building and storing COLs from column names. All columns are in `all`.
|
|
199
|
+
# References to the independent and dependent variables are in `x` and `y` (respectively).
|
|
200
|
+
# If there is a klass, that is referenced in `klass`. And all the names are stored in `names`.
|
|
201
|
+
|
|
202
|
+
class COLS(OBJ):
    """Factory that builds and stores one COL per column name.

    `all` holds every column, `x` and `y` reference the independent and
    dependent ones, `klass` the class column (if any), and `names` the
    original header.
    """

    def __init__(i, names: list[str]):
        i.x, i.y, i.all, i.names, i.klass = [], [], [], names, None
        for at, txt in enumerate(names):
            a, z = txt[0], txt[-1]
            # A leading uppercase letter marks a numeric column.
            col = (NUM if a.isupper() else SYM)(at=at, txt=txt)
            i.all.append(col)
            if z != "X":  # names ending in "X" are kept in `all` only
                (i.y if z in "!+-" else i.x).append(col)
                if z == "!": i.klass = col

    def add(i, row: Row) -> Row:
        """Update every column summary from `row` (skipping "?") and return `row`."""
        for col in i.all:
            if row[col.at] != "?":
                col.add(row[col.at])
        return row
|
|
216
|
+
|
|
217
|
+
# MARK: DATA
|
|
218
|
+
# stores `rows`, summarized into `cols`. Optionally, `rows` can be sorted by distance to
|
|
219
|
+
# heaven (`d2h()`). A `clone()` is a new `DATA` of the same structure. Can compute
|
|
220
|
+
# `loglike()`lihood of a `Row` belonging to this `DATA`.
|
|
221
|
+
|
|
222
|
+
class DATA(OBJ):
    """Stores `rows`, summarized into `cols`."""

    def __init__(i, src: Iterable[Row] = (), order=False, fun=None):
        """Load rows from `src`, whose first item is the list of column names.

        `fun`, when given, is called as `fun(data, row)` before each data row
        is added. With `order` set, rows are sorted by distance to heaven.
        `src` used to default to the typing object itself (`src=Iterable[Row]`);
        it is now annotated and defaults to an empty source.
        """
        i.rows, i.cols = [], None
        for lst in src:
            i.add(lst, fun)
        if order: i.order()

    def add(i, row: Row, fun: Callable = None):
        """Treat the first row as the header; summarize and store the rest."""
        if i.cols:
            if fun: fun(i, row)
            i.rows += [i.cols.add(row)]
        else:
            i.cols = COLS(row)

    def clone(i, lst: Iterable[Row] = None, order=False) -> DATA:
        """Return a new DATA with the same column names, filled from `lst`."""
        # None rather than a shared mutable `[]` default; list() accepts any iterable.
        return DATA([i.cols.names] + list(lst or []), order=order)

    def stats(i, cols=None, what: str = None):
        """Map column name to `show`n `what()` ("mid" by default), for `cols` or the y columns."""
        return {col.txt: show(getattr(col, what or "mid")())
                for col in cols or i.cols.y}

    def order(i) -> Rows:
        """Sort `rows` by ascending distance to heaven and return them."""
        i.rows = sorted(i.rows, key=i.d2h, reverse=False)
        return i.rows

    def d2h(i, row: Row) -> float:
        """Root mean square of the per-goal distances to heaven for `row`."""
        d = sum(col.d2h(row[col.at])**2 for col in i.cols.y)
        return (d/len(i.cols.y))**.5

    def loglike(i, row: Row, nall: int, nh: int) -> float:
        """Log-likelihood of `row` belonging to this DATA.

        `nall` is the number of rows over all classes and `nh` the number of
        classes. "?" cells and non-positive likelihoods are left out of the sum.
        """
        prior = (len(i.rows) + the.k) / (nall + the.k*nh)
        likes = [c.like(row[c.at], prior) for c in i.cols.x if row[c.at] != "?"]
        return sum(math.log(x) for x in likes + [prior] if x > 0)
|
|
254
|
+
|
|
255
|
+
# MARK: smo
|
|
256
|
+
def smo(data0: DATA, score=lambda B, R: B - R) -> Row:
    """Evaluate a few rows at a time, guided by a best/rest classifier.

    Shuffles `data0.rows` in place, takes the first `the.budget0` rows as
    `done`, then up to `the.Budget` times: splits the sorted `done` rows into
    best and rest, ranks the remaining rows by `score(like-best, like-rest)`,
    moves the top-ranked row into `done` and keeps the top `the.Top` fraction
    of the others. Returns the first row of the final sorted `done` set.
    """
    def like(row, data, nall):
        return data.loglike(row, nall, 2)  # two classes: best and rest

    def acquire(best, rest, rows):
        # Sorts `rows` in place, most promising first, and drops the tail.
        nall = len(best.rows) + len(rest.rows)
        rows.sort(key=lambda r: -score(like(r, best, nall), like(r, rest, nall)))
        chop = int(len(rows) * the.Top)
        return rows[:chop]
    # -----------
    random.shuffle(data0.rows)
    done, todo = data0.rows[:the.budget0], data0.rows[the.budget0:]
    data1 = data0.clone(done, order=True)
    for _ in range(the.Budget):
        if len(todo) < 3: break
        n = int(len(done)**the.best + .5)  # size of the "best" slice
        top, *todo = acquire(data0.clone(data1.rows[:n]),
                             data0.clone(data1.rows[n:]),
                             todo)
        done.append(top)
        data1 = data0.clone(done, order=True)
    return data1.rows[0]
|
|
277
|
+
|
|
278
|
+
# MARK: CONTRAST
|
|
279
|
+
class CONTRAST(OBJ):
    """One node of a contrast tree.

    Instances are built with `isLeaf`, `lvl`, `yes` and `no`; non-leaf nodes
    also carry the splitting `bin`, and their `yes`/`no` are sub-trees.
    """

    def show(i):
        """Print the tree, one node per line, indented by depth."""
        for lvl, node in i.nodes():
            print("|.. " * lvl,
                  counts(node.yes) if node.isLeaf else node.bin)

    def nodes(i, lvl=0):
        """Yield `(depth, node)` for this node and then its sub-trees, `yes` before `no`."""
        yield lvl, i
        if not i.isLeaf:
            for x in [i.yes, i.no]:
                yield from x.nodes(lvl + 1)
|
|
289
|
+
|
|
290
|
+
class CONTRASTS(OBJ):
    """Grows a tree of CONTRAST nodes that separates `best` rows from the others."""

    def __init__(i, data: DATA, classes: Classes,
                 best: str = "best", rest: str = "rest", score=lambda B, R: B - R):
        """Collect the bins of every x column of `data`, then grow `root`.

        `classes` maps class labels to rows; `best` and `rest` name two of its keys.
        """
        i.bins = [bin for col in data.cols.x for bin in col.bins(classes)]
        i.best, i.score, i.bests, i.rests = best, score, len(classes[best]), len(classes[rest])
        print(counts(classes))
        i.root = i.grow(classes, 0, 1E30)

    def grow(i, classes: Classes, lvl: int, above: int) -> OBJ:
        """Recursively split `classes` on the highest-scoring bin.

        Stops with a leaf when at most `the.leaf` best rows remain, or when the
        number of best rows equals `above` (the parent's count).
        """
        myBest = len(classes[i.best])
        if myBest <= the.leaf or myBest == above:
            return CONTRAST(isLeaf=True, yes=classes, no={}, lvl=lvl)
        else:
            bin = max(i.bins, key=lambda bin: i.sorter(bin, classes))
            yes, no = bin.selectsRejects(classes)
            print(counts(yes), counts(no))
            return CONTRAST(isLeaf=False, lvl=lvl, bin=bin,
                            yes=i.grow(yes, lvl+1, myBest),
                            no=i.grow(no, lvl+1, myBest))

    def sorter(i, bin: BIN, classes: Classes) -> float:
        """Score `bin` by the fractions of best and of other rows it selects.

        Counts are divided by the best/rest totals recorded at construction.
        """
        b, r = 0, 0  # counts of best,rest
        for k, rows in classes.items():
            for row in rows:
                if bin.selects(row):
                    if k == i.best: b += 1
                    else: r += 1
        return i.score(b/(i.bests+tiny), r/(i.rests+tiny))
|
|
318
|
+
|
|
319
|
+
# MARK: NB
|
|
320
|
+
# Visitor object carried along by a DATA. Internally maintains its own `DATA` for rows
|
|
321
|
+
# from different class.
|
|
322
|
+
|
|
323
|
+
class NB(OBJ):
    """Visitor carried along by a DATA; keeps one DATA per class label seen."""

    def __init__(i):
        # `nall`: rows seen so far; `datas`: per-class DATA; `acc`: correct guesses.
        i.nall = 0
        i.datas: Classes = {}
        i.acc = 0

    def classify(i, data, row):
        """Return the class label whose DATA gives `row` the highest log-likelihood.

        `data` is accepted but not used.
        """
        return max(i.datas,
                   key=lambda k: i.datas[k].loglike(row, i.nall, len(i.datas)))

    def run(i, data: DATA, row: Row):
        """Test on `row` (after the first 10 rows), then train on it."""
        want = row[data.cols.klass.at]
        i.nall += 1
        if i.nall > 10:
            got = i.classify(data, row)
            i.acc += (want == got)
        if want not in i.datas: i.datas[want] = data.clone()
        i.datas[want].add(row)
|
|
338
|
+
|
|
339
|
+
#----------------------------------------------------------------------------------------
|
|
340
|
+
# MARK: misc functions
|
|
341
|
+
|
|
342
|
+
def shuffle(lst):
    # Reorder `lst` in place via random.shuffle, then hand the same list back.
    random.shuffle(lst)
    return lst
|
|
343
|
+
def counts(d):
    # Map every key of `d` to the length of its value.
    sizes = {}
    for key, items in d.items():
        sizes[key] = len(items)
    return sizes
|
|
344
|
+
def first(lst):
    # The item at index 0 of `lst`.
    head = lst[0]
    return head
|
|
345
|
+
|
|
346
|
+
# ### Data mining tricks
|
|
347
|
+
def entropy(d: dict) -> "tuple[float, int]":
    """Return `(entropy, N)` for a dictionary of counts.

    Only positive counts contribute; `N` is their sum and the entropy is
    measured in bits. Note that a pair is returned, not a bare float.
    """
    N = sum(n for n in d.values() if n > 0)
    return -sum(n/N*math.log(n/N, 2) for n in d.values() if n > 0), N
|
|
350
|
+
|
|
351
|
+
def merges(b4: "list[BIN]", merge: Callable) -> "list[BIN]":
    """Repeatedly combine adjacent items until no neighbours can be merged.

    `merge(x, y)` returns the combined item, or a falsy value when `x` and
    `y` must stay separate. Each pass walks left to right; the function
    recurses until a pass leaves the list length unchanged.
    """
    j, now = 0, []
    while j < len(b4):
        x = b4[j]
        if j < len(b4) - 1:
            y = b4[j+1]
            if xy := merge(x, y):
                x = xy
                j = j+1  # if i can merge, jump over the merged item
        now += [x]
        j += 1
    return b4 if len(now) == len(b4) else merges(now, merge)
|
|
363
|
+
|
|
364
|
+
# ### Strings to things
|
|
365
|
+
def coerce(s: str) -> Any:
    """Turn a string into the Python literal it spells, if it spells one.

    Anything `ast.literal_eval` rejects (e.g. a bare word or "?") is
    returned unchanged as a string.
    """
    try:
        return ast.literal_eval(s)  # <1>
    except Exception:
        return s
|
|
368
|
+
|
|
369
|
+
def csv(file=None) -> Iterable[Row]:
    """Yield each non-empty line of `file` as a list of coerced cells.

    `file` is passed to `fileinput.FileInput`. Before splitting on commas,
    the substitution below deletes whitespace, quote characters and
    everything from "#" to the end of the line.
    """
    with file_or_stdin(file) as src:
        for line in src:
            # NOTE(review): the pattern contains a typographic quote (’);
            # confirm whether a plain apostrophe was intended.
            line = re.sub(r'([\n\t\r"\’ ]|#.*)', '', line)
            if line: yield [coerce(s.strip()) for s in line.split(",")]
|
|
374
|
+
|
|
375
|
+
def cli(d: dict) -> None:
    """Update the settings dictionary `d` in place from `sys.argv`.

    A setting `k` is addressed by `--k` or by `-` plus its first letter. A
    setting whose current text is "true"/"false" is flipped; any other
    setting takes the following command-line argument, passed through
    `coerce`. If `d` ends up with a truthy "help" entry, the help text is
    printed and the process exits.
    """
    for k, v in d.items():
        v = str(v)
        for c, arg in enumerate(sys.argv):
            after = "" if c >= len(sys.argv) - 1 else sys.argv[c+1]
            if arg in ["-"+k[0], "--"+k]:
                v = "false" if v == "true" else ("true" if v == "false" else after)
                d[k] = coerce(v)
    # `sys.text` does not exist; `sys.exit` prints nothing extra here because
    # MAIN.help() returns None, so the process exits with status 0.
    if d.get("help", False): sys.exit(MAIN.help())
|
|
384
|
+
|
|
385
|
+
# ### Printing
|
|
386
|
+
def show(x: Any, n=3) -> Any:
    """Return a print-friendly version of `x`.

    - numbers: whole values are returned unchanged, others are rounded to `n`
      decimal places; non-finite floats (inf, nan) are returned unchanged;
    - lists and tuples: a list of the first 10 items, each shown recursively;
    - dicts: a `{:key value, ...}` string, sorted by key, leaving out keys
      that start with "_";
    - anything else: returned unchanged.
    """
    if isinstance(x, (int, float)):
        # int(inf) and int(nan) raise, so non-finite floats bypass the check.
        if isinstance(x, float) and not math.isfinite(x): return x
        return x if int(x) == x else round(x, n)
    if isinstance(x, (list, tuple)):
        return [show(y, n) for y in x[:10]]  # slice first: only 10 items are kept
    if isinstance(x, dict):
        return "{"+', '.join(f":{k} {show(v,n)}" for k, v in sorted(x.items()) if k[0] != "_")+"}"
    return x
|
|
392
|
+
|
|
393
|
+
def prints(matrix: list[list], sep=' | ') -> None:
    """Print `matrix` as right-aligned columns joined by `sep`.

    Every cell is converted with `str`; each column is as wide as its
    widest cell.
    """
    s = [[str(e) for e in row] for row in matrix]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = sep.join('{{:>{}}}'.format(x) for x in lens)
    for row in s:
        print(fmt.format(*row))
|
|
398
|
+
|
|
399
|
+
def asRed(pat, s):
    # Wrap whatever group 1 of `pat` matches in `s` with the 91m / 00m escape codes.
    template = r"\033[91m\1\033[00m"
    return re.sub(pat, template, s)
|
|
400
|
+
def asYellow(pat, s):
    # Wrap whatever group 1 of `pat` matches in `s` with the 93m / 00m escape codes.
    template = r"\033[93m\1\033[00m"
    return re.sub(pat, template, s)
|
|
401
|
+
|
|
402
|
+
#----------------------------------------------------------------------------------------
|
|
403
|
+
# MARK: main
|
|
404
|
+
# `./trees.py _all` : run all functions , return to operating system the count of failures.
|
|
405
|
+
# `MAIN.one()` : seed the random number generator, run one start-up action, then restore all options.
|
|
406
|
+
|
|
407
|
+
class MAIN:
    """Namespace of start-up actions; none of them takes `self`.

    Every public name other than `all`, `one` and `main` is an action that
    can be chosen with `--go`.
    """

    def main():
        """Entry point: read the command line into `the`, then run `the.go`.

        The command line is parsed unconditionally so that the installed
        console script (which imports this module, hence `__name__` is not
        "__main__") honours its arguments too.
        """
        global the
        cli(the.__dict__)
        MAIN.one(the.go)

    def one(s: str) -> Any:
        """Run the action named `s` with a seeded RNG, then restore the options.

        An unknown name prints an error line instead of raising.
        """
        global the
        cache = copy.deepcopy(the)
        random.seed(the.seed)
        out = getattr(MAIN, s, lambda: print(f"E> '{s}' unknown."))()
        the = cache  # undo any option changes made by the action
        return out

    def all() -> None:
        """Run every action; exit with the number of actions that returned False."""
        sys.exit(sum(MAIN.one(s) == False for s in sorted(dir(MAIN))
                     if s[0] != "_" and s not in ["all", "one", "main"]))

    def help():
        """Print the module help text with the option flags colorized."""
        print(asRed(r"(\n[\s]+-\S)", asYellow(r"( --[\S]+)", __doc__)))

    def opt():
        """Print the current options."""
        print(the)

    def header():
        """Print the columns built from a sample header."""
        top = ["Clndrs", "Volume", "HpX", "Model", "origin", "Lbs-", "Acc+", "Mpg+"]
        for col in COLS(top).all:
            print(col)

    def data():
        """Print the `mid` of the goal columns and the `div` of all columns of `the.file`."""
        d = DATA(csv(the.file))
        print("mid", d.stats())
        print("div", d.stats(cols=d.cols.all, what="div"))

    def rows():
        """Print every 50th sorted log-likelihood, for unsorted and sorted copies of the data."""
        d1 = DATA(csv(the.file))
        d2 = d1.clone(d1.rows, order=True)
        for d in [d1, d2]:
            print(sorted(show(d.loglike(r, len(d.rows), 1)) for r in d.rows)[::50])

    def nbayes():
        """Run the incremental Bayes classifier over soybean.csv and print its accuracy."""
        the.file = "../data/soybean.csv"
        the.m, the.k = 1, 0
        nb = NB()
        d = DATA(csv(the.file), order=False,
                 fun=nb.run)
        print(show(nb.acc/len(d.rows)))

    def bore():
        """Print every 50th row of the data, sorted by distance to heaven."""
        d = DATA(csv(the.file), order=True)
        print("")
        prints([d.cols.names] + [r for r in d.rows[::50]])

    def bore2():
        """Print, per x column, the bins separating the first and last sqrt(N) sorted rows."""
        d = DATA(csv(the.file), order=True)
        n = int(len(d.rows)**.5)
        for col in d.cols.x:
            print("")
            for bin in col.bins(dict(best=d.rows[:n], rest=d.rows[-n:])):
                print(bin, sep="\t")

    def contrasts():
        """Grow and print a contrast tree: first sqrt(N) sorted rows vs a random sample of the others."""
        d = DATA(csv(the.file), order=True)
        n = int(len(d.rows)**.5)
        best = d.rows[:n]
        rest = shuffle(d.rows[n:])[-n:]

        tree = CONTRASTS(d, dict(best=best, rest=rest)).root
        tree.show()
        # print(json.dumps(tree, indent=2))

    def guess():
        """Baseline: 20 times, keep the best of `budget` randomly chosen rows; report mu and sd."""
        budget = 20
        d = DATA(csv(the.file), order=True)
        asIs, toBe = NUM(), NUM()
        for row in d.rows:
            asIs.add(d.d2h(row))
        for _ in range(20):
            tmp = [random.choice(d.rows) for _ in range(budget)]
            toBe.add(d.d2h(sorted(tmp, key=lambda r: d.d2h(r))[0]))
        # NOTE(review): assumed to run once, after the loop.
        print(show(dict(budget=budget,
                        mu=dict(asIs=asIs.mid(), guess=toBe.mid()),
                        sd=dict(asIs=asIs.div(), guess=toBe.div()))))

    def smo():
        """Run the module-level `smo` once and print the distance to heaven of its result."""
        d = DATA(csv(the.file))
        # Inside a function body the class attribute `smo` is not in scope,
        # so this name resolves to the module-level function.
        print(d.d2h(smo(d)))

    def smo20():
        """Run the module-level `smo` 20 times under cProfile; print the profile, mu and sd."""
        import cProfile
        agains = 20
        d = DATA(csv(the.file), order=True)
        asIs, toBe = NUM(), NUM()
        for row in d.rows:
            asIs.add(d.d2h(row))
        pr = cProfile.Profile()
        pr.enable()
        for _ in range(agains):
            toBe.add(d.d2h(smo(d)))
        pr.disable()
        pr.print_stats(sort='time')
        print(show(dict(agains=agains,
                        mu=dict(asIs=asIs.mid(), toBe=toBe.mid()),
                        sd=dict(asIs=asIs.div(), toBe=toBe.div()))))
|
|
505
|
+
|
|
506
|
+
# --------------------------------------------
|
|
507
|
+
# MARK: Start-up
|
|
508
|
+
# Options are parsed from the module docstring, so the help text is the
# single source of the default values.
the = OBJ(**settings(__doc__))
if __name__ == "__main__": MAIN.main()
|
ezr-0.1.0/setup.cfg
ADDED
ezr-0.1.0/setup.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from setuptools import setup,find_packages
|
|
2
|
+
|
|
3
|
+
# Packaging metadata for the single-module distribution `ezr`.
setup(
    name='ezr',
    version='0.1.0',
    license="BSD2",
    py_modules=['ezr'],
    url='https://github.com/timm/ezr',
    author='Tim Menzies',
    author_email='timm@ieee.org',
    description='Semi-supervised explanations for incremental multi-objective optimization',
    # ezr.py uses assignment expressions (`:=`), which need Python 3.8 or newer.
    python_requires='>=3.8',
    install_requires=[],
    packages=find_packages(),
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: BSD License',
        'Development Status :: 2 - Pre-Alpha',
        'Operating System :: OS Independent',
    ],
    # Dictionary form of the same [console_scripts] entry; unlike the
    # INI-style string it does not depend on the text's leading whitespace.
    entry_points={
        'console_scripts': ['ezr=ezr:MAIN.main'],
    },
)
|