dsff 1.0.7__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsff/VERSION.txt +1 -1
- dsff/__info__.py +7 -2
- dsff/__init__.py +4 -292
- dsff/formats/__common__.py +33 -0
- dsff/formats/__init__.py +346 -0
- dsff/{arff.py → formats/arff.py} +51 -55
- dsff/{csv.py → formats/csv.py} +5 -11
- dsff/{dataset.py → formats/dataset.py} +4 -12
- dsff/formats/db.py +76 -0
- dsff/formats/pa.py +29 -0
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/METADATA +9 -10
- dsff-1.2.0.dist-info/RECORD +15 -0
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/WHEEL +1 -1
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info/licenses}/LICENSE +674 -674
- dsff/__common__.py +0 -20
- dsff-1.0.7.dist-info/RECORD +0 -12
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/top_level.txt +0 -0
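The 1.2.0 restructuring moves every format handler into a dsff/formats/ subpackage (keeping arff, csv and dataset, and adding a SQLite backend in db.py plus an optional pyarrow backend in pa.py), while dsff/__init__.py shrinks to a thin re-export. A minimal usage sketch, assuming the top-level package still exposes DSFF and relying only on the write() signature and the dynamically bound from_*/to_* converters visible in the diff below; file names such as "my_dataset" are illustrative:

from dsff import DSFF

# create a new DSFF file and populate it (saved as my_dataset.dsff on context exit)
with DSFF("my_dataset", mode="w") as f:
    f.write(data=[["name", "label"], ["sample1", "0"], ["sample2", "1"]],
            features={"name": "sample identifier", "label": "target class"},
            metadata={"author": "me"})

# converters such as to_csv/to_arff/to_db are bound onto the instance at __init__ time
with DSFF("my_dataset.dsff", mode="r") as f:
    print(f.to_csv(text=True))   # text=True returns the converted content instead of writing a file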
dsff/formats/__init__.py
ADDED
@@ -0,0 +1,346 @@
+# -*- coding: UTF-8 -*-
+import builtins
+import logging
+import openpyxl.reader.excel as excelr
+import types
+from ast import literal_eval
+from datetime import datetime
+from functools import wraps
+from getpass import getuser
+from openpyxl import load_workbook, Workbook
+from openpyxl.styles import Alignment, Font
+from openpyxl.utils import get_column_letter
+from zipfile import BadZipFile, ZipFile
+
+from .__common__ import *
+from .arff import *
+from .csv import *
+from .dataset import *
+from .db import *
+if PYARROW:  # pragma: no cover
+    from .pa import *
+
+
+__all__ = ["DSFF"]
+
+_FORMAT_TEXT_ALIAS = {'arff': "ARFF", 'csv': "CSV", 'db': "SQL", 'orc': "ORC"}
+_NO_EXT_FORMAT = ["dataset"]
+
+
+for name, etype in [("BadDsffFile", "OSError"), ("BadInputData", "ValueError"), ("EmptyDsffFile", "ValueError")]:
+    if not hasattr(builtins, name):
+        exec(f"class {name}({etype}): __module__ = 'builtins'")
+        setattr(builtins, name, locals()[name])
+
+
+def _bind_from(dsff):
+    def _adapt_name(f):
+        def _wrapper(*a, **kw):
+            path = None
+            if len(a) > 0:
+                path, a = a[0], a[1:]
+            elif 'path' in kw:
+                path = kw.pop('path')
+            path = _fix_path(path or dsff.name, f.__name__.split("_", 1)[1])
+            if path == INMEMORY:
+                raise ValueError("no path specified")
+            dsff.logger.debug(f"creating DSFF from {path}...")
+            return f(dsff, path, *a, **kw)
+        setattr(dsff, f.__name__, _wrapper)
+        return _wrapper
+    return _adapt_name
+
+
+def _bind_to(dsff):
+    def _is_empty(f):
+        def _wrapper(*a, **kw):
+            if len(dsff) == 0:
+                raise EmptyDsffFile("No data")
+            path = None
+            if len(a) > 0:
+                path, a = a[0], a[1:]
+            elif 'path' in kw:
+                path = kw.pop('path')
+            path = _fix_path(path or ("undefined" if dsff.name == INMEMORY else dsff.name),
+                             fmt := f.__name__.split("_", 1)[1])
+            dsff.logger.debug(f"converting DSFF to {path}..." if 'text' not in kw or kw['text'] else \
+                              f"converting DSFF to {_FORMAT_TEXT_ALIAS.get(fmt, fmt.capitalize())}...")
+            return f(dsff, path, *a, **kw)
+        setattr(dsff, f.__name__, _wrapper)
+        return _wrapper
+    return _is_empty
+
+
+def _fix_path(path, ext):
+    from os.path import expanduser, splitext
+    if path == INMEMORY:
+        return path
+    path = splitext(expanduser(path))[0]
+    if ext is not None and ext not in _NO_EXT_FORMAT and not path.endswith(f".{ext}"):
+        path += f".{ext}"
+    return path
+
+
+class DSFF:
+    """ DataSet File Format.
+
+        Modes: r   r+  w   w+
+        --------------------------
+        read     *   *       *
+        write        *   *   *
+        create           *   *
+        truncate         *   *
+
+    Important note: XSLX format has a limitation of 1,048,576 rows per sheet.
+    """
+    def __init__(self, path=None, mode=None, logger=None):
+        if mode is None:
+            mode = "rw"[path in [None, INMEMORY]]
+        if re.match(r"[rw]\+?$", mode) is None:
+            raise ValueError("Mode should be one of: r, r+, w, w+")
+        self.__change = False
+        self.__logger = logger or logging.getLogger("DSFF")
+        self.__name = None
+        self.__path = path
+        self.__mode = mode
+        # depending on the mode, bind the necessary methods
+        if mode in ["r+", "w", "w+"]:
+            self.save = types.MethodType(lambda dsff: dsff._DSFF__save(), self)
+            self.logger.debug("binding write methods")
+            for name, obj in globals().items():
+                if name.startswith("from_"):
+                    _bind_from(self)(obj)
+        self.logger.debug("binding read methods")
+        for name, obj in globals().items():
+            if name.startswith("to_"):
+                _bind_to(self)(obj)
+        # perform checks
+        if mode in ["r", "r+"]:
+            if path is None:
+                raise ValueError("No input path to a .dsff file provided")
+            if path != INMEMORY and not isfile(path):
+                raise FileNotFoundError("Input .dsff does not exist")
+        # if the target path exists and is a file, open it
+        if mode in ["r", "r+"] and path != INMEMORY:
+            # disable archive validation as it does not recognize '.dsff'
+            tmp = excelr._validate_archive
+            excelr._validate_archive = lambda f: ZipFile(f, 'r')
+            try:
+                self.__wb = load_workbook(path)
+            except BadZipFile:
+                raise BadDsffFile("File is not a DSFF file")
+            finally:
+                excelr._validate_archive = tmp
+            # check that the file has only 2 worksheets: 'data' and 'features'
+            if [ws._WorkbookChild__title for ws in self.__wb.worksheets] != ["data", "features"]:
+                raise BadDsffFile("File is not a DSFF file")
+            # check that the 'features' worksheet has 2 columns: 'name' and 'description'
+            for headers in self.__wb['features'].rows:
+                if len(headers) != 2 or headers[0].value != "name" or headers[1].value != "description":
+                    raise BadDsffFile("The features worksheet does not comply with DSFF")
+                break
+            return
+        # otherwise, create a new workbook with the default worksheets
+        if isfile(self.path):
+            remove(self.path)  # re-create
+        self.__wb = Workbook()
+        del self.__wb['Sheet']  # remove the default sheet
+        for ws in ["data", "features"]:
+            self.__wb.create_sheet(ws)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        self.__save()
+        self.close()
+
+    def __getitem__(self, name):
+        if name in ["data", "features"]:
+            return self.__wb[name]
+        # the common property 'description' is used to store the metadata of the dataset, hence 'description' can be
+        #  used as a key in the metadata but not from the common properties
+        try:
+            if name != "description":
+                return getattr(self.__wb.properties, name)
+        except AttributeError:
+            pass
+        return self.metadata[name]
+
+    def __len__(self):
+        return len(self.data)
+
+    def __setitem__(self, name, value):
+        if name in ["data", "features"]:
+            raise ValueError(f"'{name}' is a name reserved for a worksheet")
+        # see the note from __getitem__ related to 'description'
+        if hasattr(self.__wb.properties, name) and name != "description":
+            setattr(self.__wb.properties, name, value)
+        d = self.metadata
+        d[name] = value
+        self.__wb.properties.description = json.dumps(d)
+        self.__change = True
+
+    def __eval(self, v):
+        try:
+            return literal_eval(v)
+        except (SyntaxError, ValueError):
+            return v
+
+    def __save(self):
+        if self.mode == "r" or self.path == INMEMORY:
+            return
+        if self.__change:
+            props = self.__wb.properties
+            if props.creator is None or props.creator == "openpyxl":
+                props.creator = getuser()
+            props.title = self.name
+            props.description = self.metadata
+            if isfile(self.path) and self.mode.startswith("w"):
+                remove(self.path)
+            self.__wb.save(self.path)
+            self.__change = False
+
+    def _to_table(self):
+        if not PYARROW:
+            raise NotImplementedError("This method is available only wheh Pyarrow is installed")
+        headers = self.data[0]
+        try:
+            l = headers.index("label")
+        except ValueError:
+            l = -1
+        cols = [pyarrow.array(col if i != l else [[c, None][c == MISSING_TOKEN] for c in col]) \
+                for i, col in enumerate(zip(*self.data[1:]))]
+        meta = {k: v for k, v in self.features.items()}
+        meta['__metadata__'] = str(self.metadata).encode()
+        schema_with_meta = pyarrow.schema(pyarrow.Table.from_arrays(cols, names=headers).schema, metadata=meta)
+        return pyarrow.Table.from_arrays(cols, schema=schema_with_meta)
+
+    def close(self):
+        self.__wb.close()
+
+    def write(self, data=None, features=None, metadata=None, missing="?"):
+        """ Write data and/or features and/or metadata to the workbook.
+
+        :param data:     matrix of data (including headers) OR path to data.csv OR path to Dataset folder
+        :param features: dictionary of features' names and descriptions OR path to features.json
+        :param metadata: dictionary of dataset's metadata OR path to metadata.json
+        """
+        # get the cell coordinate from (X,Y) coordinates (e.g. (1,2) => "B1")
+        coord = lambda x, y: ws.cell(x+1, y+1).coordinate
+        # private function to auto-adjust column widths
+        def autoadjust(ws):
+            col_widths = []
+            for row in ws.rows:
+                if len(col_widths) == 0:
+                    col_widths = len(row) * [0]
+                for i, cell in enumerate(row):
+                    col_widths[i] = max(col_widths[i], len(str(cell.value)))
+            for i, w in enumerate(col_widths):
+                ws.column_dimensions[get_column_letter(i+1)].width = w
+        # if the first argument is a folder, assume it is a specific folder structure as follows:
+        #  [name]
+        #    +-- data.csv       # metadata and labels of the executable
+        #    +-- features.json  # dictionary of selected features and their descriptions
+        #    +-- metadata.json  # simple statistics about the dataset
+        if data is not None and not isinstance(data, (list, dict)) and isdir(expanduser(data)):
+            self.__path, d = self.__path or basename(data), expanduser(data)
+            data, features, metadata = join(d, "data.csv"), join(d, "features.json"), join(d, "metadata.json")
+        # handle data first
+        if data is not None:
+            self.__logger.debug("writing data to DSFF...")
+            ws, d = self.__wb['data'], data
+            if not isinstance(d, list):
+                if isfile(expanduser(d)) and splitext(d)[1] == ".csv":
+                    with open(expanduser(d)) as f:
+                        d = [r for r in csvmod.reader(f, delimiter=CSV_DELIMITER)]
+                else:
+                    raise BadInputData("'data' is not a list")
+            for r, row in enumerate(d):
+                for c, value in enumerate(row):
+                    c = coord(r, c)
+                    ws[c] = str({None: missing}.get(value, value))
+                    if r == 0:
+                        ws[c].alignment = Alignment(horizontal="center")
+                        ws[c].font = Font(bold=True)
+            autoadjust(ws)
+            self.__change = True
+        # then handle features dictionary
+        if len(features or {}) > 0:
+            self.__logger.debug("writing features to DSFF...")
+            ws, headers, d = self.__wb['features'], ["name", "description"], features
+            if not isinstance(d, dict):
+                if isfile(expanduser(d)) and basename(d) == "features.json":
+                    with open(expanduser(d)) as f:
+                        d = json.load(f)
+                else:
+                    raise BadInputData("'features' is not a dictionary")
+            try:
+                for c, header in enumerate(headers):
+                    c = coord(0, c)
+                    ws[c] = header
+                    ws[c].alignment = Alignment(horizontal="center")
+                    ws[c].font = Font(bold=True)
+                for r, pair in enumerate(d.items()):
+                    ws[coord(r+1, 0)] = pair[0]
+                    ws[coord(r+1, 1)] = pair[1]
+                autoadjust(ws)
+                self.__change = True
+            except Exception as e:
+                raise BadInputData(f"Unexpected error while parsing 'features' ({e})")
+        # finally handle metadata dictionary
+        if len(metadata or {}) > 0:
+            self.__logger.debug("writing metadata to DSFF...")
+            d = metadata
+            if not isinstance(d, dict):
+                if isfile(expanduser(d)) and basename(d) == "metadata.json":
+                    with open(expanduser(d)) as f:
+                        d = json.load(f)
+                else:
+                    raise BadInputData("'metadata' is not a dictionary")
+            try:
+                self.__wb.properties.description = json.dumps(d)
+                self.__change = True
+            except Exception as e:
+                raise BadInputData(f"Unexpected error while parsing 'metadata' ({e})")
+        self.__save()
+
+    @property
+    def data(self):
+        return [[self.__eval(c.value) for c in cells] for cells in self.__wb['data'].rows]
+
+    @property
+    def features(self):
+        return {cells[0].value: cells[1].value for i, cells in enumerate(self.__wb['features'].rows) if i > 0}
+
+    @property
+    def headers(self):
+        return list(self.features.keys())
+
+    @property
+    def logger(self):
+        return self.__logger
+
+    @property
+    def metadata(self):
+        return json.loads((self.__wb.properties.description or "{}").replace("'", "\""))
+
+    @property
+    def mode(self):
+        return self.__mode
+
+    @property
+    def name(self):
+        return self.__name or self.__wb.properties.title or splitext(basename(self.path))[0]
+
+    @name.setter
+    def name(self, name):
+        self.__name = name
+
+    @property
+    def path(self):
+        p = self.__path or "undefined"
+        if p != INMEMORY and not p.endswith(".dsff"):
+            p += ".dsff"
+        return p
+
dsff/{arff.py → formats/arff.py}
RENAMED
@@ -7,18 +7,12 @@ __all__ = ["from_arff", "to_arff"]
 
 def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
     """ Populate the DSFF file from an ARFF file. """
-
-    if not path.endswith(".arff"):
-        path += ".arff"
-    dsff.logger.debug("creating DSFF from ARFF file...")
-    d = []
+    d, features = [], {}
     with open(path) as f:
         relation, attributes, data = False, [False, False], False
         for n, l in enumerate(f, 1):
-            l = l.strip()
-            #
-            if l.startswith("#"):
-                continue
+            l, pf = l.strip(), f"Line {n}: "
+            # the file shall start with "@RELATION"
             if not relation:
                 if l.startswith("@RELATION "):
                     relation = True
@@ -26,9 +20,17 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
                         dsff['title'] = re.match(r"@RELATION\s+('[^']*'|\"[^\"]*\")$", l).group(1).strip("'\"")
                         continue
                     except Exception as e:
-                        raise BadInputData("
+                        raise BadInputData(f"{pf}failed on @RELATION ({e})")
                 else:
-                    raise BadInputData("
+                    raise BadInputData(f"{pf}did not find @RELATION")
+            # get metadata and feature descriptions from comments
+            if l.startswith("%"):
+                if re.match(r"^\%\s+metadata\s*\:\s*\{.*\}$", l):
+                    dsff.write(metadata=literal_eval(l.split(":", 1)[1]))
+                elif (m := re.match(r"^\%\s+(.*?)\s*\:\s*(.*?)$", l)):
+                    name, descr = m.groups()
+                    features[name] = descr
+                continue
             # then ignore blank lines
             if l == "":
                 if attributes[0] and not attributes[1]:
@@ -43,7 +45,7 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
                     # start the attributes block
                     d.append([])
                 if attributes[1]:
-                    raise BadInputData("
+                    raise BadInputData(f"{pf}found @ATTRIBUTE out of the attributes block)")
                 try:
                     header = re.match(r"@ATTRIBUTE\s+([^\s]+)\s+[A-Z]+$", l).group(1)
                     if header == "class":
@@ -51,16 +53,16 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
                     d[0].append(header)
                     continue
                 except AttributeError:
-                    raise BadInputData("
+                    raise BadInputData(f"{pf}failed on @ATTRIBUTE (bad type)")
             if not data:
                 if l == "@DATA":
                     data = True
                    continue
                else:
-                    raise BadInputData("
+                    raise BadInputData(f"{pf}did not find @DATA where expected")
             row = list(map(lambda x: x.strip("'\""), re.split(r",\s+", l)))
             if len(row) != n_cols:
-                raise BadInputData("
+                raise BadInputData(f"{pf}this row does not match the number of columns")
             d.append(row)
     for i in range(n_cols):
        values = []
@@ -75,23 +77,14 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
        for j, row in enumerate(d):
            if j > 0:
                row[i] = {'0': "False", '1': "True"}[row[i]]
-    dsff.write(d)
-    features = {}
-    for headers in dsff['data'].rows:
-        for header in headers:
-            features[header.value] = ""
-        break
+    dsff.write(data=d)
     dsff.write(features=features)
 
 
 def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=MISSING_TOKEN, text=False):
     """ Output the dataset in ARFF format, suitable for use with the Weka framework, saved as a file or output as a
         string. """
-    path = splitext(expanduser(path or dsff.name))[0]
-    if not path.endswith(".arff"):
-        path += ".arff"
     name = splitext(basename(path))[0]
-    dsff.logger.debug("extracting data from DSFF to ARFF file...")
     _d = lambda c: {None: missing, '': "-", 'False': "0", 'True': "1"}.get(c.value, c.value)
     _sanitize_n = lambda c: _d(c).replace("<", "[lt]").replace(">", "[gt]")
     _sanitize_v = lambda c: _d(c)[1:].strip("\"'") if _d(c).startswith("=") else _d(c)
@@ -119,37 +112,40 @@ def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=M
     types = []
     # parse labels, data types and relevant data
     for i, row in enumerate(data.rows):
-        if i
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if i == 0:
+            continue  # do not process headers
+        if len(types) == 0:
+            labels = ["0", "1"] if i_target == -1 else \
+                     list(set(_sanitize_v(row[i_target]) for k, row in enumerate(data) if k > 0))
+            labels = [x for x in labels if x != missing]
+            # compute types
+            for j, cell in enumerate(row):
+                v = _sanitize_v(cell)
+                try:
+                    float(v)
+                    t = "NUMERIC"
+                except ValueError:
+                    t = "STRING"
+                types.append(t)
+            # filter data types based on the relevant columns
+            types = [t for k, t in enumerate(types) if k != i_target and k not in h_excl] + \
+                    [types[i_target] if i_target > -1 else []]
+            # compute the list of ARFF attribute lines based on the column names and data types
+            a = [("@ATTRIBUTE {: <%s} {}" % mlen_h).format("class" if j == len(types)-1 else \
+                 _sanitize_n(headers[j]), t) for j, t in enumerate(types)]
+            mlen_c = [0] * len(types)
+        # filter data based on the relevant columns
+        row = [_sanitize_v(x) for k, x in enumerate(row) if k != i_target and k not in h_excl] + \
+              ([_sanitize_v(row[i_target])] if i_target > -1 else [])
+        # compute the maximum length for each column
+        mlen_c = [max(x, len(row[k]) if types[k] == "NUMERIC" else len(row[k])+2) for k, x in enumerate(mlen_c)]
+        d.append(row)
     # format the resulting data and output the ARFF
-    d = "\n".join(" ".join(("{: <%s}" % (mlen_c[k]+1)).format((x if types[k] == "NUMERIC" or
-
-    arff = "@RELATION \"{}\"\n\n{}\n\n@DATA\n{}
+    d = (nl := "\n").join(" ".join(("{: <%s}" % (mlen_c[k]+1)).format((x if types[k] == "NUMERIC" or \
+        x == MISSING_TOKEN else "'%s'" % x) + ",") for k, x in enumerate(row)).rstrip(" ,") for row in d)
+    arff = f"@RELATION \"{name}\"\n\n{nl.join(a)}\n\n@DATA\n{d}\n\n" \
+           f"{['', f'% metadata: {json.dumps(dsff.metadata)}'][len(dsff.metadata) > 0]}\n\n" \
+           f"{nl.join(f'% {name}: {descr}' for name, descr in dsff.features.items())}"
     if text:
         return arff
     with open(path, 'w+') as f:
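Beyond the per-line error messages, the reworked to_arff/from_arff pair now round-trips the dataset metadata and the feature descriptions through ARFF % comments. A hedged sketch of how that surfaces in practice, assuming an existing "dataset.dsff" (the path is illustrative):

from dsff import DSFF

with DSFF("dataset.dsff", mode="r") as f:
    arff_text = f.to_arff(text=True)
# judging from the code above, the output ends with comment lines such as:
#   % metadata: {...}                  <- matched back by the metadata regex in from_arff
#   % some_feature: its description    <- one "% <name>: <description>" line per feature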
dsff/{csv.py → formats/csv.py}
RENAMED
@@ -6,15 +6,13 @@ __all__ = ["from_csv", "to_csv"]
 
 
 def from_csv(dsff, path=None, exclude=DEFAULT_EXCL):
-    """ Populate the DSFF file from
-    path = expanduser(path or dsff.name)
-    if not path.endswith(".csv"):
-        path += ".csv"
-    dsff.logger.debug("creating DSFF from CSV file...")
+    """ Populate the DSFF file from a CSV file. """
     dsff.write(path)
     features = {}
     for headers in dsff['data'].rows:
         for header in headers:
+            if header.value in exclude:
+                continue
             features[header.value] = ""
         break
     dsff.write(features=features)
@@ -22,14 +20,10 @@ def from_csv(dsff, path=None, exclude=DEFAULT_EXCL):
 
 def to_csv(dsff, path=None, text=False):
     """ Create a CSV from the data worksheet, saved as a file or output as a string. """
-    path = splitext(expanduser(path or dsff.name))[0]
-    if not path.endswith(".csv"):
-        path += ".csv"
-    dsff.logger.debug("extracting data from DSFF to CSV file...")
     with (StringIO() if text else open(path, 'w+')) as f:
         writer = csvmod.writer(f, delimiter=";")
-        for
-            writer.writerow(
+        for row in dsff.data:
+            writer.writerow(row)
         if text:
            return f.getvalue()
 
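The csv handler's main functional change is the exclude filter in from_csv: headers listed in exclude get no entry when the features worksheet is derived from the CSV header row. A small sketch, assuming an illustrative "samples.csv" whose first row holds the column names:

from dsff import DSFF

with DSFF("samples", mode="w") as f:
    f.from_csv("samples.csv")   # columns named in DEFAULT_EXCL are skipped in the features sheet
    print(f.headers)            # remaining feature names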
dsff/{dataset.py → formats/dataset.py}
RENAMED
@@ -6,31 +6,23 @@ __all__ = ["from_dataset", "to_dataset"]
 
 
 def from_dataset(dsff, path=None):
-    """ Populate the DSFF file from
-    path = expanduser(path or dsff.name)
-    dsff.logger.debug("creating DSFF from (Fileless)Dataset folder...")
+    """ Populate the DSFF file from a Dataset structure. """
     if not isdir(path):
         raise BadInputData("Not a folder")
     else:
-        missing
-
-            if not isfile(join(path, f)):
-                missing.append(f)
-        if len(missing) > 0:
-            raise BadInputData("Not a valid dataset folder (missing: %s)" % ", ".join(missing))
+        if len(missing := [f for f in ["data.csv", "features.json", "metadata.json"] if not isfile(join(path, f))]) > 0:
+            raise BadInputData(f"Not a valid dataset folder (missing: {', '.join(missing)})")
     dsff.write(path)
 
 
 def to_dataset(dsff, path=None):
-    """ Create a dataset folder according to the
+    """ Create a dataset folder according to the following structure ;
         name
           +-- data.csv
          +-- features.json
          +-- metadata.json
    """
-    path = splitext(expanduser(path or dsff.name))[0]
     makedirs(path, exist_ok=True)
-    dsff.logger.debug("converting DSFF to (Fileless)Dataset folder...")
     # handle data first
     dsff.logger.debug("> making data.csv...")
     with open(join(path, "data.csv"), 'w+') as f:
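to_dataset is the symmetric counterpart of from_dataset: it dumps the workbook back into the data.csv / features.json / metadata.json layout shown in the docstring. A one-liner sketch with illustrative paths:

from dsff import DSFF

with DSFF("dataset.dsff", mode="r") as f:
    f.to_dataset("my_dataset")   # creates my_dataset/ holding data.csv, features.json and metadata.json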
dsff/formats/db.py
ADDED
@@ -0,0 +1,76 @@
+# -*- coding: UTF-8 -*-
+from .__common__ import *
+
+
+__all__ = ["from_db", "to_db"]
+
+
+def from_db(dsff, path=None, exclude=DEFAULT_EXCL):
+    """ Populate the DSFF file from a SQLDB file. """
+    from json import loads
+    from sqlite3 import connect
+    conn = connect(path)
+    cursor = conn.cursor()
+    # list tables
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    tables = [table[0] for table in cursor.fetchall()]
+    if not all(t in tables for t in ["data", "features", "metadata"]):  # pragma: no cover
+        raise BadInputData("The target SQLDB does not have the right format")
+    # import data
+    cursor.execute("PRAGMA table_info('data')")
+    headers = [[col[1] for col in cursor.fetchall()]]
+    cursor.execute("SELECT * FROM data;")
+    dsff.write(headers + [r for r in cursor.fetchall()])
+    # import feature definitions
+    cursor.execute("SELECT name,description FROM features;")
+    dsff.write(features={r[0]: r[1] for r in cursor.fetchall()})
+    # import metadata
+    cursor.execute("SELECT key,value FROM metadata;")
+    dsff.write(metadata={r[0]: loads(r[1]) if isinstance(r[1], str) else r[1] for r in cursor.fetchall()})
+    conn.close()
+
+
+def to_db(dsff, path=None, text=False, primary_index=0):
+    """ Create a SQLDB from the data worksheet, saved as a file or output as a string. """
+    from json import dumps
+    from sqlite3 import connect
+    fields = []
+    rows = (data := dsff['data']).rows
+    headers, first = [c.value for c in next(rows)], next(rows)
+    for i, pair in enumerate(zip(headers, first)):
+        header, cell = pair
+        try:
+            dtype = {int: "INTEGER", float: "REAL", bool: "INTEGER"}[type(dsff._DSFF__eval(cell.value))]
+        except (KeyError, ValueError):
+            dtype = "TEXT"
+        fields.append(f"{header} {dtype}{['',' PRIMARY KEY'][i==primary_index]}")
+    # create the database
+    conn = connect(":memory:" if text else path)
+    cursor = conn.cursor()
+    # create and populate the data table
+    cursor.execute("CREATE TABLE IF NOT EXISTS data ({fields});" \
+                   .format(fields="\n ".join(f"{f}," for f in fields).rstrip(",")))
+    cursor.executemany("INSERT INTO data ({fields}) VALUES ({tokens});"\
+                       .format(fields=",".join(headers), tokens=",".join(["?"]*len(headers))),
+                       [[v.value for v in row] for i, row in enumerate(data.rows) if i > 0])
+    # create and populate the features table
+    cursor.execute("CREATE TABLE IF NOT EXISTS features (name TEXT PRIMARY KEY, description TEXT);")
+    cursor.executemany("INSERT INTO features (name, description) VALUES (?, ?);",
+                       [(r[0].value, r[1].value) for i, r in enumerate(dsff['features'].rows) if i > 0])
+    # create and populate the metadata table
+    cursor.execute("CREATE TABLE IF NOT EXISTS metadata (key TEXT PRIMARY KEY, value JSON);")
+    cursor.executemany("INSERT INTO metadata (key, value) VALUES (?, ?);",
+                       [(k, dumps(v)) for k, v in dsff.metadata.items()])
+    conn.commit()
+    if text:
+        sql = {}
+        # extract SQL code
+        cursor.execute(f"SELECT sql FROM sqlite_master WHERE type='table';")
+        sql['table'] = "\n".join(row[0] for row in cursor.fetchall())
+        for t in ["data", "features", "metadata"]:
+            cursor.execute(f"SELECT * FROM {t};")
+            sql[t] = "\n".join(f"INSERT INTO {t} VALUES ('{row[0]}', '{row[1]}');" for row in cursor.fetchall())
+        # combine all SQL
+        return "\n".join(sql.values())
+    conn.close()
+
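The new db backend keeps the same three-part layout in SQLite: a data table whose column types are inferred from the first data row, plus features(name, description) and metadata(key, value) tables; with text=True, to_db returns the CREATE TABLE/INSERT statements instead of writing a file. A round-trip sketch under those assumptions (paths and names are illustrative):

from dsff import DSFF

with DSFF("dataset.dsff", mode="r") as f:
    f.to_db("dataset.db")        # materialise the data/features/metadata tables on disk
    print(f.to_db(text=True))    # or dump the equivalent SQL statements as a string

with DSFF("rebuilt", mode="w") as f:
    f.from_db("dataset.db")      # repopulate data, features and metadata from the tables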