dsff 1.0.7__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,346 @@
+ # -*- coding: UTF-8 -*-
+ import builtins
+ import logging
+ import openpyxl.reader.excel as excelr
+ import types
+ from ast import literal_eval
+ from datetime import datetime
+ from functools import wraps
+ from getpass import getuser
+ from openpyxl import load_workbook, Workbook
+ from openpyxl.styles import Alignment, Font
+ from openpyxl.utils import get_column_letter
+ from zipfile import BadZipFile, ZipFile
+
+ from .__common__ import *
+ from .arff import *
+ from .csv import *
+ from .dataset import *
+ from .db import *
+ if PYARROW: # pragma: no cover
+     from .pa import *
+
+
+ __all__ = ["DSFF"]
+
+ _FORMAT_TEXT_ALIAS = {'arff': "ARFF", 'csv': "CSV", 'db': "SQL", 'orc': "ORC"}
+ _NO_EXT_FORMAT = ["dataset"]
+
+
+ for name, etype in [("BadDsffFile", "OSError"), ("BadInputData", "ValueError"), ("EmptyDsffFile", "ValueError")]:
+     if not hasattr(builtins, name):
+         exec(f"class {name}({etype}): __module__ = 'builtins'")
+         setattr(builtins, name, locals()[name])
+
+
+ def _bind_from(dsff):
+     def _adapt_name(f):
+         def _wrapper(*a, **kw):
+             path = None
+             if len(a) > 0:
+                 path, a = a[0], a[1:]
+             elif 'path' in kw:
+                 path = kw.pop('path')
+             path = _fix_path(path or dsff.name, f.__name__.split("_", 1)[1])
+             if path == INMEMORY:
+                 raise ValueError("no path specified")
+             dsff.logger.debug(f"creating DSFF from {path}...")
+             return f(dsff, path, *a, **kw)
+         setattr(dsff, f.__name__, _wrapper)
+         return _wrapper
+     return _adapt_name
+
+
+ def _bind_to(dsff):
+     def _is_empty(f):
+         def _wrapper(*a, **kw):
+             if len(dsff) == 0:
+                 raise EmptyDsffFile("No data")
+             path = None
+             if len(a) > 0:
+                 path, a = a[0], a[1:]
+             elif 'path' in kw:
+                 path = kw.pop('path')
+             path = _fix_path(path or ("undefined" if dsff.name == INMEMORY else dsff.name),
+                              fmt := f.__name__.split("_", 1)[1])
+             dsff.logger.debug(f"converting DSFF to {path}..." if 'text' not in kw or kw['text'] else \
+                               f"converting DSFF to {_FORMAT_TEXT_ALIAS.get(fmt, fmt.capitalize())}...")
+             return f(dsff, path, *a, **kw)
+         setattr(dsff, f.__name__, _wrapper)
+         return _wrapper
+     return _is_empty
+
+
+ def _fix_path(path, ext):
+     from os.path import expanduser, splitext
+     if path == INMEMORY:
+         return path
+     path = splitext(expanduser(path))[0]
+     if ext is not None and ext not in _NO_EXT_FORMAT and not path.endswith(f".{ext}"):
+         path += f".{ext}"
+     return path
+
+
+ class DSFF:
+     """ DataSet File Format.
+
+     Modes:     r   r+   w   w+
+     --------------------------
+     read       *   *         *
+     write          *    *    *
+     create               *   *
+     truncate             *   *
+
+     Important note: the XLSX format has a limitation of 1,048,576 rows per sheet.
+     """
+     def __init__(self, path=None, mode=None, logger=None):
+         if mode is None:
+             mode = "rw"[path in [None, INMEMORY]]
+         if re.match(r"[rw]\+?$", mode) is None:
+             raise ValueError("Mode should be one of: r, r+, w, w+")
+         self.__change = False
+         self.__logger = logger or logging.getLogger("DSFF")
+         self.__name = None
+         self.__path = path
+         self.__mode = mode
+         # depending on the mode, bind the necessary methods
+         if mode in ["r+", "w", "w+"]:
+             self.save = types.MethodType(lambda dsff: dsff._DSFF__save(), self)
+             self.logger.debug("binding write methods")
+             for name, obj in globals().items():
+                 if name.startswith("from_"):
+                     _bind_from(self)(obj)
+         self.logger.debug("binding read methods")
+         for name, obj in globals().items():
+             if name.startswith("to_"):
+                 _bind_to(self)(obj)
+         # perform checks
+         if mode in ["r", "r+"]:
+             if path is None:
+                 raise ValueError("No input path to a .dsff file provided")
+             if path != INMEMORY and not isfile(path):
+                 raise FileNotFoundError("Input .dsff does not exist")
+         # if the target path exists and is a file, open it
+         if mode in ["r", "r+"] and path != INMEMORY:
+             # disable archive validation as it does not recognize '.dsff'
+             tmp = excelr._validate_archive
+             excelr._validate_archive = lambda f: ZipFile(f, 'r')
+             try:
+                 self.__wb = load_workbook(path)
+             except BadZipFile:
+                 raise BadDsffFile("File is not a DSFF file")
+             finally:
+                 excelr._validate_archive = tmp
+             # check that the file has only 2 worksheets: 'data' and 'features'
+             if [ws._WorkbookChild__title for ws in self.__wb.worksheets] != ["data", "features"]:
+                 raise BadDsffFile("File is not a DSFF file")
+             # check that the 'features' worksheet has 2 columns: 'name' and 'description'
+             for headers in self.__wb['features'].rows:
+                 if len(headers) != 2 or headers[0].value != "name" or headers[1].value != "description":
+                     raise BadDsffFile("The features worksheet does not comply with DSFF")
+                 break
+             return
+         # otherwise, create a new workbook with the default worksheets
+         if isfile(self.path):
+             remove(self.path) # re-create
+         self.__wb = Workbook()
+         del self.__wb['Sheet'] # remove the default sheet
+         for ws in ["data", "features"]:
+             self.__wb.create_sheet(ws)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, *args):
+         self.__save()
+         self.close()
+
+     def __getitem__(self, name):
+         if name in ["data", "features"]:
+             return self.__wb[name]
+         # the common property 'description' is used to store the metadata of the dataset, hence 'description' can be
+         # used as a key in the metadata but not from the common properties
+         try:
+             if name != "description":
+                 return getattr(self.__wb.properties, name)
+         except AttributeError:
+             pass
+         return self.metadata[name]
+
+     def __len__(self):
+         return len(self.data)
+
+     def __setitem__(self, name, value):
+         if name in ["data", "features"]:
+             raise ValueError(f"'{name}' is a name reserved for a worksheet")
+         # see the note from __getitem__ related to 'description'
+         if hasattr(self.__wb.properties, name) and name != "description":
+             setattr(self.__wb.properties, name, value)
+         d = self.metadata
+         d[name] = value
+         self.__wb.properties.description = json.dumps(d)
+         self.__change = True
+
+     def __eval(self, v):
+         try:
+             return literal_eval(v)
+         except (SyntaxError, ValueError):
+             return v
+
+     def __save(self):
+         if self.mode == "r" or self.path == INMEMORY:
+             return
+         if self.__change:
+             props = self.__wb.properties
+             if props.creator is None or props.creator == "openpyxl":
+                 props.creator = getuser()
+             props.title = self.name
+             props.description = self.metadata
+             if isfile(self.path) and self.mode.startswith("w"):
+                 remove(self.path)
+             self.__wb.save(self.path)
+             self.__change = False
+
+     def _to_table(self):
+         if not PYARROW:
+             raise NotImplementedError("This method is available only when PyArrow is installed")
+         headers = self.data[0]
+         try:
+             l = headers.index("label")
+         except ValueError:
+             l = -1
+         cols = [pyarrow.array(col if i != l else [[c, None][c == MISSING_TOKEN] for c in col]) \
+                 for i, col in enumerate(zip(*self.data[1:]))]
+         meta = {k: v for k, v in self.features.items()}
+         meta['__metadata__'] = str(self.metadata).encode()
+         schema_with_meta = pyarrow.schema(pyarrow.Table.from_arrays(cols, names=headers).schema, metadata=meta)
+         return pyarrow.Table.from_arrays(cols, schema=schema_with_meta)
+
+     def close(self):
+         self.__wb.close()
+
+     def write(self, data=None, features=None, metadata=None, missing="?"):
+         """ Write data and/or features and/or metadata to the workbook.
+
+         :param data: matrix of data (including headers) OR path to data.csv OR path to Dataset folder
+         :param features: dictionary of features' names and descriptions OR path to features.json
+         :param metadata: dictionary of dataset's metadata OR path to metadata.json
+         """
+         # get the cell coordinate from (X,Y) coordinates (e.g. (1,2) => "B1")
+         coord = lambda x, y: ws.cell(x+1, y+1).coordinate
+         # private function to auto-adjust column widths
+         def autoadjust(ws):
+             col_widths = []
+             for row in ws.rows:
+                 if len(col_widths) == 0:
+                     col_widths = len(row) * [0]
+                 for i, cell in enumerate(row):
+                     col_widths[i] = max(col_widths[i], len(str(cell.value)))
+             for i, w in enumerate(col_widths):
+                 ws.column_dimensions[get_column_letter(i+1)].width = w
+         # if the first argument is a folder, assume it is a specific folder structure as follows:
+         # [name]
+         #   +-- data.csv       # metadata and labels of the executable
+         #   +-- features.json  # dictionary of selected features and their descriptions
+         #   +-- metadata.json  # simple statistics about the dataset
+         if data is not None and not isinstance(data, (list, dict)) and isdir(expanduser(data)):
+             self.__path, d = self.__path or basename(data), expanduser(data)
+             data, features, metadata = join(d, "data.csv"), join(d, "features.json"), join(d, "metadata.json")
+         # handle data first
+         if data is not None:
+             self.__logger.debug("writing data to DSFF...")
+             ws, d = self.__wb['data'], data
+             if not isinstance(d, list):
+                 if isfile(expanduser(d)) and splitext(d)[1] == ".csv":
+                     with open(expanduser(d)) as f:
+                         d = [r for r in csvmod.reader(f, delimiter=CSV_DELIMITER)]
+                 else:
+                     raise BadInputData("'data' is not a list")
+             for r, row in enumerate(d):
+                 for c, value in enumerate(row):
+                     c = coord(r, c)
+                     ws[c] = str({None: missing}.get(value, value))
+                     if r == 0:
+                         ws[c].alignment = Alignment(horizontal="center")
+                         ws[c].font = Font(bold=True)
+             autoadjust(ws)
+             self.__change = True
+         # then handle features dictionary
+         if len(features or {}) > 0:
+             self.__logger.debug("writing features to DSFF...")
+             ws, headers, d = self.__wb['features'], ["name", "description"], features
+             if not isinstance(d, dict):
+                 if isfile(expanduser(d)) and basename(d) == "features.json":
+                     with open(expanduser(d)) as f:
+                         d = json.load(f)
+                 else:
+                     raise BadInputData("'features' is not a dictionary")
+             try:
+                 for c, header in enumerate(headers):
+                     c = coord(0, c)
+                     ws[c] = header
+                     ws[c].alignment = Alignment(horizontal="center")
+                     ws[c].font = Font(bold=True)
+                 for r, pair in enumerate(d.items()):
+                     ws[coord(r+1, 0)] = pair[0]
+                     ws[coord(r+1, 1)] = pair[1]
+                 autoadjust(ws)
+                 self.__change = True
+             except Exception as e:
+                 raise BadInputData(f"Unexpected error while parsing 'features' ({e})")
+         # finally handle metadata dictionary
+         if len(metadata or {}) > 0:
+             self.__logger.debug("writing metadata to DSFF...")
+             d = metadata
+             if not isinstance(d, dict):
+                 if isfile(expanduser(d)) and basename(d) == "metadata.json":
+                     with open(expanduser(d)) as f:
+                         d = json.load(f)
+                 else:
+                     raise BadInputData("'metadata' is not a dictionary")
+             try:
+                 self.__wb.properties.description = json.dumps(d)
+                 self.__change = True
+             except Exception as e:
+                 raise BadInputData(f"Unexpected error while parsing 'metadata' ({e})")
+         self.__save()
+
+     @property
+     def data(self):
+         return [[self.__eval(c.value) for c in cells] for cells in self.__wb['data'].rows]
+
+     @property
+     def features(self):
+         return {cells[0].value: cells[1].value for i, cells in enumerate(self.__wb['features'].rows) if i > 0}
+
+     @property
+     def headers(self):
+         return list(self.features.keys())
+
+     @property
+     def logger(self):
+         return self.__logger
+
+     @property
+     def metadata(self):
+         return json.loads((self.__wb.properties.description or "{}").replace("'", "\""))
+
+     @property
+     def mode(self):
+         return self.__mode
+
+     @property
+     def name(self):
+         return self.__name or self.__wb.properties.title or splitext(basename(self.path))[0]
+
+     @name.setter
+     def name(self, name):
+         self.__name = name
+
+     @property
+     def path(self):
+         p = self.__path or "undefined"
+         if p != INMEMORY and not p.endswith(".dsff"):
+             p += ".dsff"
+         return p
+
@@ -7,18 +7,12 @@ __all__ = ["from_arff", "to_arff"]
 
 def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 """ Populate the DSFF file from an ARFF file. """
- path = expanduser(path or dsff.name)
- if not path.endswith(".arff"):
- path += ".arff"
- dsff.logger.debug("creating DSFF from ARFF file...")
- d = []
+ d, features = [], {}
 with open(path) as f:
 relation, attributes, data = False, [False, False], False
 for n, l in enumerate(f, 1):
- l = l.strip()
- # ignore comments before @RELATION
- if l.startswith("#"):
- continue
+ l, pf = l.strip(), f"Line {n}: "
+ # the file shall start with "@RELATION"
 if not relation:
 if l.startswith("@RELATION "):
 relation = True
@@ -26,9 +20,17 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 dsff['title'] = re.match(r"@RELATION\s+('[^']*'|\"[^\"]*\")$", l).group(1).strip("'\"")
 continue
 except Exception as e:
- raise BadInputData("Line %d: failed on @RELATION (%s)" % (n, e))
+ raise BadInputData(f"{pf}failed on @RELATION ({e})")
 else:
- raise BadInputData("Line %d: did not find @RELATION" % n)
+ raise BadInputData(f"{pf}did not find @RELATION")
+ # get metadata and feature descriptions from comments
+ if l.startswith("%"):
+ if re.match(r"^\%\s+metadata\s*\:\s*\{.*\}$", l):
+ dsff.write(metadata=literal_eval(l.split(":", 1)[1]))
+ elif (m := re.match(r"^\%\s+(.*?)\s*\:\s*(.*?)$", l)):
+ name, descr = m.groups()
+ features[name] = descr
+ continue
 # then ignore blank lines
 if l == "":
 if attributes[0] and not attributes[1]:
@@ -43,7 +45,7 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 # start the attributes block
 d.append([])
 if attributes[1]:
- raise BadInputData("Line %d: found @ATTRIBUTE out of the attributes block)" % n)
+ raise BadInputData(f"{pf}found @ATTRIBUTE out of the attributes block)")
 try:
 header = re.match(r"@ATTRIBUTE\s+([^\s]+)\s+[A-Z]+$", l).group(1)
 if header == "class":
@@ -51,16 +53,16 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 d[0].append(header)
 continue
 except AttributeError:
- raise BadInputData("Line %d: failed on @ATTRIBUTE (bad type)" % n)
+ raise BadInputData(f"{pf}failed on @ATTRIBUTE (bad type)")
 if not data:
 if l == "@DATA":
 data = True
 continue
 else:
- raise BadInputData("Line %d: did not find @DATA where expected" % n)
+ raise BadInputData(f"{pf}did not find @DATA where expected")
 row = list(map(lambda x: x.strip("'\""), re.split(r",\s+", l)))
 if len(row) != n_cols:
- raise BadInputData("Line %d: this row does not match the number of columns" % n)
+ raise BadInputData(f"{pf}this row does not match the number of columns")
 d.append(row)
 for i in range(n_cols):
 values = []
@@ -75,23 +77,14 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 for j, row in enumerate(d):
 if j > 0:
 row[i] = {'0': "False", '1': "True"}[row[i]]
- dsff.write(d)
- features = {}
- for headers in dsff['data'].rows:
- for header in headers:
- features[header.value] = ""
- break
+ dsff.write(data=d)
 dsff.write(features=features)
 
 
 def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=MISSING_TOKEN, text=False):
 """ Output the dataset in ARFF format, suitable for use with the Weka framework, saved as a file or output as a
 string. """
- path = splitext(expanduser(path or dsff.name))[0]
- if not path.endswith(".arff"):
- path += ".arff"
 name = splitext(basename(path))[0]
- dsff.logger.debug("extracting data from DSFF to ARFF file...")
 _d = lambda c: {None: missing, '': "-", 'False': "0", 'True': "1"}.get(c.value, c.value)
 _sanitize_n = lambda c: _d(c).replace("<", "[lt]").replace(">", "[gt]")
 _sanitize_v = lambda c: _d(c)[1:].strip("\"'") if _d(c).startswith("=") else _d(c)
@@ -119,37 +112,40 @@ def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=M
 types = []
 # parse labels, data types and relevant data
 for i, row in enumerate(data.rows):
- if i > 0:
- if len(types) == 0:
- labels = ["0", "1"] if i_target == -1 else \
- list(set(_sanitize_v(row[i_target]) for k, row in enumerate(data) if k > 0))
- labels = [x for x in labels if x != missing]
- # compute types
- for j, cell in enumerate(row):
- v = _sanitize_v(cell)
- try:
- float(v)
- t = "NUMERIC"
- except ValueError:
- t = "STRING"
- types.append(t)
- # filter data types based on the relevant columns
- types = [t for k, t in enumerate(types) if k != i_target and k not in h_excl] + \
- [types[i_target] if i_target > -1 else []]
- # compute the list of ARFF attribute lines based on the column names and data types
- a = [("@ATTRIBUTE {: <%s} {}" % mlen_h).format("class" if j == len(types)-1 else \
- _sanitize_n(headers[j]), t) for j, t in enumerate(types)]
- mlen_c = [0] * len(types)
- # filter data based on the relevant columns
- row = [_sanitize_v(x) for k, x in enumerate(row) if k != i_target and k not in h_excl] + \
- ([_sanitize_v(row[i_target])] if i_target > -1 else [])
- # compute the maximum length for each column
- mlen_c = [max(x, len(row[k]) if types[k] == "NUMERIC" else len(row[k])+2) for k, x in enumerate(mlen_c)]
- d.append(row)
+ if i == 0:
+ continue # do not process headers
+ if len(types) == 0:
+ labels = ["0", "1"] if i_target == -1 else \
+ list(set(_sanitize_v(row[i_target]) for k, row in enumerate(data) if k > 0))
+ labels = [x for x in labels if x != missing]
+ # compute types
+ for j, cell in enumerate(row):
+ v = _sanitize_v(cell)
+ try:
+ float(v)
+ t = "NUMERIC"
+ except ValueError:
+ t = "STRING"
+ types.append(t)
+ # filter data types based on the relevant columns
+ types = [t for k, t in enumerate(types) if k != i_target and k not in h_excl] + \
+ [types[i_target] if i_target > -1 else []]
+ # compute the list of ARFF attribute lines based on the column names and data types
+ a = [("@ATTRIBUTE {: <%s} {}" % mlen_h).format("class" if j == len(types)-1 else \
+ _sanitize_n(headers[j]), t) for j, t in enumerate(types)]
+ mlen_c = [0] * len(types)
+ # filter data based on the relevant columns
+ row = [_sanitize_v(x) for k, x in enumerate(row) if k != i_target and k not in h_excl] + \
+ ([_sanitize_v(row[i_target])] if i_target > -1 else [])
+ # compute the maximum length for each column
+ mlen_c = [max(x, len(row[k]) if types[k] == "NUMERIC" else len(row[k])+2) for k, x in enumerate(mlen_c)]
+ d.append(row)
 # format the resulting data and output the ARFF
- d = "\n".join(" ".join(("{: <%s}" % (mlen_c[k]+1)).format((x if types[k] == "NUMERIC" or x == MISSING_TOKEN else \
- "'%s'" % x) + ",") for k, x in enumerate(row)).rstrip(" ,") for row in d)
- arff = "@RELATION \"{}\"\n\n{}\n\n@DATA\n{}".format(name, "\n".join(a), d)
+ d = (nl := "\n").join(" ".join(("{: <%s}" % (mlen_c[k]+1)).format((x if types[k] == "NUMERIC" or \
+ x == MISSING_TOKEN else "'%s'" % x) + ",") for k, x in enumerate(row)).rstrip(" ,") for row in d)
+ arff = f"@RELATION \"{name}\"\n\n{nl.join(a)}\n\n@DATA\n{d}\n\n" \
+ f"{['', f'% metadata: {json.dumps(dsff.metadata)}'][len(dsff.metadata) > 0]}\n\n" \
+ f"{nl.join(f'% {name}: {descr}' for name, descr in dsff.features.items())}"
 if text:
 return arff
 with open(path, 'w+') as f:
@@ -6,15 +6,13 @@ __all__ = ["from_csv", "to_csv"]
 
 
 def from_csv(dsff, path=None, exclude=DEFAULT_EXCL):
- """ Populate the DSFF file from an ARFF file. """
- path = expanduser(path or dsff.name)
- if not path.endswith(".csv"):
- path += ".csv"
- dsff.logger.debug("creating DSFF from CSV file...")
+ """ Populate the DSFF file from a CSV file. """
 dsff.write(path)
 features = {}
 for headers in dsff['data'].rows:
 for header in headers:
+ if header.value in exclude:
+ continue
 features[header.value] = ""
 break
 dsff.write(features=features)
@@ -22,14 +20,10 @@ def from_csv(dsff, path=None, exclude=DEFAULT_EXCL):
 
 def to_csv(dsff, path=None, text=False):
 """ Create a CSV from the data worksheet, saved as a file or output as a string. """
- path = splitext(expanduser(path or dsff.name))[0]
- if not path.endswith(".csv"):
- path += ".csv"
- dsff.logger.debug("extracting data from DSFF to CSV file...")
 with (StringIO() if text else open(path, 'w+')) as f:
 writer = csvmod.writer(f, delimiter=";")
- for cells in dsff['data'].rows:
- writer.writerow([dsff._DSFF__eval(c.value) for c in cells])
+ for row in dsff.data:
+ writer.writerow(row)
 if text:
 return f.getvalue()
 
@@ -6,31 +6,23 @@ __all__ = ["from_dataset", "to_dataset"]
 
 
 def from_dataset(dsff, path=None):
- """ Populate the DSFF file from an ARFF file. """
- path = expanduser(path or dsff.name)
- dsff.logger.debug("creating DSFF from (Fileless)Dataset folder...")
+ """ Populate the DSFF file from a Dataset structure. """
 if not isdir(path):
 raise BadInputData("Not a folder")
 else:
- missing = []
- for f in ["data.csv", "features.json", "metadata.json"]:
- if not isfile(join(path, f)):
- missing.append(f)
- if len(missing) > 0:
- raise BadInputData("Not a valid dataset folder (missing: %s)" % ", ".join(missing))
+ if len(missing := [f for f in ["data.csv", "features.json", "metadata.json"] if not isfile(join(path, f))]) > 0:
+ raise BadInputData(f"Not a valid dataset folder (missing: {', '.join(missing)})")
 dsff.write(path)
 
 
 def to_dataset(dsff, path=None):
- """ Create a dataset folder according to the Dataset structure ;
+ """ Create a dataset folder according to the following structure ;
 name
 +-- data.csv
 +-- features.json
 +-- metadata.json
 """
- path = splitext(expanduser(path or dsff.name))[0]
 makedirs(path, exist_ok=True)
- dsff.logger.debug("converting DSFF to (Fileless)Dataset folder...")
 # handle data first
 dsff.logger.debug("> making data.csv...")
 with open(join(path, "data.csv"), 'w+') as f:
dsff/formats/db.py ADDED
@@ -0,0 +1,76 @@
+ # -*- coding: UTF-8 -*-
+ from .__common__ import *
+
+
+ __all__ = ["from_db", "to_db"]
+
+
+ def from_db(dsff, path=None, exclude=DEFAULT_EXCL):
+     """ Populate the DSFF file from a SQLDB file. """
+     from json import loads
+     from sqlite3 import connect
+     conn = connect(path)
+     cursor = conn.cursor()
+     # list tables
+     cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+     tables = [table[0] for table in cursor.fetchall()]
+     if not all(t in tables for t in ["data", "features", "metadata"]): # pragma: no cover
+         raise BadInputData("The target SQLDB does not have the right format")
+     # import data
+     cursor.execute("PRAGMA table_info('data')")
+     headers = [[col[1] for col in cursor.fetchall()]]
+     cursor.execute("SELECT * FROM data;")
+     dsff.write(headers + [r for r in cursor.fetchall()])
+     # import feature definitions
+     cursor.execute("SELECT name,description FROM features;")
+     dsff.write(features={r[0]: r[1] for r in cursor.fetchall()})
+     # import metadata
+     cursor.execute("SELECT key,value FROM metadata;")
+     dsff.write(metadata={r[0]: loads(r[1]) if isinstance(r[1], str) else r[1] for r in cursor.fetchall()})
+     conn.close()
+
+
+ def to_db(dsff, path=None, text=False, primary_index=0):
+     """ Create a SQLDB from the data worksheet, saved as a file or output as a string. """
+     from json import dumps
+     from sqlite3 import connect
+     fields = []
+     rows = (data := dsff['data']).rows
+     headers, first = [c.value for c in next(rows)], next(rows)
+     for i, pair in enumerate(zip(headers, first)):
+         header, cell = pair
+         try:
+             dtype = {int: "INTEGER", float: "REAL", bool: "INTEGER"}[type(dsff._DSFF__eval(cell.value))]
+         except (KeyError, ValueError):
+             dtype = "TEXT"
+         fields.append(f"{header} {dtype}{['',' PRIMARY KEY'][i==primary_index]}")
+     # create the database
+     conn = connect(":memory:" if text else path)
+     cursor = conn.cursor()
+     # create and populate the data table
+     cursor.execute("CREATE TABLE IF NOT EXISTS data ({fields});" \
+                    .format(fields="\n ".join(f"{f}," for f in fields).rstrip(",")))
+     cursor.executemany("INSERT INTO data ({fields}) VALUES ({tokens});"\
+                        .format(fields=",".join(headers), tokens=",".join(["?"]*len(headers))),
+                        [[v.value for v in row] for i, row in enumerate(data.rows) if i > 0])
+     # create and populate the features table
+     cursor.execute("CREATE TABLE IF NOT EXISTS features (name TEXT PRIMARY KEY, description TEXT);")
+     cursor.executemany("INSERT INTO features (name, description) VALUES (?, ?);",
+                        [(r[0].value, r[1].value) for i, r in enumerate(dsff['features'].rows) if i > 0])
+     # create and populate the metadata table
+     cursor.execute("CREATE TABLE IF NOT EXISTS metadata (key TEXT PRIMARY KEY, value JSON);")
+     cursor.executemany("INSERT INTO metadata (key, value) VALUES (?, ?);",
+                        [(k, dumps(v)) for k, v in dsff.metadata.items()])
+     conn.commit()
+     if text:
+         sql = {}
+         # extract SQL code
+         cursor.execute(f"SELECT sql FROM sqlite_master WHERE type='table';")
+         sql['table'] = "\n".join(row[0] for row in cursor.fetchall())
+         for t in ["data", "features", "metadata"]:
+             cursor.execute(f"SELECT * FROM {t};")
+             sql[t] = "\n".join(f"INSERT INTO {t} VALUES ('{row[0]}', '{row[1]}');" for row in cursor.fetchall())
+         # combine all SQL
+         return "\n".join(sql.values())
+     conn.close()
+
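
For orientation, here is a minimal usage sketch of the DSFF class and the converters added in this release, based only on the code visible in the diff above; the top-level import, file names and sample values are assumptions, not documented behaviour.

from dsff import DSFF  # assumed entry point; the diff only shows the package internals

# hypothetical toy dataset: a header row followed by data rows
rows = [["name", "size", "label"],
        ["a.exe", 123, 0],
        ["b.exe", 456, 1]]

with DSFF("demo", mode="w+") as f:      # creates demo.dsff ('w+' allows both write and read)
    f.write(data=rows,
            features={"name": "file name", "size": "file size in bytes", "label": "ground truth"},
            metadata={"author": "someone"})
    f.to_csv()                          # exports demo.csv via the dynamically bound converter
    f.to_db()                           # exports demo.db (SQLite), see dsff/formats/db.py

The write/export methods are not declared on the class itself: from_* and to_* functions from the format modules are bound onto the instance in __init__ depending on the chosen mode, which is why the converters above appear only at runtime.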