dsff 1.0.7__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- dsff/VERSION.txt +1 -1
- dsff/__info__.py +7 -2
- dsff/__init__.py +4 -292
- dsff/formats/__common__.py +33 -0
- dsff/formats/__init__.py +346 -0
- dsff/{arff.py → formats/arff.py} +51 -55
- dsff/{csv.py → formats/csv.py} +5 -11
- dsff/{dataset.py → formats/dataset.py} +4 -12
- dsff/formats/db.py +76 -0
- dsff/formats/pa.py +29 -0
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/METADATA +9 -10
- dsff-1.2.0.dist-info/RECORD +15 -0
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/WHEEL +1 -1
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info/licenses}/LICENSE +674 -674
- dsff/__common__.py +0 -20
- dsff-1.0.7.dist-info/RECORD +0 -12
- {dsff-1.0.7.dist-info → dsff-1.2.0.dist-info}/top_level.txt +0 -0
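The handler modules (arff, csv, dataset) move from the package root into a new dsff.formats subpackage, joined by new db and pa (pyarrow) handlers, while the top-level __init__.py shrinks to a thin re-export layer (see its hunk below). Assuming the public API is unchanged by the re-export, typical usage presumably still looks like this minimal sketch; the file path is a placeholder:

from dsff import DSFF   # in 1.2.0, presumably resolved through "from .formats import *"

with DSFF("dataset.dsff") as f:    # "dataset.dsff" is a hypothetical path
    print(f.name, len(f))          # dataset name and row count of the data worksheet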
dsff/VERSION.txt
CHANGED
@@ -1 +1 @@
-1.0.7
+1.2.0
dsff/__info__.py
CHANGED
@@ -1,12 +1,17 @@
 # -*- coding: UTF-8 -*-
-"""
+"""DSFF package information.
 
 """
 import os
+from datetime import datetime
+
+__y = str(datetime.now().year)
+__s = "2023"
 
 __author__ = "Alexandre D'Hondt"
-__copyright__ = "©
+__copyright__ = f"© {[__y,__s+'-'+__y][__y != __s]} A. D'Hondt"
 __license__ = "GPLv3+ (https://www.gnu.org/licenses/gpl-3.0.html)"
 
 with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f:
     __version__ = f.read().strip()
+
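The new copyright line builds the year range by indexing a two-element list with a boolean: when the current year __y differs from the start year __s it picks "2023-<current year>", otherwise just "2023". A standalone illustration of that idiom (not part of the package; the year values are made up):

# boolean indexing: False selects element 0, True selects element 1
__s, __y = "2023", "2026"
print([__y, __s + '-' + __y][__y != __s])   # -> 2023-2026
__s, __y = "2023", "2023"
print([__y, __s + '-' + __y][__y != __s])   # -> 2023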
dsff/__init__.py
CHANGED
@@ -1,296 +1,8 @@
 # -*- coding: UTF-8 -*-
-
-import logging
-import openpyxl.reader.excel as excelr
-import types
-from ast import literal_eval
-from datetime import datetime
-from functools import wraps
-from getpass import getuser
-from openpyxl import load_workbook, Workbook
-from openpyxl.styles import Alignment, Font
-from openpyxl.utils import get_column_letter
-from zipfile import BadZipFile, ZipFile
+"""DSFF package.
 
-
+"""
 from .__info__ import __author__, __copyright__, __license__, __version__
-from .
-from .
-from .dataset import *
-
-
-__all__ = ["DSFF"]
-
-
-for name, etype in [("BadDsffFile", "OSError"), ("BadInputData", "ValueError"), ("EmptyDsffFile", "ValueError")]:
-    if not hasattr(builtins, name):
-        exec("class %s(%s): __module__ = 'builtins'" % (name, etype))
-        setattr(builtins, name, locals()[name])
-
-
-def _bind_from(dsff):
-    def _adapt_name(f):
-        def _wrapper(path, *args, **kwargs):
-            r = f(dsff, path, *args, **kwargs)
-            if dsff.name == INMEMORY:
-                dsff.name = splitext(basename(path))[0]
-            return r
-        setattr(dsff, f.__name__, _wrapper)
-        return _wrapper
-    return _adapt_name
-
-
-def _bind_to(dsff):
-    def _is_empty(f):
-        def _wrapper(*args, **kwargs):
-            if len(dsff) == 0:
-                raise EmptyDsffFile("No data")
-            return f(dsff, *args, **kwargs)
-        setattr(dsff, f.__name__, _wrapper)
-        return _wrapper
-    return _is_empty
-
-
-class DSFF:
-    """ DataSet File Format.
-
-    Modes:    r  r+  w  w+
-    --------------------------
-    read      *  *      *
-    write        *   *  *
-    create           *  *
-    truncate         *  *
-    """
-    def __init__(self, path=None, mode=None, logger=None):
-        if mode is None:
-            mode = "rw"[path in [None, INMEMORY]]
-        if re.match(r"[rw]\+?$", mode) is None:
-            raise ValueError("Mode should be one of: r, r+, w, w+")
-        self.__change = False
-        self.__logger = logger or logging.getLogger("DSFF")
-        self.__name = None
-        self.__path = path
-        self.__mode = mode
-        # depending on the mode, bind the necessary methods
-        if mode in ["r+", "w", "w+"]:
-            self.save = types.MethodType(lambda dsff: dsff._DSFF__save(), self)
-            self.logger.debug("binding write methods")
-            for name, obj in globals().items():
-                if name.startswith("from_"):
-                    _bind_from(self)(obj)
-        self.logger.debug("binding read methods")
-        for name, obj in globals().items():
-            if name.startswith("to_"):
-                _bind_to(self)(obj)
-        # perform checks
-        if mode in ["r", "r+"]:
-            if path is None:
-                raise ValueError("No input path to a .dsff file provided")
-            if path != INMEMORY and not isfile(path):
-                raise FileNotFoundError("Input .dsff does not exist")
-        # if the target path exists and is a file, open it
-        if mode in ["r", "r+"] and path != INMEMORY:
-            # disable archive validation as it does not recognize '.dsff'
-            tmp = excelr._validate_archive
-            excelr._validate_archive = lambda f: ZipFile(f, 'r')
-            try:
-                self.__wb = load_workbook(path)
-            except BadZipFile:
-                raise BadDsffFile("File is not a DSFF file")
-            finally:
-                excelr._validate_archive = tmp
-            # check that the file has only 2 worksheets: 'data' and 'features'
-            if [ws._WorkbookChild__title for ws in self.__wb.worksheets] != ["data", "features"]:
-                raise BadDsffFile("File is not a DSFF file")
-            # check that the 'features' worksheet has 2 columns: 'name' and 'description'
-            for headers in self.__wb['features'].rows:
-                if len(headers) != 2 or headers[0].value != "name" or headers[1].value != "description":
-                    raise BadDsffFile("The features worksheet does not comply with DSFF")
-                break
-            return
-        # otherwise, create a new workbook with the default worksheets
-        if isfile(self.path):
-            remove(self.path)  # re-create
-        self.__wb = Workbook()
-        del self.__wb['Sheet']  # remove the default sheet
-        for ws in ["data", "features"]:
-            self.__wb.create_sheet(ws)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *args):
-        self.__save()
-        self.close()
-
-    def __getitem__(self, name):
-        if name in ["data", "features"]:
-            return self.__wb[name]
-        # the common property 'description' is used to store the metadata of the dataset, hence 'description' can be
-        #  used as a key in the metadata but not from the common properties
-        try:
-            if name != "description":
-                return getattr(self.__wb.properties, name)
-        except AttributeError:
-            pass
-        return self.metadata[name]
-
-    def __len__(self):
-        return len(self.data)
-
-    def __setitem__(self, name, value):
-        if name in ["data", "features"]:
-            raise ValueError("'%s' is a name reserved for a worksheet" % name)
-        # see the note from __getitem__ related to 'description'
-        if hasattr(self.__wb.properties, name) and name != "description":
-            setattr(self.__wb.properties, name, value)
-        d = self.metadata
-        d[name] = value
-        self.__wb.properties.description = json.dumps(d)
-        self.__change = True
-
-    def __eval(self, v):
-        try:
-            return literal_eval(v)
-        except (SyntaxError, ValueError):
-            return v
-
-    def __save(self):
-        if self.mode == "r" or self.path == INMEMORY:
-            return
-        if self.__change:
-            props = self.__wb.properties
-            if props.creator is None or props.creator == "openpyxl":
-                props.creator = getuser()
-            props.title = self.name
-            props.description = self.metadata
-            if isfile(self.path) and self.mode.startswith("w"):
-                remove(self.path)
-            self.__wb.save(self.path)
-            self.__change = False
-
-    def close(self):
-        self.__wb.close()
-
-    def write(self, data=None, features=None, metadata=None, missing="?"):
-        """ Write data and/or features and/or metadata to the workbook.
-
-        :param data:     matrix of data (including headers) OR path to data.csv OR path to Dataset folder
-        :param features: dictionary of features' names and descriptions OR path to features.json
-        :param metadata: dictionary of dataset's metadata OR path to metadata.json
-        """
-        # get the cell coordinate from (X,Y) coordinates (e.g. (1,2) => "B1")
-        coord = lambda x, y: ws.cell(x+1, y+1).coordinate
-        # private function to auto-adjust column widths
-        def autoadjust(ws):
-            col_widths = []
-            for row in ws.rows:
-                if len(col_widths) == 0:
-                    col_widths = len(row) * [0]
-                for i, cell in enumerate(row):
-                    col_widths[i] = max(col_widths[i], len(str(cell.value)))
-            for i, w in enumerate(col_widths):
-                ws.column_dimensions[get_column_letter(i+1)].width = w
-        # if the first argument is a folder, assume it is a Dataset structure compliant with:
-        #  name
-        #   +-- data.csv
-        #   +-- features.json
-        #   +-- metadata.json
-        if data is not None and not isinstance(data, (list, dict)) and isdir(expanduser(data)):
-            self.__path, d = self.__path or basename(data), expanduser(data)
-            data, features, metadata = join(d, "data.csv"), join(d, "features.json"), join(d, "metadata.json")
-        # handle data first
-        if data is not None:
-            self.__logger.debug("writing data to DSFF...")
-            ws, d = self.__wb['data'], data
-            if not isinstance(d, list):
-                if isfile(expanduser(d)) and splitext(d)[1] == ".csv":
-                    with open(expanduser(d)) as f:
-                        d = []
-                        for row in csvmod.reader(f, delimiter=CSV_DELIMITER):
-                            d.append(row)
-                else:
-                    raise BadInputData("'data' is not a list")
-            for r, row in enumerate(d):
-                for c, value in enumerate(row):
-                    c = coord(r, c)
-                    ws[c] = str({None: missing}.get(value, value))
-                    if r == 0:
-                        ws[c].alignment = Alignment(horizontal="center")
-                        ws[c].font = Font(bold=True)
-            autoadjust(ws)
-            self.__change = True
-        # then handle features dictionary
-        if features is not None:
-            self.__logger.debug("writing features to DSFF...")
-            ws, headers, d = self.__wb['features'], ["name", "description"], features
-            if not isinstance(d, dict):
-                if isfile(expanduser(d)) and basename(d) == "features.json":
-                    with open(expanduser(d)) as f:
-                        d = json.load(f)
-                else:
-                    raise BadInputData("'features' is not a dictionary")
-            try:
-                for c, header in enumerate(headers):
-                    c = coord(0, c)
-                    ws[c] = header
-                    ws[c].alignment = Alignment(horizontal="center")
-                    ws[c].font = Font(bold=True)
-                for r, pair in enumerate(d.items()):
-                    ws[coord(r+1, 0)] = pair[0]
-                    ws[coord(r+1, 1)] = pair[1]
-                autoadjust(ws)
-                self.__change = True
-            except Exception as e:
-                raise BadInputData("Unexpected error while parsing 'features' (%s)" % e)
-        # finally handle metadata dictionary
-        if metadata is not None:
-            self.__logger.debug("writing metadata to DSFF...")
-            d = metadata
-            if not isinstance(d, dict):
-                if isfile(expanduser(d)) and basename(d) == "metadata.json":
-                    with open(expanduser(d)) as f:
-                        d = json.load(f)
-                else:
-                    raise BadInputData("'metadata' is not a dictionary")
-            try:
-                self.__wb.properties.description = json.dumps(d)
-            except Exception as e:
-                raise BadInputData("Unexpected error while parsing 'metadata' (%s)" % e)
-        self.__save()
-
-    @property
-    def data(self):
-        return [[self.__eval(c.value) for c in cells] for cells in self.__wb['data'].rows]
-
-    @property
-    def features(self):
-        return {cells[0].value: cells[1].value for i, cells in enumerate(self.__wb['features'].rows) if i > 0}
-
-    @property
-    def logger(self):
-        return self.__logger
-
-    @property
-    def metadata(self):
-        return json.loads((self.__wb.properties.description or "{}").replace("'", "\""))
-
-    @property
-    def mode(self):
-        return self.__mode
-
-    @property
-    def name(self):
-        return self.__name or self.__wb.properties.title or splitext(basename(self.path))[0]
-
-    @name.setter
-    def name(self, name):
-        self.__name = name
-
-    @property
-    def path(self):
-        p = self.__path or "undefined"
-        if p != INMEMORY and not p.endswith(".dsff"):
-            p += ".dsff"
-        return p
+from .formats import *
+from .formats import __all__
 
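The 292 removed lines (the builtin exception registration, the _bind_from/_bind_to decorators and the whole DSFF class) presumably now live under dsff.formats, whose new __init__.py weighs in at 346 lines, while the package root only re-exports that subpackage's public names. For reference, the dynamic binding the old module performed, attaching every module-level from_*/to_* function as an instance method, boils down to the pattern below; this is a simplified, runnable sketch with made-up names, not the dsff implementation:

class Box:
    def __init__(self, items=None):
        self.items = items or []
        # attach every module-level "to_*" function as a bound method of this instance,
        # much like the removed _bind_to() decorator did for DSFF
        for name, obj in globals().items():
            if callable(obj) and name.startswith("to_"):
                setattr(self, name, (lambda f: (lambda *a, **kw: f(self, *a, **kw)))(obj))

def to_csv(box, sep=";"):
    return sep.join(map(str, box.items))

print(Box([1, 2, 3]).to_csv())   # -> 1;2;3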
dsff/formats/__common__.py
ADDED
@@ -0,0 +1,33 @@
+# -*- coding: UTF-8 -*-
+import csv as csvmod
+import json
+import re
+from ast import literal_eval
+from io import BytesIO, StringIO
+from os import makedirs, remove
+from os.path import basename, expanduser, isfile, isdir, join, splitext
+try:  # pragma: no cover
+    import pyarrow
+    import pyarrow.feather as feather
+    import pyarrow.orc as orc
+    import pyarrow.parquet as parquet
+    import pandas
+    PYARROW = True
+except ImportError:  # pragma: no cover
+    PYARROW = False
+
+
+__all__ = ["basename", "csvmod", "expanduser", "isfile", "isdir", "join", "json", "literal_eval", "makedirs",
+           "remove", "splitext", "re", "BytesIO", "StringIO",
+           "CSV_DELIMITER", "DEFAULT_EXCL", "INMEMORY", "META_EXCL", "MISSING_TOKEN", "PYARROW", "TARGET_NAME"]
+if PYARROW:
+    __all__ += ["feather", "orc", "pandas", "pyarrow", "parquet"]
+
+
+CSV_DELIMITER = ";"
+DEFAULT_EXCL = ("hash", "realpath", "format", "size", "ctime", "mtime")  # origin: executables used in the Packing Box
+INMEMORY = "<memory>"
+META_EXCL = ["created", "modified", "revision"]
+MISSING_TOKEN = "?"
+TARGET_NAME = "label"
+
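The new __common__.py centralizes the shared imports and constants for the format handlers and probes for pyarrow/pandas once, exposing the PYARROW flag so that the new Feather/ORC/Parquet support can degrade gracefully when the optional dependencies are missing. A hedged sketch of how such a flag is typically consumed (illustrative only, not the actual dsff/formats code; the helper name and sample data are made up):

try:  # probe once, mirroring the pattern above
    import pandas
    import pyarrow
    PYARROW = True
except ImportError:
    PYARROW = False

def to_parquet_sketch(rows, path="out.parquet"):
    # hypothetical helper: write a header+rows matrix to Parquet only when pyarrow is available
    if not PYARROW:
        raise RuntimeError("pyarrow and pandas are required for Parquet output")
    pandas.DataFrame(rows[1:], columns=rows[0]).to_parquet(path)

if PYARROW:
    to_parquet_sketch([["x", "label"], [1, "a"], [2, "b"]])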