dsff 1.0.7-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dsff/VERSION.txt CHANGED
@@ -1 +1 @@
- 1.0.7
+ 1.2.0
dsff/__info__.py CHANGED
@@ -1,12 +1,17 @@
  # -*- coding: UTF-8 -*-
- """AsciiStuff package information.
+ """DSFF package information.

  """
  import os
+ from datetime import datetime
+
+ __y = str(datetime.now().year)
+ __s = "2023"

  __author__ = "Alexandre D'Hondt"
- __copyright__ = "© 2023 A. D'Hondt"
+ __copyright__ = f"© {[__y,__s+'-'+__y][__y != __s]} A. D'Hondt"

  __license__ = "GPLv3+ (https://www.gnu.org/licenses/gpl-3.0.html)"

  with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f:
      __version__ = f.read().strip()
+
dsff/__init__.py CHANGED
@@ -1,296 +1,8 @@
  # -*- coding: UTF-8 -*-
- import builtins
- import logging
- import openpyxl.reader.excel as excelr
- import types
- from ast import literal_eval
- from datetime import datetime
- from functools import wraps
- from getpass import getuser
- from openpyxl import load_workbook, Workbook
- from openpyxl.styles import Alignment, Font
- from openpyxl.utils import get_column_letter
- from zipfile import BadZipFile, ZipFile
+ """DSFF package.

- from .__common__ import *
+ """
  from .__info__ import __author__, __copyright__, __license__, __version__
- from .arff import *
- from .csv import *
- from .dataset import *
-
-
- __all__ = ["DSFF"]
-
-
- for name, etype in [("BadDsffFile", "OSError"), ("BadInputData", "ValueError"), ("EmptyDsffFile", "ValueError")]:
-     if not hasattr(builtins, name):
-         exec("class %s(%s): __module__ = 'builtins'" % (name, etype))
-         setattr(builtins, name, locals()[name])
-
-
- def _bind_from(dsff):
-     def _adapt_name(f):
-         def _wrapper(path, *args, **kwargs):
-             r = f(dsff, path, *args, **kwargs)
-             if dsff.name == INMEMORY:
-                 dsff.name = splitext(basename(path))[0]
-             return r
-         setattr(dsff, f.__name__, _wrapper)
-         return _wrapper
-     return _adapt_name
-
-
- def _bind_to(dsff):
-     def _is_empty(f):
-         def _wrapper(*args, **kwargs):
-             if len(dsff) == 0:
-                 raise EmptyDsffFile("No data")
-             return f(dsff, *args, **kwargs)
-         setattr(dsff, f.__name__, _wrapper)
-         return _wrapper
-     return _is_empty
-
-
- class DSFF:
-     """ DataSet File Format.
-
-     Modes:    r  r+ w  w+
-     --------------------------
-     read      *  *     *
-     write        *  *  *
-     create          *  *
-     truncate        *  *
-     """
-     def __init__(self, path=None, mode=None, logger=None):
-         if mode is None:
-             mode = "rw"[path in [None, INMEMORY]]
-         if re.match(r"[rw]\+?$", mode) is None:
-             raise ValueError("Mode should be one of: r, r+, w, w+")
-         self.__change = False
-         self.__logger = logger or logging.getLogger("DSFF")
-         self.__name = None
-         self.__path = path
-         self.__mode = mode
-         # depending on the mode, bind the necessary methods
-         if mode in ["r+", "w", "w+"]:
-             self.save = types.MethodType(lambda dsff: dsff._DSFF__save(), self)
-             self.logger.debug("binding write methods")
-             for name, obj in globals().items():
-                 if name.startswith("from_"):
-                     _bind_from(self)(obj)
-         self.logger.debug("binding read methods")
-         for name, obj in globals().items():
-             if name.startswith("to_"):
-                 _bind_to(self)(obj)
-         # perform checks
-         if mode in ["r", "r+"]:
-             if path is None:
-                 raise ValueError("No input path to a .dsff file provided")
-             if path != INMEMORY and not isfile(path):
-                 raise FileNotFoundError("Input .dsff does not exist")
-         # if the target path exists and is a file, open it
-         if mode in ["r", "r+"] and path != INMEMORY:
-             # disable archive validation as it does not recognize '.dsff'
-             tmp = excelr._validate_archive
-             excelr._validate_archive = lambda f: ZipFile(f, 'r')
-             try:
-                 self.__wb = load_workbook(path)
-             except BadZipFile:
-                 raise BadDsffFile("File is not a DSFF file")
-             finally:
-                 excelr._validate_archive = tmp
-             # check that the file has only 2 worksheets: 'data' and 'features'
-             if [ws._WorkbookChild__title for ws in self.__wb.worksheets] != ["data", "features"]:
-                 raise BadDsffFile("File is not a DSFF file")
-             # check that the 'features' worksheet has 2 columns: 'name' and 'description'
-             for headers in self.__wb['features'].rows:
-                 if len(headers) != 2 or headers[0].value != "name" or headers[1].value != "description":
-                     raise BadDsffFile("The features worksheet does not comply with DSFF")
-                 break
-             return
-         # otherwise, create a new workbook with the default worksheets
-         if isfile(self.path):
-             remove(self.path)  # re-create
-         self.__wb = Workbook()
-         del self.__wb['Sheet']  # remove the default sheet
-         for ws in ["data", "features"]:
-             self.__wb.create_sheet(ws)
-
-     def __enter__(self):
-         return self
-
-     def __exit__(self, *args):
-         self.__save()
-         self.close()
-
-     def __getitem__(self, name):
-         if name in ["data", "features"]:
-             return self.__wb[name]
-         # the common property 'description' is used to store the metadata of the dataset, hence 'description' can be
-         # used as a key in the metadata but not from the common properties
-         try:
-             if name != "description":
-                 return getattr(self.__wb.properties, name)
-         except AttributeError:
-             pass
-         return self.metadata[name]
-
-     def __len__(self):
-         return len(self.data)
-
-     def __setitem__(self, name, value):
-         if name in ["data", "features"]:
-             raise ValueError("'%s' is a name reserved for a worksheet" % name)
-         # see the note from __getitem__ related to 'description'
-         if hasattr(self.__wb.properties, name) and name != "description":
-             setattr(self.__wb.properties, name, value)
-         d = self.metadata
-         d[name] = value
-         self.__wb.properties.description = json.dumps(d)
-         self.__change = True
-
-     def __eval(self, v):
-         try:
-             return literal_eval(v)
-         except (SyntaxError, ValueError):
-             return v
-
-     def __save(self):
-         if self.mode == "r" or self.path == INMEMORY:
-             return
-         if self.__change:
-             props = self.__wb.properties
-             if props.creator is None or props.creator == "openpyxl":
-                 props.creator = getuser()
-             props.title = self.name
-             props.description = self.metadata
-             if isfile(self.path) and self.mode.startswith("w"):
-                 remove(self.path)
-             self.__wb.save(self.path)
-             self.__change = False
-
-     def close(self):
-         self.__wb.close()
-
-     def write(self, data=None, features=None, metadata=None, missing="?"):
-         """ Write data and/or features and/or metadata to the workbook.
-
-         :param data:     matrix of data (including headers) OR path to data.csv OR path to Dataset folder
-         :param features: dictionary of features' names and descriptions OR path to features.json
-         :param metadata: dictionary of dataset's metadata OR path to metadata.json
-         """
-         # get the cell coordinate from (X,Y) coordinates (e.g. (1,2) => "B1")
-         coord = lambda x, y: ws.cell(x+1, y+1).coordinate
-         # private function to auto-adjust column widths
-         def autoadjust(ws):
-             col_widths = []
-             for row in ws.rows:
-                 if len(col_widths) == 0:
-                     col_widths = len(row) * [0]
-                 for i, cell in enumerate(row):
-                     col_widths[i] = max(col_widths[i], len(str(cell.value)))
-             for i, w in enumerate(col_widths):
-                 ws.column_dimensions[get_column_letter(i+1)].width = w
-         # if the first argument is a folder, assume it is a Dataset structure compliant with:
-         #  name
-         #   +-- data.csv
-         #   +-- features.json
-         #   +-- metadata.json
-         if data is not None and not isinstance(data, (list, dict)) and isdir(expanduser(data)):
-             self.__path, d = self.__path or basename(data), expanduser(data)
-             data, features, metadata = join(d, "data.csv"), join(d, "features.json"), join(d, "metadata.json")
-         # handle data first
-         if data is not None:
-             self.__logger.debug("writing data to DSFF...")
-             ws, d = self.__wb['data'], data
-             if not isinstance(d, list):
-                 if isfile(expanduser(d)) and splitext(d)[1] == ".csv":
-                     with open(expanduser(d)) as f:
-                         d = []
-                         for row in csvmod.reader(f, delimiter=CSV_DELIMITER):
-                             d.append(row)
-                 else:
-                     raise BadInputData("'data' is not a list")
-             for r, row in enumerate(d):
-                 for c, value in enumerate(row):
-                     c = coord(r, c)
-                     ws[c] = str({None: missing}.get(value, value))
-                     if r == 0:
-                         ws[c].alignment = Alignment(horizontal="center")
-                         ws[c].font = Font(bold=True)
-             autoadjust(ws)
-             self.__change = True
-         # then handle features dictionary
-         if features is not None:
-             self.__logger.debug("writing features to DSFF...")
-             ws, headers, d = self.__wb['features'], ["name", "description"], features
-             if not isinstance(d, dict):
-                 if isfile(expanduser(d)) and basename(d) == "features.json":
-                     with open(expanduser(d)) as f:
-                         d = json.load(f)
-                 else:
-                     raise BadInputData("'features' is not a dictionary")
-             try:
-                 for c, header in enumerate(headers):
-                     c = coord(0, c)
-                     ws[c] = header
-                     ws[c].alignment = Alignment(horizontal="center")
-                     ws[c].font = Font(bold=True)
-                 for r, pair in enumerate(d.items()):
-                     ws[coord(r+1, 0)] = pair[0]
-                     ws[coord(r+1, 1)] = pair[1]
-                 autoadjust(ws)
-                 self.__change = True
-             except Exception as e:
-                 raise BadInputData("Unexpected error while parsing 'features' (%s)" % e)
-         # finally handle metadata dictionary
-         if metadata is not None:
-             self.__logger.debug("writing metadata to DSFF...")
-             d = metadata
-             if not isinstance(d, dict):
-                 if isfile(expanduser(d)) and basename(d) == "metadata.json":
-                     with open(expanduser(d)) as f:
-                         d = json.load(f)
-                 else:
-                     raise BadInputData("'metadata' is not a dictionary")
-             try:
-                 self.__wb.properties.description = json.dumps(d)
-             except Exception as e:
-                 raise BadInputData("Unexpected error while parsing 'metadata' (%s)" % e)
-         self.__save()
-
-     @property
-     def data(self):
-         return [[self.__eval(c.value) for c in cells] for cells in self.__wb['data'].rows]
-
-     @property
-     def features(self):
-         return {cells[0].value: cells[1].value for i, cells in enumerate(self.__wb['features'].rows) if i > 0}
-
-     @property
-     def logger(self):
-         return self.__logger
-
-     @property
-     def metadata(self):
-         return json.loads((self.__wb.properties.description or "{}").replace("'", "\""))
-
-     @property
-     def mode(self):
-         return self.__mode
-
-     @property
-     def name(self):
-         return self.__name or self.__wb.properties.title or splitext(basename(self.path))[0]
-
-     @name.setter
-     def name(self, name):
-         self.__name = name
-
-     @property
-     def path(self):
-         p = self.__path or "undefined"
-         if p != INMEMORY and not p.endswith(".dsff"):
-             p += ".dsff"
-         return p
+ from .formats import *
+ from .formats import __all__
 
@@ -0,0 +1,33 @@
+ # -*- coding: UTF-8 -*-
+ import csv as csvmod
+ import json
+ import re
+ from ast import literal_eval
+ from io import BytesIO, StringIO
+ from os import makedirs, remove
+ from os.path import basename, expanduser, isfile, isdir, join, splitext
+ try:  # pragma: no cover
+     import pyarrow
+     import pyarrow.feather as feather
+     import pyarrow.orc as orc
+     import pyarrow.parquet as parquet
+     import pandas
+     PYARROW = True
+ except ImportError:  # pragma: no cover
+     PYARROW = False
+
+
+ __all__ = ["basename", "csvmod", "expanduser", "isfile", "isdir", "join", "json", "literal_eval", "makedirs",
+            "remove", "splitext", "re", "BytesIO", "StringIO",
+            "CSV_DELIMITER", "DEFAULT_EXCL", "INMEMORY", "META_EXCL", "MISSING_TOKEN", "PYARROW", "TARGET_NAME"]
+ if PYARROW:
+     __all__ += ["feather", "orc", "pandas", "pyarrow", "parquet"]
+
+
+ CSV_DELIMITER = ";"
+ DEFAULT_EXCL = ("hash", "realpath", "format", "size", "ctime", "mtime")  # origin: executables used in the Packing Box
+ INMEMORY = "<memory>"
+ META_EXCL = ["created", "modified", "revision"]
+ MISSING_TOKEN = "?"
+ TARGET_NAME = "label"
+