dcicutils 7.11.0.1b9__py3-none-any.whl → 7.12.0.1b4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of dcicutils might be problematic. Click here for more details.
- dcicutils/glacier_utils.py +16 -4
- dcicutils/license_utils.py +677 -165
- dcicutils/misc_utils.py +9 -97
- dcicutils/scripts/run_license_checker.py +77 -0
- {dcicutils-7.11.0.1b9.dist-info → dcicutils-7.12.0.1b4.dist-info}/METADATA +1 -3
- {dcicutils-7.11.0.1b9.dist-info → dcicutils-7.12.0.1b4.dist-info}/RECORD +9 -9
- {dcicutils-7.11.0.1b9.dist-info → dcicutils-7.12.0.1b4.dist-info}/entry_points.txt +1 -0
- dcicutils/sheet_utils.py +0 -1131
- {dcicutils-7.11.0.1b9.dist-info → dcicutils-7.12.0.1b4.dist-info}/LICENSE.txt +0 -0
- {dcicutils-7.11.0.1b9.dist-info → dcicutils-7.12.0.1b4.dist-info}/WHEEL +0 -0
dcicutils/sheet_utils.py
DELETED
@@ -1,1131 +0,0 @@
|
|
1
|
-
import chardet
|
2
|
-
import contextlib
|
3
|
-
import copy
|
4
|
-
import csv
|
5
|
-
import glob
|
6
|
-
import io
|
7
|
-
import json
|
8
|
-
import openpyxl
|
9
|
-
import os
|
10
|
-
import re
|
11
|
-
import subprocess
|
12
|
-
import uuid
|
13
|
-
import yaml
|
14
|
-
|
15
|
-
from openpyxl.worksheet.worksheet import Worksheet
|
16
|
-
from openpyxl.workbook.workbook import Workbook
|
17
|
-
from tempfile import TemporaryFile, TemporaryDirectory
|
18
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
|
19
|
-
from .common import AnyJsonData
|
20
|
-
from .env_utils import public_env_name, EnvUtils
|
21
|
-
from .ff_utils import get_schema
|
22
|
-
from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are
|
23
|
-
from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix
|
24
|
-
from .task_utils import pmap
|
25
|
-
|
26
|
-
|
27
|
-
# Type aliases shared by the table-loading machinery in this module.
Header = str  # the text at the top of a column, e.g. "a.b#0"
Headers = List[str]
ParsedHeader = List[Union[str, int]]  # a header split at '.' and '#', e.g. ["a", "b", 0]
ParsedHeaders = List[ParsedHeader]
SheetCellValue = Union[int, float, str]
SheetRow = List[SheetCellValue]
# csv.reader accepts any iterable of lines, so an empty list is enough to obtain the reader type.
# (Opening a TemporaryFile just for this would create a file at import time that is never closed.)
CsvReader = type(csv.reader([]))
SheetData = List[dict]
TabbedSheetData = Dict[str, SheetData]
Regexp = type(re.compile("sample"))
|
37
|
-
|
38
|
-
|
39
|
-
class LoadFailure(Exception):
    """
    Base class for errors that abort a load outright.

    Normally we would rather load a spreadsheet even if its data is clumsy, so that the data can be
    validated in detail afterward. Some errors, though, are so confusing or so problematic that
    the load has to fail right away; those are reported with this class or one of its subclasses.
    """
|
45
|
-
|
46
|
-
|
47
|
-
class LoadArgumentsError(LoadFailure):
    """
    A LoadFailure for situations where loading cannot even get started
    because something is wrong with the arguments that were given.
    """
|
53
|
-
|
54
|
-
|
55
|
-
class LoadTableError(LoadFailure):
    """
    A LoadFailure for situations where loading cannot get started because
    some table is syntactically broken, for example its headers don't make sense.
    """
|
61
|
-
|
62
|
-
|
63
|
-
@contextlib.contextmanager
def deferred_problems():
    """
    Context manager that collects problems during its body and reports them all at the end.

    The value bound by 'with ... as' is a one-argument function; call it with each problem to record.
    When the body finishes normally and at least one problem was recorded, every problem is printed
    (via PRINT) and a single Exception summarizing them is raised. If nothing was recorded,
    the context exits quietly.
    """
    noted = []

    def record(problem):
        noted.append(problem)

    yield record

    if not noted:
        return
    for item in noted:
        PRINT(f"Problem: {item}")
    raise Exception(there_are(noted, kind='problem while compiling hints', tense='past', show=False))
|
76
|
-
|
77
|
-
|
78
|
-
def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False):
    """
    Raises LoadArgumentsError if kwargs contains any keyword argument whose value is not None.

    :param context: text naming whoever received the arguments; it starts the error message
    :param kwargs: a dictionary of leftover keyword arguments (may be empty or None)
    :param context_plural: if true, the message says "don't" rather than "doesn't"
    :param detailed: if true, each offender is shown as name=value rather than just its name
    """
    if not kwargs:
        return
    unwanted = []
    for argname, value in kwargs.items():
        if value is None:
            continue  # an argument explicitly passed as None is treated as not passed
        unwanted.append(f"{argname}={value!r}" if detailed else argname)
    if not unwanted:
        return
    does_not = "don't" if context_plural else "doesn't"
    raise LoadArgumentsError(f"{context} {does_not} use"
                             f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")
|
87
|
-
|
88
|
-
|
89
|
-
def prefer_number(value: SheetCellValue):
    """
    Returns a numeric interpretation of the given value when it is a string that looks like a number.

    * A non-string value (e.g., something that is already an int or float) is returned unchanged.
    * The empty string becomes None.
    * A string starting with '+', '-' or a digit is parsed as an int if possible, otherwise as a float.
    * Any other string, or one that fails to parse, is returned unchanged.

    :param value: a cell value
    :return: an int, a float, None, or the original value
    """
    if not isinstance(value, str):  # the given value might be an int or float, in which case just return it
        return value
    if not value:
        return None
    ch0 = value[0]
    if ch0 == '+' or ch0 == '-' or ch0.isdigit():
        # Try the most specific numeric type first. Only ValueError signals "not a number of this kind";
        # anything else would be a genuine bug and should not be swallowed.
        for parse in (int, float):
            try:
                return parse(value)
            except ValueError:
                continue
    # If we couldn't parse it as an int or float, return the original value
    return value
|
107
|
-
|
108
|
-
|
109
|
-
def expand_string_escape_sequences(text: str) -> str:
    r"""
    Expands backslash escape sequences that are spelled out literally in the given text.

    The sequences \r, \t, \n and \\ are replaced by a carriage return, a tab, a newline and a single
    backslash, respectively. Any other backslash sequence is left exactly as written, and so is a
    lone backslash at the very end of the text.

    :param text: the text possibly containing escape sequences
    :return: the text with the recognized sequences expanded
    """
    expansions = {'r': '\r', 't': '\t', 'n': '\n', '\\': '\\'}
    s = io.StringIO()
    escaping = False
    for ch in text:
        if escaping:
            # Rather than err, just leave unrecognized sequences as-is.
            s.write(expansions.get(ch, f"\\{ch}"))
            escaping = False
        elif ch == '\\':
            escaping = True
        else:
            s.write(ch)
    if escaping:
        # The text ended with a backslash that had nothing to escape. Keep it rather than silently drop it.
        s.write('\\')
    return s.getvalue()
|
131
|
-
|
132
|
-
|
133
|
-
def open_unicode_text_input_file_respecting_byte_order_mark(filename):
    """
    Opens a file for text input, respecting a byte-order mark (BOM).

    The start of the file is sampled in binary mode and handed to chardet to guess the encoding.
    The file is then reopened in text mode with that encoding. If the guess is 'ascii', or no
    encoding could be determined at all, 'utf-8' is used instead.

    :param filename: the name of the file to open
    :return: an open text-mode file object, which the caller is responsible for closing
    """
    with io.open(filename, 'rb') as fp:
        leading_bytes = fp.read(4 * 8)  # 32 bytes, which comfortably covers a BOM (at most 4 bytes)
    bom_info = chardet.detect(leading_bytes, should_rename_legacy=True)
    detected_encoding = bom_info and bom_info.get('encoding')  # tread lightly
    # Without this fallback a missing encoding would reach io.open as encoding=None,
    # which silently means "whatever the locale says" and so varies from machine to machine.
    if not detected_encoding or detected_encoding == 'ascii':
        use_encoding = 'utf-8'
    else:
        use_encoding = detected_encoding
    return io.open(filename, 'r', encoding=use_encoding)
|
143
|
-
|
144
|
-
|
145
|
-
class TypeHint:
    """
    Base class for hints that may adjust a parsed cell value.

    The base implementation leaves every value alone; subclasses override apply_hint.
    """

    def apply_hint(self, value):
        """Returns the value to use in place of the given value (here, the value itself)."""
        return value

    def __str__(self):
        return f"<{type(self).__name__}>"

    def __repr__(self):
        # Delegate dynamically so that a subclass overriding __str__ gets a matching repr.
        return str(self)
|
154
|
-
|
155
|
-
|
156
|
-
class BoolHint(TypeHint):
    """
    A hint that turns a non-empty string into a boolean when it is a case-insensitive
    prefix of 'true' or of 'false' (so "T", "tr" and "TRUE" all yield True).
    Anything else is passed to the parent class.
    """

    def apply_hint(self, value):
        if isinstance(value, str) and value:
            lowered = value.lower()
            # 'true' is tested first, matching the precedence of the two candidate words.
            for word, result in (('true', True), ('false', False)):
                if word.startswith(lowered):
                    return result
        return super().apply_hint(value)
|
165
|
-
|
166
|
-
|
167
|
-
class EnumHint(TypeHint):
    """
    A hint that maps a string to one of a set of allowed values.

    The value_map is a dictionary from lookup key to the value to return. A string that is exactly
    a key yields that key's value. Otherwise, the lowercased string is compared against the keys as
    a prefix, and if exactly one key starts with it, that key's value is returned.
    Anything else is passed to the parent class.
    """

    def __init__(self, value_map):
        self.value_map = value_map

    def __str__(self):
        pairs = ','.join(f'{key}={val}' for key, val in self.value_map.items())
        return f"<EnumHint {pairs}>"

    def apply_hint(self, value):
        if isinstance(value, str):
            table = self.value_map
            if value in table:
                return table[value]
            prefix = value.lower()
            candidates = [known for known in table if known.startswith(prefix)]
            if len(candidates) == 1:  # an ambiguous or unmatched prefix is not resolved here
                return table[candidates[0]]
        return super().apply_hint(value)
|
191
|
-
|
192
|
-
|
193
|
-
# A list with one entry per column: either a TypeHint to apply to that column's values, or None.
OptionalTypeHints = List[Optional[TypeHint]]
|
194
|
-
|
195
|
-
|
196
|
-
class ItemTools:
    """
    Implements operations on table-related data without pre-supposing the specific representation of the table.
    It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
    it does not presuppose the source of the data nor where it will be written to.

    For the purpose of this class:

    * a 'header' is a string representing the top of a column.

    * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
      "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
      each numeric token as an int instead of a string.

    * a 'headers' object is just a list of strings, each of which is a 'header'.

    * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
      e.g., the headers ["a.b.c", "x.y#0"] are represented as parsed headers [["a", "b", "c"], ["x", "y", 0]].

    """

    @classmethod
    def parse_sheet_header(cls, header: Header) -> ParsedHeader:
        """
        Splits a header at each '.' or '#', dropping empty tokens and turning all-digit tokens into ints.
        For example, "x.y#0" becomes ["x", "y", 0].
        """

        def as_key(token):
            return int(token) if token.isdigit() else token

        result = []
        token = ""
        for ch in header:
            if ch == '.' or ch == '#':
                if token:
                    result.append(as_key(token))
                    token = ""
            else:
                token += ch
        if token:
            result.append(as_key(token))
        return result

    @classmethod
    def parse_sheet_headers(cls, headers: Headers):
        """Returns the list of parsed headers corresponding to the given list of headers."""
        return [cls.parse_sheet_header(header)
                for header in headers]

    @classmethod
    def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
        """
        Builds an empty nested structure (dicts, lists, and None leaves) with a slot for every parsed header.

        :raises LoadTableError: if a header begins with a number or list indices are not sequential
        """
        prototype = {}
        for parsed_header in parsed_headers:
            parsed_header0 = parsed_header[0]
            if isinstance(parsed_header0, int):
                raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}")
            cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
        return prototype

    @classmethod
    def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
        """
        Makes sure parent has a slot for the first of the given keys, then recurs for the remaining keys.

        The kind of placeholder stored in the slot depends on the key that follows it:
        a list if that key is an int, a dict if it is a string, and None if there is no next key.

        :return: the (mutated) parent
        :raises LoadTableError: if an int key would leave a gap in the parent list
        """
        [key0, *more_keys] = keys
        key1 = more_keys[0] if more_keys else None
        if isinstance(key1, int):
            placeholder = []
        elif isinstance(key1, str):
            placeholder = {}
        else:
            placeholder = None
        if isinstance(key0, int):
            n = len(parent)
            if key0 == n:
                parent.append(placeholder)
            elif key0 > n:
                raise LoadTableError("Numeric items must occur sequentially.")
        elif isinstance(key0, str):
            if key0 not in parent:
                parent[key0] = placeholder
        if key1 is not None:
            cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
        return parent

    INSTAGUIDS_ENABLED = False  # Experimental feature not enabled by default

    @classmethod
    def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData:
        """
        Heuristically converts a cell value into JSON data.

        For strings: 'true' and 'false' (in any case) become booleans; 'null' and '' become None;
        a value containing '|' becomes a list of recursively parsed parts ('|' alone is [], and a trailing
        '|' is ignored so that 'foo|' is ['foo']); otherwise a number is preferred if the string looks like one.
        Non-string values are returned unchanged.
        """
        # TODO: Remodularize this for easier testing and more Schema-driven effect
        # Doug asks that this be broken up into different mechanisms, more modular and separately testable.
        # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired.
        if isinstance(value, str):
            lvalue = value.lower()
            # TODO: We could consult a schema to make this less heuristic, but this may do for now
            if lvalue == 'true':
                return True
            elif lvalue == 'false':
                return False
            elif lvalue == 'null' or lvalue == '':
                return None
            elif '|' in value:
                if value == '|':  # Use '|' for []
                    return []
                else:
                    if value.endswith("|"):  # Use 'foo|' for ['foo']
                        value = value[:-1]
                    return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')]
            elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'):
                # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid
                return cls.get_instaguid(value, context=context)
            else:
                # Doug points out that the schema might not agree, might want a string representation of a number.
                # At this semantic layer, this might be a bad choice.
                return prefer_number(value)
        else:  # presumably a number (int or float)
            return value

    @classmethod
    def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None):
        """
        Returns the uuid string that the context associates with the given placeholder,
        creating and remembering a new one if needed. With no context, the placeholder itself is returned.
        """
        if context is None:
            return guid_placeholder
        else:
            referent = context.get(guid_placeholder)
            if not referent:
                context[guid_placeholder] = referent = str(uuid.uuid4())
            return referent

    @classmethod
    def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
        """
        Stores value in datum at the location named by path, which must already exist down to its last key.

        :param datum: the nested dict/list structure to modify
        :param path: the keys leading to the location to set
        :param value: the value to store
        :param force: unless true, a value of None or '' is not stored at all
        """
        if (value is None or value == '') and not force:
            return
        [key, *more_path] = path
        if not more_path:
            datum[key] = value
        else:
            # The force flag must travel with the recursion, or a forced None/'' would be dropped at depth.
            cls.set_path_value(datum[key], more_path, value, force=force)

    @classmethod
    def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any):
        """
        Walks the schema's 'properties' along the parsed header and returns a hint for the property found:
        an EnumHint for a string property with an enum, a BoolHint for a boolean property, and otherwise None.
        Only string keys through schemas of type 'object' are followed.
        """

        def finder(subheader, subschema):
            if not subheader:
                return None
            [key1, *other_headers] = subheader
            if isinstance(key1, str) and isinstance(subschema, dict):
                if subschema.get('type') == 'object':
                    def1 = subschema.get('properties', {}).get(key1)
                    if other_headers:
                        return finder(subheader=other_headers, subschema=def1)
                    if def1 is not None:
                        t = def1.get('type')
                        if t == 'string':
                            enum = def1.get('enum')
                            if enum:
                                mapping = {e.lower(): e for e in enum}
                                return EnumHint(mapping)
                        elif t == 'boolean':
                            return BoolHint()
            return None  # no hint applies

        return finder(subheader=parsed_header, subschema=schema)

    @classmethod
    def infer_tab_name(cls, filename):
        """Returns the part of the file's base name that precedes its first dot."""
        return os.path.basename(filename).split('.')[0]
|
357
|
-
|
358
|
-
|
359
|
-
# TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
|
360
|
-
#
|
361
|
-
# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
|
362
|
-
# I am less certain but open to discussion. Among other things, as implemented now,
|
363
|
-
# the __init__ method here needs to run and the documentation says that ABC's won't appear
|
364
|
-
# in the method resolution order. -kmp 17-Aug-2023
|
365
|
-
# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
|
366
|
-
class AbstractTableSetManager:
    """
    The TableSetManager is the spanning class of anything that wants to be able to load a table set,
    regardless of what it wants to load it from. To do this, it must support a load method
    that takes a filename and returns the file content in the form:
        {
            "Sheet1": [
                {...representation of row1 as some kind of dict...},
                {...representation of row2 as some kind of dict...}
            ],
            "Sheet2": [...],
            ...,
        }
    It also needs some implementation of the .tab_names property.
    Note that at this level of abstraction, we take no position on what form of representation is used
    for the rows, as long as it is JSON data of some kind. It might be
         {"col1": "val1", "col2": "val2", ...}
    or it might be something more structured like
         {"something": "val1", "nested": {"something_else": ["val2"]}}
    Additionally, the values stored might be altered as well. In particular, the most likely alteration
    is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations
    happen is not constrained by this class.
    """

    # The filename suffixes (such as '.xlsx') that a given subclass is willing to load.
    ALLOWED_FILE_EXTENSIONS: List[str] = []

    def __init__(self, filename: str, **kwargs):
        """
        :param filename: the name of the file to be loaded
        :param kwargs: must be empty (or all None) by the time it arrives here,
            since any keyword argument that no subclass consumed is reported as an error
        """
        self.filename: str = filename
        unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs)

    # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
    @classmethod
    def load(cls, filename: str, **kwargs) -> TabbedSheetData:
        """
        Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
        For more information, see documentation of AbstractTableSetManager.
        """
        raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.")  # noQA

    @property
    def tab_names(self) -> List[str]:
        """The names of the tabs in the table set. Subclasses must implement this."""
        raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}.")  # noQA

    def load_content(self) -> Any:
        """Loads and returns the content of self.filename. Subclasses must implement this."""
        raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.")  # noQA
|
411
|
-
|
412
|
-
|
413
|
-
class BasicTableSetManager(AbstractTableSetManager):
    """
    A BasicTableSetManager supplies the structure that most kinds of parsers need: a place to keep
    the headers of each tab and a place to keep the content of each tab. Even a csv file, which has
    no tabs, fits this model as the degenerate case of a single set of headers and a single block of content.
    """

    def __init__(self, filename: str, **kwargs):
        super().__init__(filename=filename, **kwargs)
        self.headers_by_tab_name: Dict[str, Headers] = {}
        self.content_by_tab_name: Dict[str, SheetData] = {}
        # Obtained last, after self.filename and the tables above are in place.
        self.reader_agent: Any = self._get_reader_agent()

    def tab_headers(self, tab_name: str) -> Headers:
        """Returns the headers recorded for the named tab (KeyError if none were recorded)."""
        return self.headers_by_tab_name[tab_name]

    def tab_content(self, tab_name: str) -> List[AnyJsonData]:
        """Returns the content recorded for the named tab (KeyError if none was recorded)."""
        return self.content_by_tab_name[tab_name]

    @classmethod
    def _create_tab_processor_state(cls, tab_name: str) -> Any:
        """
        Creates whatever auxiliary state a parser wants to carry along as it goes from line to line
        of a tab (parsed headers, a line count, a table of temporary names for cross-linking objects,
        or the like). Nothing is needed at this level, so None is returned; subclasses may do more.
        """
        ignored(tab_name)  # subclasses might need this, but we don't
        return None

    def _get_reader_agent(self) -> Any:
        """This function is responsible for opening the workbook and returning a workbook object."""
        raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.")  # noQA
|
447
|
-
|
448
|
-
|
449
|
-
class SemanticTableSetManager(BasicTableSetManager):
    """
    This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing.
    Those may be:
    * Excel workbook readers (.xlsx)
    * Comma-separated file readers (.csv)
    * Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright
      refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
    There are two levels to each of these: a class that is not semantically interpreted,
    and a class that is semantically interpreted as an "item".

    This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing
    were already done (in part so that they can be used to test the results of other formats):
    * Json files
    * Yaml files
    * Inserts directories
    * JsonLines files
    """

    @classmethod
    def load(cls, filename: str, **kwargs) -> AnyJsonData:
        """
        Creates a manager of this class for the given filename and returns its loaded content.

        :raises LoadArgumentsError: if the class restricts file extensions and the filename
            (compared case-insensitively) ends with none of them
        """
        if cls.ALLOWED_FILE_EXTENSIONS:
            if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
                raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
                                         f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")

        table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs)
        return table_set_manager.load_content()

    def __init__(self, filename: str, **kwargs):
        super().__init__(filename=filename, **kwargs)

    def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
        """
        Given a tab_name, returns a generator for the raw row values of that tab.
        """
        # The message names this method (it used to name a method that does not exist).
        raise NotImplementedError(f"._raw_row_generator_for_tab_name(...)"
                                  f" is not implemented for {self.__class__.__name__}.")  # noQA

    def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
        """
        This takes the state created for the tab and whatever represents a raw row,
        and must return the processed representation of that row.
        What constitutes processing is up to the class, but the result must be a JSON dictionary.
        """
        raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")  # noQA

    def load_content(self) -> AnyJsonData:
        """
        Processes every row of every tab, recording each tab's rows in self.content_by_tab_name,
        and returns that dictionary.
        """
        for tab_name in self.tab_names:
            sheet_content = []
            state = self._create_tab_processor_state(tab_name)
            for row_data in self._raw_row_generator_for_tab_name(tab_name):
                processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data)
                sheet_content.append(processed_row_data)
            self.content_by_tab_name[tab_name] = sheet_content
        return self.content_by_tab_name

    @classmethod
    def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
        """Returns the value to store for a raw cell value (a number if the value looks like one)."""
        return prefer_number(value)
|
508
|
-
|
509
|
-
|
510
|
-
class AbstractItemManager(AbstractTableSetManager):
    """
    Marker class for table set managers whose rows are handled as items.
    It adds no behavior of its own.
    """
|
513
|
-
|
514
|
-
|
515
|
-
class TableSetManagerRegistry:
    """
    A table that knows which manager class handles which kind of file.

    Classes are registered by the file extensions they declare in ALLOWED_FILE_EXTENSIONS and,
    optionally, by a regular expression for special filenames that have no extension.
    """

    def __init__(self):
        self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {}
        self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = []

    def register(self, regexp: Optional[str] = None):
        """
        Returns a class decorator that registers the decorated class in this registry.

        :param regexp: if given, filenames matching this pattern are also handled by the class
        :raises Exception: (when the decorator is applied) if an extension is already claimed
        """
        def _wrapped_register(class_to_register: Type[AbstractTableSetManager]):
            if regexp:
                self.regexp_mappings.append((re.compile(regexp), class_to_register))
            for ext in class_to_register.ALLOWED_FILE_EXTENSIONS:
                existing = self.manager_table.get(ext)
                if existing:
                    raise Exception(f"Tried to define {class_to_register} to extension {ext},"
                                    f" but {existing} already claimed that.")
                self.manager_table[ext] = class_to_register
            return class_to_register
        return _wrapped_register

    register1 = register

    def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]:
        """
        Returns the manager class registered for the given filename.

        Every dotted suffix of the base name is tried, longest first (so 'x.tsv.txt' is tried as
        '.tsv.txt' before '.txt'). A base name with no dot at all is matched against the registered
        regular expressions instead.

        :raises LoadArgumentsError: if no registered class handles the filename
        """
        base: str = os.path.basename(filename)
        suffix_parts = base.split('.')[1:]
        if suffix_parts:
            for i in range(len(suffix_parts)):
                suffix = f".{'.'.join(suffix_parts[i:])}"
                # Try the suffix as written, then lowercased: SemanticTableSetManager.load compares
                # extensions case-insensitively, so 'DATA.XLSX' should find its manager here, too.
                found: Optional[Type[AbstractTableSetManager]] = (self.manager_table.get(suffix)
                                                                  or self.manager_table.get(suffix.lower()))
                if found:
                    return found
        else:
            special_case: Optional[Type[AbstractTableSetManager]] = self.manager_for_special_filename(filename)
            if special_case:
                return special_case
        raise LoadArgumentsError(f"Unknown file type: {filename}")

    def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]:
        """Returns the class of the first registered pattern that matches the filename, or None."""
        for pattern, manager_class in self.regexp_mappings:
            if pattern.match(filename):
                return manager_class
        return None
|
556
|
-
|
557
|
-
|
558
|
-
# Two separate registries, each populated by using its .register() as a class decorator:
# one for managers that read plain table rows, and one for managers that read rows as items.
TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry()
ITEM_MANAGER_REGISTRY = TableSetManagerRegistry()
|
560
|
-
|
561
|
-
|
562
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class XlsxManager(SemanticTableSetManager):
    """
    Reads the sheets of an XLSX file, producing one dictionary per data row
    that maps each column header to that row's parsed cell value.
    """

    ALLOWED_FILE_EXTENSIONS = ['.xlsx']

    @classmethod
    def _all_rows(cls, sheet: Worksheet):
        # Row 1 holds the headers, so the data rows are 2 through the sheet's last row.
        yield from range(2, sheet.max_row + 1)

    @classmethod
    def _all_cols(cls, sheet: Worksheet):
        # Columns are numbered from 1.
        yield from range(1, sheet.max_column + 1)

    @property
    def tab_names(self) -> List[str]:
        workbook = self.reader_agent
        return workbook.sheetnames

    def _get_reader_agent(self) -> Workbook:
        return openpyxl.load_workbook(self.filename)

    def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
        sheet = self.reader_agent[tab_name]
        return (self._get_raw_row_content_tuple(sheet, row_number)
                for row_number in self._all_rows(sheet))

    def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
        cells = []
        for column_number in self._all_cols(sheet):
            cells.append(sheet.cell(row=row, column=column_number).value)
        return cells

    def _create_tab_processor_state(self, tab_name: str) -> Headers:
        # The state for a plain XLSX tab is just its list of headers, read from the first row.
        sheet = self.reader_agent[tab_name]
        headers: Headers = []
        for column_number in self._all_cols(sheet):
            headers.append(str(sheet.cell(row=1, column=column_number).value))
        self.headers_by_tab_name[sheet.title] = headers
        return headers

    def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        ignored(tab_name)
        processed = {}
        for position, row_datum in enumerate(row_data):
            processed[headers[position]] = self.parse_cell_value(row_datum)
        return processed
|
609
|
-
|
610
|
-
|
611
|
-
class SchemaAutoloadMixin(AbstractTableSetManager):
    """
    Adds the ability to fetch schemas by name from a portal, optionally caching them in a table
    shared by all users of this class.
    """

    SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
    CACHE_SCHEMAS = True  # Controls whether we're doing caching at all
    AUTOLOAD_SCHEMAS_DEFAULT = True

    def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
                 portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs):
        # All of this must be set up before the super call does the rest of the initialization.
        if autoload_schemas is None:
            autoload_schemas = self.AUTOLOAD_SCHEMAS_DEFAULT
        self.autoload_schemas: bool = autoload_schemas
        # The portal_env is defaulted only when autoloading, since otherwise it would never be used.
        if self.autoload_schemas and portal_env is None and portal_vapp is None:
            portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
            PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
        self.portal_env: Optional[str] = portal_env
        self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp
        super().__init__(filename=filename, **kwargs)

    def fetch_relevant_schemas(self, schema_names: List[str]):
        """
        Returns a dictionary from each given schema name to its schema, or an empty dictionary
        if autoloading is off or there is neither a portal_env nor a portal_vapp to load from.
        """
        # The schema_names argument is not normally given, but it is there for easier testing
        def name_and_schema(schema_name):
            fetched = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
            return schema_name, fetched

        if not (self.autoload_schemas and (self.portal_env or self.portal_vapp)):
            return {}
        return dict(pmap(name_and_schema, schema_names))

    @classmethod
    def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None,
                     portal_vapp: Optional[AbstractVirtualApp] = None):
        """Returns the named schema, consulting (and filling) the shared cache when caching is enabled."""
        if not cls.CACHE_SCHEMAS:
            return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp)
        # NOTE(review): entries are keyed by schema name only, so a cached schema is reused
        # regardless of which portal_env or portal_vapp is asked for -- confirm this is intended.
        schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name)
        if schema is None:
            schema = get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp)
            cls.SCHEMA_CACHE[schema_name] = schema
        return schema

    @classmethod
    def clear_schema_cache(cls):
        """Empties the shared cache in place (the dictionary object itself is kept)."""
        cls.SCHEMA_CACHE.clear()
|
658
|
-
|
659
|
-
|
660
|
-
class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager):
|
661
|
-
"""
|
662
|
-
This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows
|
663
|
-
get handled like Items instead of just flat table rows.
|
664
|
-
"""
|
665
|
-
|
666
|
-
def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs):
|
667
|
-
super().__init__(filename=filename, **kwargs)
|
668
|
-
self.patch_prototypes_by_tab_name: Dict[str, Dict] = {}
|
669
|
-
self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {}
|
670
|
-
self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {}
|
671
|
-
self._schemas = schemas
|
672
|
-
self._instaguid_context_table: Dict[str, str] = {}
|
673
|
-
|
674
|
-
@property
|
675
|
-
def schemas(self):
|
676
|
-
schemas = self._schemas
|
677
|
-
if schemas is None:
|
678
|
-
self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names)
|
679
|
-
return schemas
|
680
|
-
|
681
|
-
def sheet_patch_prototype(self, tab_name: str) -> Dict:
|
682
|
-
return self.patch_prototypes_by_tab_name[tab_name]
|
683
|
-
|
684
|
-
def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders:
|
685
|
-
return self.parsed_headers_by_tab_name[tab_name]
|
686
|
-
|
687
|
-
def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints:
|
688
|
-
return self.type_hints_by_tab_name[tab_name]
|
689
|
-
|
690
|
-
class SheetState:
|
691
|
-
|
692
|
-
def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints):
|
693
|
-
self.parsed_headers = parsed_headers
|
694
|
-
self.type_hints = type_hints
|
695
|
-
|
696
|
-
def _compile_type_hints(self, tab_name: str):
|
697
|
-
parsed_headers = self.sheet_parsed_headers(tab_name)
|
698
|
-
schema = self.schemas.get(tab_name)
|
699
|
-
with deferred_problems() as note_problem:
|
700
|
-
for required_header in self._schema_required_headers(schema):
|
701
|
-
if required_header not in parsed_headers:
|
702
|
-
note_problem("Missing required header")
|
703
|
-
type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None
|
704
|
-
for parsed_header in parsed_headers]
|
705
|
-
self.type_hints_by_tab_name[tab_name] = type_hints
|
706
|
-
|
707
|
-
@classmethod
|
708
|
-
def _schema_required_headers(cls, schema):
|
709
|
-
ignored(schema)
|
710
|
-
return [] # TODO: Make this compute a list of required headers (in parsed header form)
|
711
|
-
|
712
|
-
def _compile_sheet_headers(self, tab_name: str):
    """
    Parse the raw headers of the given tab, then record both the parsed headers and the
    patch prototype computed from them.
    """
    parsed = ItemTools.parse_sheet_headers(self.headers_by_tab_name[tab_name])
    self.parsed_headers_by_tab_name[tab_name] = parsed
    self.patch_prototypes_by_tab_name[tab_name] = ItemTools.compute_patch_prototype(parsed)
|
718
|
-
|
719
|
-
def _create_tab_processor_state(self, tab_name: str) -> SheetState:
    """
    Prepare the given tab for row processing and return its SheetState.

    After the superclass has done its own setup, the headers and type hints are compiled.
    Compiling the headers produces a prototype that can be copied for each row and then
    filled in, so that values can be assigned to the right place efficiently.
    """
    super()._create_tab_processor_state(tab_name)
    self._compile_sheet_headers(tab_name)
    self._compile_type_hints(tab_name)
    headers = self.sheet_parsed_headers(tab_name)
    hints = self.sheet_type_hints(tab_name)
    return self.SheetState(parsed_headers=headers, type_hints=hints)
|
727
|
-
|
728
|
-
def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData:
    """
    Convert one row of cell values into an item.

    A deep copy of the tab's patch prototype is filled in cell by cell: each cell is parsed,
    passed through the column's type hint (when there is one), and stored at the path given by
    the column's parsed header.
    """
    headers = state.parsed_headers
    hints = state.type_hints
    item = copy.deepcopy(self.sheet_patch_prototype(tab_name))
    for column, cell in enumerate(row_data):
        value = self.parse_cell_value(cell)
        hint = hints[column]
        if hint:
            value = hint.apply_hint(value)
        ItemTools.set_path_value(item, headers[column], value)
    return item
|
739
|
-
|
740
|
-
def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
    """
    Parse a single cell value with ItemTools.parse_item_value, passing this manager's
    instaguid context table as the context.
    """
    context = self._instaguid_context_table
    return ItemTools.parse_item_value(value, context=context)
|
742
|
-
|
743
|
-
|
744
|
-
@ITEM_MANAGER_REGISTRY.register()
class XlsxItemManager(ItemManagerMixin, XlsxManager):
    """
    An XlsxManager with the item-style row processing of ItemManagerMixin layered on top.
    """
|
750
|
-
|
751
|
-
|
752
|
-
class SingleTableMixin(AbstractTableSetManager):
    """
    Mixin for managers that expose exactly one tab.
    """

    def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
        """
        :param filename: The name of the file being managed (also passed to the superclass).
        :param tab_name: The name for the single tab. If omitted or empty,
            ItemTools.infer_tab_name(filename) supplies one.
        """
        if not tab_name:
            tab_name = ItemTools.infer_tab_name(filename)
        self._tab_name = tab_name
        super().__init__(filename=filename, **kwargs)

    @property
    def tab_names(self) -> List[str]:
        """
        A one-element list containing this manager's tab name.
        """
        names = [self._tab_name]
        return names
|
761
|
-
|
762
|
-
|
763
|
-
class InsertsManager(BasicTableSetManager):  # ItemManagerMixin isn't really appropriate here
    """
    Base class for managers that load "inserts" data, i.e., data that after parsing and wrapping
    is a dictionary mapping tab names to lists of dictionaries (TabbedSheetData).

    Subclasses (or mixins) supply _parse_inserts_data, and may override _wrap_inserts_data.
    """

    ALLOWED_FILE_EXTENSIONS = []

    def _parse_inserts_data(self, filename: str) -> AnyJsonData:
        """
        Read the raw data from the named file. This base version always raises NotImplementedError.
        """
        # The message previously named a non-existent method ("_parse_inserts_dataa").
        raise NotImplementedError(f"._parse_inserts_data(...) is not implemented for {self.__class__.__name__}.")  # noQA

    def _load_inserts_data(self, filename: str) -> TabbedSheetData:
        """
        Parse and wrap the data in the named file, then check that the result has the shape
        Dict[str, List[dict]].

        :raises ValueError: if the wrapped data does not have that shape.
        """
        data: AnyJsonData = self._parse_inserts_data(filename)
        tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data)
        if (not isinstance(tabbed_inserts, dict)
                or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys())
                or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content)
                           for content in tabbed_inserts.values())):
            raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).")
        tabbed_inserts: TabbedSheetData  # we've just checked that
        return tabbed_inserts

    @classmethod
    def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData:
        """
        Turn parsed data into tabbed form. This default returns the data unchanged.
        """
        ignored(filename)
        return data

    @property
    def tab_names(self) -> List[str]:
        """
        The tab names of the content loaded so far.
        """
        return list(self.content_by_tab_name.keys())

    def _get_reader_agent(self) -> Any:
        """
        Return the reader agent, which for this class is the manager itself.
        """
        return self

    def load_content(self) -> Dict[str, AnyJsonData]:
        """
        Load the file's data into ``self.content_by_tab_name`` and return that dictionary.

        The headers recorded for each tab are the keys of the tab's first item,
        or an empty list if the tab has no items.
        """
        data = self._load_inserts_data(self.filename)
        for tab_name, tab_content in data.items():
            self.content_by_tab_name[tab_name] = tab_content
            if not tab_content:
                self.headers_by_tab_name[tab_name] = []
            else:
                self.headers_by_tab_name[tab_name] = list(tab_content[0].keys())
        return self.content_by_tab_name
|
802
|
-
|
803
|
-
|
804
|
-
class SimpleInsertsMixin(SingleTableMixin):
    """
    Mixin for inserts files that hold a single list of items rather than a dictionary of tabs.
    """

    def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData:
        """
        Wrap a list of dictionaries as a one-tab dictionary keyed by this manager's tab name.

        :raises ValueError: if the data is not a list of dictionaries.
        """
        is_list_of_dicts = isinstance(data, list) and all(isinstance(item, dict) for item in data)
        if not is_list_of_dicts:
            raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).")
        return {self._tab_name: data}
|
811
|
-
|
812
|
-
|
813
|
-
class JsonInsertsMixin:
    """
    Mixin that supplies _parse_inserts_data for JSON files.
    """

    @classmethod
    def _parse_inserts_data(cls, filename: str) -> AnyJsonData:
        """
        Return the JSON data parsed from the named file.
        """
        # The helper is expected to return an ordinary open text file object; using it as a
        # context manager makes sure the handle is closed once parsing finishes (or fails).
        with open_unicode_text_input_file_respecting_byte_order_mark(filename) as fp:
            return json.load(fp)
|
818
|
-
|
819
|
-
|
820
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager):
    """
    Inserts manager for a JSON file that already holds tabbed data (a dictionary of tabs).
    """

    # If you want them all in one family, use this extension
    ALLOWED_FILE_EXTENSIONS = [".tabs.json"]
|
824
|
-
|
825
|
-
|
826
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager):
    """
    Inserts manager for a JSON file that holds a single list of items, presented as one tab.
    """

    ALLOWED_FILE_EXTENSIONS = [".json"]
|
830
|
-
|
831
|
-
|
832
|
-
class YamlInsertsMixin:
    """
    Mixin that supplies _parse_inserts_data for YAML files.
    """

    def _parse_inserts_data(self, filename) -> AnyJsonData:
        """
        Return the data parsed (with yaml.safe_load) from the named file.
        """
        # The helper is expected to return an ordinary open text file object; using it as a
        # context manager makes sure the handle is closed once parsing finishes (or fails).
        with open_unicode_text_input_file_respecting_byte_order_mark(filename) as fp:
            return yaml.safe_load(fp)
|
836
|
-
|
837
|
-
|
838
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager):
    """
    Inserts manager for a YAML file that already holds tabbed data (a dictionary of tabs).
    """

    ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"]

    def _parse_inserts_data(self, filename) -> AnyJsonData:
        """
        Return the data parsed (with yaml.safe_load) from the named file.
        """
        # The helper is expected to return an ordinary open text file object; using it as a
        # context manager makes sure the handle is closed once parsing finishes (or fails).
        with open_unicode_text_input_file_respecting_byte_order_mark(filename) as fp:
            return yaml.safe_load(fp)
|
845
|
-
|
846
|
-
|
847
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager):
    """
    Inserts manager for a YAML file that holds a single list of items, presented as one tab.
    """

    ALLOWED_FILE_EXTENSIONS = [".yaml"]
|
851
|
-
|
852
|
-
|
853
|
-
class InsertsItemMixin(AbstractItemManager):  # ItemManagerMixin isn't really appropriate here
    """
    Item-manager mixin for inserts directories and other JSON-like data that will be used
    literally as Items.

    No semantic pre-processing or checking happens in these classes; the data is assumed to
    have been checked for correctness by other means.
    """

    AUTOLOAD_SCHEMAS_DEFAULT = False  # Has no effect, but someone might inspect the value.

    def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
                 portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None,
                 **kwargs):
        """
        :param filename: The name of the file to manage (passed on to the superclass).
        :param autoload_schemas: Must be None or False; anything else raises ValueError.
        :param portal_env: Accepted but ignored.
        :param portal_vapp: Accepted but ignored.
        :param schemas: Must be None or an empty dictionary; anything else raises ValueError.
        """
        # Would only be used if autoload_schemas was true, and we don't allow that.
        ignored(portal_env, portal_vapp)
        class_name = self.__class__.__name__
        if schemas not in (None, {}):
            raise ValueError(f"{class_name} does not allow schemas={schemas!r}.")
        if autoload_schemas not in (None, False):
            raise ValueError(f"{class_name} does not allow autoload_schemas={autoload_schemas!r}.")
        super().__init__(filename=filename, **kwargs)
|
871
|
-
|
872
|
-
|
873
|
-
@ITEM_MANAGER_REGISTRY.register()
class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager):
    """
    Item-manager counterpart of TabbedJsonInsertsManager (adds InsertsItemMixin, nothing else).
    """
|
876
|
-
|
877
|
-
|
878
|
-
@ITEM_MANAGER_REGISTRY.register()
class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager):
    """
    Item-manager counterpart of SimpleJsonInsertsManager (adds InsertsItemMixin, nothing else).
    """
|
881
|
-
|
882
|
-
|
883
|
-
@ITEM_MANAGER_REGISTRY.register()
class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager):
    """
    Item-manager counterpart of TabbedYamlInsertsManager (adds InsertsItemMixin, nothing else).
    """
|
886
|
-
|
887
|
-
|
888
|
-
@ITEM_MANAGER_REGISTRY.register()
class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager):
    """
    Item-manager counterpart of SimpleYamlInsertsManager (adds InsertsItemMixin, nothing else).
    """
|
891
|
-
|
892
|
-
|
893
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager):
    """
    Inserts manager for a JSON-lines file, whose entries are presented as the items of one tab.
    """

    ALLOWED_FILE_EXTENSIONS = [".jsonl"]

    def _parse_inserts_data(self, filename: str) -> AnyJsonData:
        """
        Return a list of everything JsonLinesReader yields for the named file.
        """
        # The helper is expected to return an ordinary open text file object. The reader is
        # drained into a list before the file is closed on leaving the with-block.
        with open_unicode_text_input_file_respecting_byte_order_mark(filename) as fp:
            return list(JsonLinesReader(fp))
|
900
|
-
|
901
|
-
|
902
|
-
@ITEM_MANAGER_REGISTRY.register()
class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager):
    """
    Item-manager counterpart of SimpleJsonLinesInsertsManager (adds InsertsItemMixin, nothing else).
    """
|
905
|
-
|
906
|
-
|
907
|
-
@TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
class InsertsDirectoryManager(InsertsManager):
    """
    Inserts manager for a directory of ``*.json`` files, each of which supplies one tab.
    """

    ALLOWED_FILE_EXTENSIONS = []

    def _parse_inserts_data(self, filename: str) -> AnyJsonData:
        """
        Return a dictionary mapping tab names to the parsed content of each ``*.json`` file
        directly inside the named directory.

        :param filename: The name of the inserts directory.
        :raises LoadArgumentsError: if the name does not refer to a directory.
        """
        if not os.path.isdir(filename):
            raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.")
        data = {}
        # Sorted so that the tab order (and which file wins if two files yield the same tab name)
        # does not depend on the order in which the filesystem happens to list the files.
        for tab_file in sorted(glob.glob(os.path.join(filename, "*.json"))):
            # The helper is expected to return an ordinary open text file object; the with-block
            # closes each file as soon as it has been parsed instead of leaking one handle per tab.
            with open_unicode_text_input_file_respecting_byte_order_mark(tab_file) as fp:
                tab_content = json.load(fp)
            # Here we don't use os.path.splitext because we want to split on the first dot.
            # e.g., for foo.bar.baz, return just foo
            # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes
            # for things unrelated to the tab name.
            tab_name = os.path.basename(tab_file).split('.')[0]
            data[tab_name] = tab_content
        return data
|
926
|
-
|
927
|
-
|
928
|
-
@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager):
    """
    Item-manager counterpart of InsertsDirectoryManager (adds InsertsItemMixin, nothing else).
    """
|
931
|
-
|
932
|
-
|
933
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class CsvManager(SingleTableMixin, SemanticTableSetManager):
    """
    This implements the mechanism to get a series of rows out of the sheet in a csv file,
    returning a result that still looks like there could have been multiple tabs.
    """

    ALLOWED_FILE_EXTENSIONS = ['.csv']

    def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
        """
        :param filename: The name of the file to read.
        :param escaping: Whether to expand backslash escape sequences in cell text (default False).
        """
        super().__init__(filename=filename, **kwargs)
        self.escaping: bool = escaping or False

    def _get_reader_agent(self) -> CsvReader:
        """
        Return a csv reader for this manager's file.
        """
        return self._get_reader_agent_for_filename(self.filename)

    @classmethod
    def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
        """
        Return a csv reader (default, comma-separated dialect) for the named file.
        """
        return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename))

    # When true, each row is passed through pad_to(n_headers, row, padding='') before being yielded.
    PAD_TRAILING_TABS = True

    def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
        """
        Yield the rows remaining in the reader, padded as described at PAD_TRAILING_TABS.
        """
        headers = self.tab_headers(tab_name)
        n_headers = len(headers)
        for row_data in self.reader_agent:
            if self.PAD_TRAILING_TABS:
                row_data = pad_to(n_headers, row_data, padding='')
            yield row_data

    def _create_tab_processor_state(self, tab_name: str) -> Headers:
        """
        Return the headers for the tab, reading them from the first row of the file
        if they have not been recorded yet.
        """
        headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name)
        if headers is None:
            # An empty file has no header row. Treat that as "no headers" rather than letting
            # a bare StopIteration escape from the reader.
            self.headers_by_tab_name[tab_name] = headers = next(self.reader_agent, [])
        return headers

    @classmethod
    def _escape_cell_text(cls, cell_text):
        """
        Expand string escape sequences in the cell text if it contains a backslash;
        otherwise return the text unchanged.
        """
        if '\\' in cell_text:
            return expand_string_escape_sequences(cell_text)
        else:
            return cell_text

    def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """
        Return a dictionary mapping each header to the parsed value of the corresponding cell.
        Escape processing is applied to the cell text first when ``self.escaping`` is true.
        """
        ignored(tab_name)
        # Choose the per-cell preparation once, so a single comprehension serves both modes.
        prepare = self._escape_cell_text if self.escaping else (lambda cell_text: cell_text)
        return {headers[i]: self.parse_cell_value(prepare(cell_text))
                for i, cell_text in enumerate(row_data)}
|
984
|
-
|
985
|
-
|
986
|
-
@ITEM_MANAGER_REGISTRY.register()
class CsvItemManager(ItemManagerMixin, CsvManager):
    """
    A CsvManager with the item-style row processing of ItemManagerMixin layered on top.
    """
|
992
|
-
|
993
|
-
|
994
|
-
@TABLE_SET_MANAGER_REGISTRY.register()
class TsvManager(CsvManager):
    """
    A CsvManager for tab-separated files: identical except that the reader splits on tabs
    rather than commas.
    (No escape handling specific to TSV is added here. May need to add handling for backslash escaping.)
    """

    ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt']

    @classmethod
    def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
        """
        Return a csv reader for the named file that uses a tab as the delimiter.
        """
        text_stream = open_unicode_text_input_file_respecting_byte_order_mark(filename)
        return csv.reader(text_stream, delimiter='\t')
|
1005
|
-
|
1006
|
-
|
1007
|
-
@ITEM_MANAGER_REGISTRY.register()
class TsvItemManager(ItemManagerMixin, TsvManager):
    """
    A TsvManager with the item-style row processing of ItemManagerMixin layered on top.
    """
|
1013
|
-
|
1014
|
-
|
1015
|
-
def _do_shell_command(command, cwd=None):
|
1016
|
-
# This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023
|
1017
|
-
subprocess.check_output(command, cwd=cwd)
|
1018
|
-
|
1019
|
-
|
1020
|
-
@contextlib.contextmanager
def maybe_unpack(filename):  # Maybe move to another module
    """
    Context manager that yields the name of a usable (unpacked) file or directory.

    If the filename has none of the recognized suffixes (.tar.gz, .tar, .tgz, .gz, .zip), the
    filename itself is yielded untouched. Otherwise the file is copied into a temporary directory,
    unpacked there with gunzip/unzip/tar, and the unpacked name is yielded. The temporary
    directory only exists for the duration of the with-block.

    :raises ValueError: if unpacking is needed but the file does not exist.
    """
    unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip']
    ext = next((suffix for suffix in unpackables if filename.endswith(suffix)), None)
    if ext is None:
        yield filename
        return
    if not os.path.exists(filename):
        # We don't bother to raise this error if we're not planning to do any unpacking.
        # The caller can decide if/when such errors are needed in that case.
        # But if we are going to have to move bits around, they'll need to actually be there.
        # -kmp 12-Sep-2023
        raise ValueError(f"The file {filename!r} does not exist.")
    base_name = remove_suffix(ext, os.path.basename(filename), required=True)
    if ext == '.tgz':
        # Give the temporary copy the long form of the suffix so the steps below apply uniformly.
        ext = '.tar.gz'
    with TemporaryDirectory() as temp_dir:
        unpacked = os.path.join(temp_dir, base_name) + ext
        _do_shell_command(['cp', filename, unpacked])
        if unpacked.endswith('.gz'):
            _do_shell_command(['gunzip', unpacked], cwd=temp_dir)
            unpacked = remove_suffix('.gz', unpacked)
        elif unpacked.endswith(".zip"):
            _do_shell_command(['unzip', unpacked], cwd=temp_dir)
            unpacked = remove_suffix('.zip', unpacked)
        if unpacked.endswith(".tar"):
            tar_file = unpacked
            _do_shell_command(['tar', '-xf', tar_file], cwd=temp_dir)
            unpacked = remove_suffix(".tar", tar_file, required=True)
            if not os.path.isdir(unpacked):
                raise Exception(f"{tar_file} didn't unpack to a dir: {unpacked}")
        yield unpacked
|
1060
|
-
|
1061
|
-
|
1062
|
-
class TableSetManager(AbstractTableSetManager):
    """
    Loader front end that picks a manager class from TABLE_SET_MANAGER_REGISTRY based on the
    filename and uses it to load the file's content in our standard format.
    (See more detailed description in AbstractTableManager.)
    """

    @classmethod
    def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager:
        """
        Instantiate the manager class registered for the given filename.

        :raises ValueError: if the registered class is an AbstractItemManager.
        """
        manager_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename)
        if issubclass(manager_class, AbstractItemManager):
            raise ValueError(f"TableSetManager unexpectedly found reader agent class {manager_class}.")
        return manager_class(filename=filename, **kwargs)

    @classmethod
    def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
             **kwargs) -> TabbedSheetData:
        """
        Given a filename and various options, loads the content associated with that filename.

        :param filename: The name of the file to load (unpacked first if necessary).
        :param tab_name: Passed to the manager as its tab_name option.
        :param escaping: Passed to the manager as its escaping option.
        """
        # The content is loaded inside the with-block, while any unpacked copy still exists.
        with maybe_unpack(filename) as unpacked_filename:
            options = dict(tab_name=tab_name, escaping=escaping, **kwargs)
            manager = cls.create_implementation_manager(filename=unpacked_filename, **options)
            return manager.load_content()
|
1086
|
-
|
1087
|
-
|
1088
|
-
class ItemManager(AbstractTableSetManager):
    """
    Loader front end that picks a manager class from ITEM_MANAGER_REGISTRY based on the
    filename and uses it to load the file's items in our standard format.
    (See more detailed description in AbstractTableManager.)
    """

    @classmethod
    def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager:
        """
        Instantiate the manager class registered for the given filename.

        :raises ValueError: if the registered class is not an AbstractItemManager.
        """
        manager_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename)
        if not issubclass(manager_class, AbstractItemManager):
            raise ValueError(f"ItemManager unexpectedly found reader agent class {manager_class}.")
        manager_class: Type[AbstractItemManager]
        return manager_class(filename=filename, **kwargs)

    @classmethod
    def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
             schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None,
             portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
             **kwargs) -> TabbedSheetData:
        """
        Given a filename and various options, loads the items associated with that filename.

        :param filename: The name of the file to load.
        :param tab_name: For files that lack multiple tabs (such as .csv or .tsv),
            the tab name to associate with the data.
        :param escaping: Whether to perform escape processing on backslashes.
        :param schemas: A set of schemas to use instead of trying to load them.
        :param autoload_schemas: Whether to try autoloading schemas.
        :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal).
        :param portal_vapp: A vapp to use (usually if calling from within a portal).
        """
        # The content is loaded inside the with-block, while any unpacked copy still exists.
        with maybe_unpack(filename) as unpacked_filename:
            options = dict(tab_name=tab_name, escaping=escaping,
                           schemas=schemas, autoload_schemas=autoload_schemas,
                           portal_env=portal_env, portal_vapp=portal_vapp,
                           **kwargs)
            manager = cls.create_implementation_manager(filename=unpacked_filename, **options)
            return manager.load_content()
|
1128
|
-
|
1129
|
-
|
1130
|
-
# Module-level function-style aliases for the two loader classmethods.
load_table_set = TableSetManager.load
|
1131
|
-
load_items = ItemManager.load
|