dcicutils 7.11.0__py3-none-any.whl → 7.11.0.1b9__py3-none-any.whl

dcicutils/misc_utils.py CHANGED
@@ -9,6 +9,7 @@ import hashlib
  import inspect
  import math
  import io
+ import json
  import os
  import logging
  import pytz
@@ -191,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp):
      pass


- class VirtualApp:
+ class AbstractVirtualApp:
+     pass
+
+
+ class VirtualApp(AbstractVirtualApp):
      """
      Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
      simulating a number of conditions, including permissions.
@@ -1352,6 +1357,25 @@ def capitalize1(s):
      return s[:1].upper() + s[1:]


+ """
+ Python's UUID ignores all dashes, whereas Postgres is more strict.
+ See http://www.postgresql.org/docs/9.2/static/datatype-uuid.html
+ for the exact format Postgres accepts.
+ And, anyway, this pattern is what our portals have been doing
+ for quite a while, so it's the most stable choice for us now.
+ """
+
+ uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?')
+
+
+ def is_uuid(instance):
+     """
+     Predicate returns true for any group of 32 hex characters with optional hyphens every four characters.
+     Matching is case-insensitive (note the (?i) flag above). See other notes on this design choice above.
+     """
+     return bool(uuid_re.match(instance))
+
+
  def string_list(s):
      """
      Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls.
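
A quick sketch of how the new predicate behaves, assuming this version's dcicutils.misc_utils:

    from dcicutils.misc_utils import is_uuid

    assert is_uuid("123e4567-e89b-12d3-a456-426614174000")    # canonical hyphenated form
    assert is_uuid("123e4567e89b12d3a456426614174000")        # hyphens are optional
    assert is_uuid("{123e4567-e89b-12d3-a456-426614174000}")  # curly braces are tolerated
    assert not is_uuid("not-a-uuid")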
@@ -2313,3 +2337,73 @@ def parse_in_radix(text: str, *, radix: int):
      except Exception:
          pass
      raise ValueError(f"Unable to parse: {text!r}")
+
+
+ def pad_to(target_size: int, data: list, *, padding=None):
+     """
+     This will pad a list of a potentially different actual size to a given target size, using the given padding.
+     e.g., pad_to(3, [1, 2]) will return [1, 2, None]
+     """
+     actual_size = len(data)
+     if actual_size < target_size:
+         data = data + [padding] * (target_size - actual_size)
+     return data
+
+
+ class JsonLinesReader:
+
+     def __init__(self, fp, padded=False, padding=None):
+         """
+         Given an fp (the conventional name for a "file pointer", the thing a call to io.open returns),
+         this creates an object that can be used to iterate across the lines in the JSON lines file
+         that the fp is reading from.
+
+         There are two possible file formats that this will accept.
+
+         For files that contain a series of dictionaries, such as:
+             {"something": 1, "else": "a"}
+             {"something": 2, "else": "b"}
+             ...etc
+         this will just return those dictionaries one-by-one when iterated over.
+
+         The same set of dictionaries will also be yielded by a file containing:
+             ["something", "else"]
+             [1, "a"]
+             [2, "b"]
+             ...etc
+         where the first line supplies the keys and each later line supplies the values of one dictionary.
+
+         NOTES:
+
+         * In the second case, shorter lists on subsequent lines return only partial dictionaries
+           (unless padded=True, in which case they are padded out with the given padding value).
+         * In the second case, longer lists on subsequent lines will quietly drop any extra elements.
+         """
+
+         self.fp = fp
+         self.padded: bool = padded
+         self.padding = padding
+         self.headers = None  # Might change after we see first line
+
+     def __iter__(self):
+         first_line = True
+         n_headers = 0
+         for raw_line in self.fp:
+             line = json.loads(raw_line)
+             if first_line:
+                 first_line = False
+                 if isinstance(line, list):
+                     self.headers = line
+                     n_headers = len(line)
+                     continue
+             # If the line is longer than we expect, ignore the extra. Let user put comments beyond our table.
+             # But if the line is shorter than we expect and padding is enabled, extend it with the padding.
+             if self.headers:
+                 if not isinstance(line, list):
+                     raise Exception("If the first line is a list, all lines must be.")
+                 if self.padded and len(line) < n_headers:
+                     line = pad_to(n_headers, line, padding=self.padding)
+                 yield dict(zip(self.headers, line))
+             elif isinstance(line, dict):
+                 yield line
+             else:
+                 raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
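
A minimal sketch of the two input shapes JsonLinesReader accepts, using io.StringIO to stand in for a real file:

    import io
    from dcicutils.misc_utils import JsonLinesReader, pad_to

    dicts = io.StringIO('{"something": 1, "else": "a"}\n{"something": 2, "else": "b"}\n')
    list(JsonLinesReader(dicts))
    # [{'something': 1, 'else': 'a'}, {'something': 2, 'else': 'b'}]

    table = io.StringIO('["something", "else"]\n[1, "a"]\n[2]\n')
    list(JsonLinesReader(table, padded=True))  # the short row [2] gets padded
    # [{'something': 1, 'else': 'a'}, {'something': 2, 'else': None}]

    pad_to(3, [1, 2])  # [1, 2, None]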
dcicutils/sheet_utils.py ADDED
@@ -0,0 +1,1131 @@
+ import chardet
+ import contextlib
+ import copy
+ import csv
+ import glob
+ import io
+ import json
+ import openpyxl
+ import os
+ import re
+ import subprocess
+ import uuid
+ import yaml
+
+ from openpyxl.worksheet.worksheet import Worksheet
+ from openpyxl.workbook.workbook import Workbook
+ from tempfile import TemporaryFile, TemporaryDirectory
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
+ from .common import AnyJsonData
+ from .env_utils import public_env_name, EnvUtils
+ from .ff_utils import get_schema
+ from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are
+ from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix
+ from .task_utils import pmap
+
+
+ Header = str
+ Headers = List[str]
+ ParsedHeader = List[Union[str, int]]
+ ParsedHeaders = List[ParsedHeader]
+ SheetCellValue = Union[int, float, str]
+ SheetRow = List[SheetCellValue]
+ CsvReader = type(csv.reader(TemporaryFile()))
+ SheetData = List[dict]
+ TabbedSheetData = Dict[str, SheetData]
+ Regexp = type(re.compile("sample"))
+
+
+ class LoadFailure(Exception):
+     """
+     In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail,
+     but some errors are so confusing or so problematic that we need to just fail the load right away.
+     """
+     pass
+
+
+ class LoadArgumentsError(LoadFailure):
+     """
+     Errors of this class represent situations where we can't get started because
+     there's a problem with the given arguments.
+     """
+     pass
+
+
+ class LoadTableError(LoadFailure):
+     """
+     Errors of this class represent situations where we can't get started because
+     there's a problem with some table's syntax, for example headers that don't make sense.
+     """
+     pass
+
+
+ @contextlib.contextmanager
+ def deferred_problems():
+     problems = []
+
+     def note_problems(problem):
+         problems.append(problem)
+
+     yield note_problems
+
+     if problems:
+         for problem in problems:
+             PRINT(f"Problem: {problem}")
+         raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False))
+
+
+ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False):
+     if kwargs:
+         unwanted = [f"{argname}={value!r}" if detailed else argname
+                     for argname, value in kwargs.items()
+                     if value is not None]
+         if unwanted:
+             does_not = "don't" if context_plural else "doesn't"
+             raise LoadArgumentsError(f"{context} {does_not} use"
+                                      f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")
+
+
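A minimal sketch of the deferred_problems pattern (the header strings here are hypothetical): collect any number of complaints, then report them all and fail once on exit:

    with deferred_problems() as note_problem:
        for header in ["good", "bad#1", "also#bad"]:
            if '#' in header:
                note_problem(f"Unexpected '#' in header {header!r}")
    # On exit, each collected problem is PRINTed and a single summarizing Exception is raised.
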
+ def prefer_number(value: SheetCellValue):
+     if isinstance(value, str):  # the given value might be an int or float, in which case just fall through
+         if not value:
+             return None
+         ch0 = value[0]
+         if ch0 == '+' or ch0 == '-' or ch0.isdigit():
+             try:
+                 return int(value)
+             except Exception:
+                 pass
+             try:
+                 return float(value)
+             except Exception:
+                 pass
+         # If we couldn't parse it as an int or float, fall through to returning the original value
+         pass
+     return value
+
+
+ def expand_string_escape_sequences(text: str) -> str:
+     s = io.StringIO()
+     escaping = False
+     for ch in text:
+         if escaping:
+             if ch == 'r':
+                 s.write('\r')
+             elif ch == 't':
+                 s.write('\t')
+             elif ch == 'n':
+                 s.write('\n')
+             elif ch == '\\':
+                 s.write('\\')
+             else:
+                 # Rather than err, just leave other sequences as-is.
+                 s.write(f"\\{ch}")
+             escaping = False
+         elif ch == '\\':
+             escaping = True
+         else:
+             s.write(ch)
+     return s.getvalue()
+
+
+ def open_unicode_text_input_file_respecting_byte_order_mark(filename):
+     """
+     Opens a file for text input, respecting a byte-order mark (BOM).
+     """
+     with io.open(filename, 'rb') as fp:
+         leading_bytes = fp.read(4 * 8)  # 32 bytes is plenty to detect a byte-order mark
+         bom_info = chardet.detect(leading_bytes, should_rename_legacy=True)
+         detected_encoding = bom_info and bom_info.get('encoding')  # tread lightly
+     use_encoding = 'utf-8' if detected_encoding == 'ascii' else detected_encoding
+     return io.open(filename, 'r', encoding=use_encoding)
+
+
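Spot-checks of the cell-parsing helpers above, as a sketch of their intended behavior:

    assert prefer_number("17") == 17          # integral strings become ints
    assert prefer_number("-3.5") == -3.5      # decimal strings become floats
    assert prefer_number("") is None          # empty cells become None
    assert prefer_number("hello") == "hello"  # other strings pass through unchanged
    assert prefer_number(17) == 17            # non-strings fall through untouched

    assert expand_string_escape_sequences(r"a\tb\n") == "a\tb\n"  # known escapes expand
    assert expand_string_escape_sequences(r"a\qb") == r"a\qb"     # unknown ones are left alone
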
+ class TypeHint:
+     def apply_hint(self, value):
+         return value
+
+     def __str__(self):
+         return f"<{self.__class__.__name__}>"
+
+     def __repr__(self):
+         return self.__str__()
+
+
+ class BoolHint(TypeHint):
+
+     def apply_hint(self, value):
+         if isinstance(value, str) and value:
+             if 'true'.startswith(value.lower()):
+                 return True
+             elif 'false'.startswith(value.lower()):
+                 return False
+         return super().apply_hint(value)
+
+
+ class EnumHint(TypeHint):
+
+     def __init__(self, value_map):
+         self.value_map = value_map
+
+     def __str__(self):
+         return f"<EnumHint {','.join(f'{key}={val}' for key, val in self.value_map.items())}>"
+
+     def apply_hint(self, value):
+         if isinstance(value, str):
+             if value in self.value_map:
+                 result = self.value_map[value]
+                 return result
+             else:
+                 lvalue = value.lower()
+                 found = []
+                 for lkey, key in self.value_map.items():
+                     if lkey.startswith(lvalue):
+                         found.append(lkey)
+                 if len(found) == 1:
+                     [only_found] = found
+                     result = self.value_map[only_found]
+                     return result
+         return super().apply_hint(value)
+
+
+ OptionalTypeHints = List[Optional[TypeHint]]
+
+
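A sketch of how the hint classes resolve values (value_map keys are lowercase, matching how find_type_hint builds them below):

    bool_hint = BoolHint()
    assert bool_hint.apply_hint("TR") is True              # unambiguous prefix of 'true'
    assert bool_hint.apply_hint("no-match") == "no-match"  # unrecognized values pass through

    enum_hint = EnumHint({"red": "Red", "green": "Green"})
    assert enum_hint.apply_hint("g") == "Green"            # unique prefix match
    assert enum_hint.apply_hint("red") == "Red"            # exact match
    assert enum_hint.apply_hint("blue") == "blue"          # no match, passes through
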
+ class ItemTools:
+     """
+     Implements operations on table-related data without pre-supposing the specific representation of the table.
+     It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
+     it does not presuppose the source of the data nor where it will be written to.
+
+     For the purpose of this class:
+
+     * a 'header' is a string representing the top of a column.
+
+     * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
+       "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
+       each numeric token as an int instead of a string.
+
+     * a 'headers' object is just a list of strings, each of which is a 'header'.
+
+     * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
+       e.g., the headers ["a.b.c", "x.y#0"] are represented as the parsed headers [["a", "b", "c"], ["x", "y", 0]].
+     """
+
+     @classmethod
+     def parse_sheet_header(cls, header: Header) -> ParsedHeader:
+         result = []
+         token = ""
+         for i in range(len(header)):
+             ch = header[i]
+             if ch == '.' or ch == '#':
+                 if token:
+                     result.append(int(token) if token.isdigit() else token)
+                     token = ""
+             else:
+                 token += ch
+         if token:
+             result.append(int(token) if token.isdigit() else token)
+         return result
+
+     @classmethod
+     def parse_sheet_headers(cls, headers: Headers):
+         return [cls.parse_sheet_header(header)
+                 for header in headers]
+
+     @classmethod
+     def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
+         prototype = {}
+         for parsed_header in parsed_headers:
+             parsed_header0 = parsed_header[0]
+             if isinstance(parsed_header0, int):
+                 raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}")
+             cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
+         return prototype
+
+     @classmethod
+     def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
+         [key0, *more_keys] = keys
+         key1 = more_keys[0] if more_keys else None
+         if isinstance(key1, int):
+             placeholder = []
+         elif isinstance(key1, str):
+             placeholder = {}
+         else:
+             placeholder = None
+         if isinstance(key0, int):
+             n = len(parent)
+             if key0 == n:
+                 parent.append(placeholder)
+             elif key0 > n:
+                 raise LoadTableError("Numeric items must occur sequentially.")
+         elif isinstance(key0, str):
+             if key0 not in parent:
+                 parent[key0] = placeholder
+         if key1 is not None:
+             cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
+         return parent
+
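A sketch of the header-parsing pipeline using hypothetical headers: dotted/hashed headers become paths, and the paths determine the shape of the patch prototype:

    parsed = ItemTools.parse_sheet_headers(["accession", "lab.name", "alias#0", "alias#1"])
    assert parsed == [["accession"], ["lab", "name"], ["alias", 0], ["alias", 1]]
    assert ItemTools.compute_patch_prototype(parsed) == {
        "accession": None,
        "lab": {"name": None},
        "alias": [None, None],
    }
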
+     INSTAGUIDS_ENABLED = False  # Experimental feature not enabled by default
+
+     @classmethod
+     def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData:
+         # TODO: Remodularize this for easier testing and more Schema-driven effect
+         # Doug asks that this be broken up into different mechanisms, more modular and separately testable.
+         # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired.
+         if isinstance(value, str):
+             lvalue = value.lower()
+             # TODO: We could consult a schema to make this less heuristic, but this may do for now
+             if lvalue == 'true':
+                 return True
+             elif lvalue == 'false':
+                 return False
+             elif lvalue == 'null' or lvalue == '':
+                 return None
+             elif '|' in value:
+                 if value == '|':  # Use '|' for []
+                     return []
+                 else:
+                     if value.endswith("|"):  # Use 'foo|' for ['foo']
+                         value = value[:-1]
+                     return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')]
+             elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'):
+                 # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid
+                 return cls.get_instaguid(value, context=context)
+             else:
+                 # Doug points out that the schema might not agree, might want a string representation of a number.
+                 # At this semantic layer, this might be a bad choice.
+                 return prefer_number(value)
+         else:  # presumably a number (int or float)
+             return value
+
+     @classmethod
+     def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None):
+         if context is None:
+             return guid_placeholder
+         else:
+             referent = context.get(guid_placeholder)
+             if not referent:
+                 context[guid_placeholder] = referent = str(uuid.uuid4())
+             return referent
+
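A sketch of the heuristic cell parsing that parse_item_value performs:

    assert ItemTools.parse_item_value("True") is True              # booleans, case-insensitively
    assert ItemTools.parse_item_value("null") is None              # 'null' and '' become None
    assert ItemTools.parse_item_value("17") == 17                  # numbers, via prefer_number
    assert ItemTools.parse_item_value("a|b|17") == ["a", "b", 17]  # '|' makes lists
    assert ItemTools.parse_item_value("foo|") == ["foo"]           # trailing '|' for a singleton list
    assert ItemTools.parse_item_value("|") == []                   # bare '|' for the empty list
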
+     @classmethod
+     def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
+         if (value is None or value == '') and not force:
+             return
+         [key, *more_path] = path
+         if not more_path:
+             datum[key] = value
+         else:
+             cls.set_path_value(datum[key], more_path, value, force=force)  # propagate force down the path
+
+     @classmethod
+     def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any):
+
+         def finder(subheader, subschema):
+             if not parsed_header:
+                 return None
+             else:
+                 [key1, *other_headers] = subheader
+                 if isinstance(key1, str) and isinstance(subschema, dict):
+                     if subschema.get('type') == 'object':
+                         def1 = subschema.get('properties', {}).get(key1)
+                         if not other_headers:
+                             if def1 is not None:
+                                 t = def1.get('type')
+                                 if t == 'string':
+                                     enum = def1.get('enum')
+                                     if enum:
+                                         mapping = {e.lower(): e for e in enum}
+                                         return EnumHint(mapping)
+                                 elif t == 'boolean':
+                                     return BoolHint()
+                                 else:
+                                     pass  # fall through, returning None
+                             else:
+                                 pass  # fall through, returning None
+                         else:
+                             return finder(subheader=other_headers, subschema=def1)
+
+         return finder(subheader=parsed_header, subschema=schema)
+
+     @classmethod
+     def infer_tab_name(cls, filename):
+         return os.path.basename(filename).split('.')[0]
+
+
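A sketch of find_type_hint against a hypothetical schema fragment:

    schema = {
        "type": "object",
        "properties": {
            "status": {"type": "string", "enum": ["current", "deleted"]},
            "is_public": {"type": "boolean"},
        },
    }
    assert isinstance(ItemTools.find_type_hint(["status"], schema), EnumHint)
    assert isinstance(ItemTools.find_type_hint(["is_public"], schema), BoolHint)
    assert ItemTools.find_type_hint(["undeclared"], schema) is None
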
+ # TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
+ #
+ # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
+ # I am less certain but open to discussion. Among other things, as implemented now,
+ # the __init__ method here needs to run and the documentation says that ABC's won't appear
+ # in the method resolution order. -kmp 17-Aug-2023
+ # See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
+ class AbstractTableSetManager:
+     """
+     The TableSetManager is the spanning class of anything that wants to be able to load a table set,
+     regardless of what it wants to load it from. To do this, it must support a load method
+     that takes a filename and returns the file content in the form:
+         {
+             "Sheet1": [
+                 {...representation of row1 as some kind of dict...},
+                 {...representation of row2 as some kind of dict...}
+             ],
+             "Sheet2": [...],
+             ...,
+         }
+     It also needs some implementation of the .tab_names property.
+     Note that at this level of abstraction, we take no position on what form of representation is used
+     for the rows, as long as it is JSON data of some kind. It might be
+         {"col1": "val1", "col2": "val2", ...}
+     or it might be something more structured like
+         {"something": "val1", "something_else": ["val2"]}
+     Additionally, the values stored might be altered as well. In particular, the most likely alteration
+     is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations
+     happen is not constrained by this class.
+     """
+
+     ALLOWED_FILE_EXTENSIONS: List[str] = []
+
+     def __init__(self, filename: str, **kwargs):
+         self.filename: str = filename
+         unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs)
+
+     # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
+     @classmethod
+     def load(cls, filename: str, **kwargs) -> TabbedSheetData:
+         """
+         Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
+         For more information, see documentation of AbstractTableSetManager.
+         """
+         raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.")  # noQA
+
+     @property
+     def tab_names(self) -> List[str]:
+         raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}.")  # noQA
+
+     def load_content(self) -> Any:
+         raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.")  # noQA
+
+
+ class BasicTableSetManager(AbstractTableSetManager):
+     """
+     A BasicTableSetManager provides some structure that most kinds of parsers will need.
+     In particular, everything will likely need some way of storing headers and some way of storing content
+     of each sheet. Even a csv file, which doesn't have multiple tabs, can be seen as the degenerate case
+     of this where there's only one set of headers and only one block of content.
+     """
+
+     def __init__(self, filename: str, **kwargs):
+         super().__init__(filename=filename, **kwargs)
+         self.headers_by_tab_name: Dict[str, Headers] = {}
+         self.content_by_tab_name: Dict[str, SheetData] = {}
+         self.reader_agent: Any = self._get_reader_agent()
+
+     def tab_headers(self, tab_name: str) -> Headers:
+         return self.headers_by_tab_name[tab_name]
+
+     def tab_content(self, tab_name: str) -> List[AnyJsonData]:
+         return self.content_by_tab_name[tab_name]
+
+     @classmethod
+     def _create_tab_processor_state(cls, tab_name: str) -> Any:
+         """
+         This method provides for the possibility that some parsers will want auxiliary state,
+         (such as parsed headers or a line count or a table of temporary names for objects to cross-link
+         or some other such feature) that it carries with it as it moves from line to line parsing things.
+         Subclasses might therefore want to make this do something more interesting.
+         """
+         ignored(tab_name)  # subclasses might need this, but we don't
+         return None
+
+     def _get_reader_agent(self) -> Any:
+         """This function is responsible for opening the workbook and returning a workbook object."""
+         raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.")  # noQA
+
+
+ class SemanticTableSetManager(BasicTableSetManager):
+     """
+     This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing.
+     Those may be:
+     * Excel workbook readers (.xlsx)
+     * Comma-separated file readers (.csv)
+     * Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright
+       refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
+     There are two levels to each of these: a class that is not semantically interpreted,
+     and a class that is semantically interpreted as an "item".
+
+     This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing
+     were already done (in part so that they can be used to test the results of other formats):
+     * Json files
+     * Yaml files
+     * Inserts directories
+     * JsonLines files
+     """
+
+     @classmethod
+     def load(cls, filename: str, **kwargs) -> AnyJsonData:
+         if cls.ALLOWED_FILE_EXTENSIONS:
+             if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
+                 raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
+                                          f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")
+
+         table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs)
+         return table_set_manager.load_content()
+
+     def __init__(self, filename: str, **kwargs):
+         super().__init__(filename=filename, **kwargs)
+
+     def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
+         """
+         Given a tab_name and a state (returned by _create_tab_processor_state), return a generator
+         for a set of row values.
+         """
+         raise NotImplementedError(f"._raw_row_generator_for_tab_name(...) is not implemented"
+                                   f" for {self.__class__.__name__}.")  # noQA
+
+     def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
+         """
+         This needs to take a state and whatever represents a row and
+         must return a list of objects representing column values.
+         What constitutes a processed row is up to the class, but other than that the result must be
+         a JSON dictionary.
+         """
+         raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")  # noQA
+
+     def load_content(self) -> AnyJsonData:
+         for tab_name in self.tab_names:
+             sheet_content = []
+             state = self._create_tab_processor_state(tab_name)
+             for row_data in self._raw_row_generator_for_tab_name(tab_name):
+                 processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data)
+                 sheet_content.append(processed_row_data)
+             self.content_by_tab_name[tab_name] = sheet_content
+         return self.content_by_tab_name
+
+     @classmethod
+     def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
+         return prefer_number(value)
+
+
+ class AbstractItemManager(AbstractTableSetManager):
+
+     pass
+
+
+ class TableSetManagerRegistry:
+
+     def __init__(self):
+         self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {}
+         self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = []
+
+     def register(self, regexp: Optional[str] = None):
+         def _wrapped_register(class_to_register: Type[AbstractTableSetManager]):
+             if regexp:
+                 self.regexp_mappings.append((re.compile(regexp), class_to_register))
+             for ext in class_to_register.ALLOWED_FILE_EXTENSIONS:
+                 existing = self.manager_table.get(ext)
+                 if existing:
+                     raise Exception(f"Tried to map {class_to_register} to extension {ext},"
+                                     f" but {existing} already claimed that.")
+                 self.manager_table[ext] = class_to_register
+             return class_to_register
+         return _wrapped_register
+
+     register1 = register
+
+     def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]:
+         base: str = os.path.basename(filename)
+         suffix_parts = base.split('.')[1:]
+         if suffix_parts:
+             for i in range(0, len(suffix_parts)):
+                 suffix = f".{'.'.join(suffix_parts[i:])}"
+                 found: Optional[Type[AbstractTableSetManager]] = self.manager_table.get(suffix)
+                 if found:
+                     return found
+         else:
+             special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename)
+             if special_case:
+                 return special_case
+         raise LoadArgumentsError(f"Unknown file type: {filename}")
+
+     def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]:
+         for pattern, manager_class in self.regexp_mappings:
+             if pattern.match(filename):
+                 return manager_class
+         return None
+
+
+ TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry()
+ ITEM_MANAGER_REGISTRY = TableSetManagerRegistry()
+
+
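A sketch of how suffix resolution works once the managers below have registered themselves: the longest multi-dot suffix wins, and filenames with no suffix fall back to the regexp mappings:

    assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("f.tabs.json") is TabbedJsonInsertsManager
    assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("f.json") is SimpleJsonInsertsManager
    assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("my-inserts/") is InsertsDirectoryManager
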
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class XlsxManager(SemanticTableSetManager):
+     """
+     This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
+     """
+
+     ALLOWED_FILE_EXTENSIONS = ['.xlsx']
+
+     @classmethod
+     def _all_rows(cls, sheet: Worksheet):
+         row_max = sheet.max_row
+         for row in range(2, row_max + 1):  # row 1 is the header row
+             yield row
+
+     @classmethod
+     def _all_cols(cls, sheet: Worksheet):
+         col_max = sheet.max_column
+         for col in range(1, col_max + 1):
+             yield col
+
+     @property
+     def tab_names(self) -> List[str]:
+         return self.reader_agent.sheetnames
+
+     def _get_reader_agent(self) -> Workbook:
+         return openpyxl.load_workbook(self.filename)
+
+     def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
+         sheet = self.reader_agent[tab_name]
+         return (self._get_raw_row_content_tuple(sheet, row)
+                 for row in self._all_rows(sheet))
+
+     def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
+         return [sheet.cell(row=row, column=col).value
+                 for col in self._all_cols(sheet)]
+
+     def _create_tab_processor_state(self, tab_name: str) -> Headers:
+         sheet = self.reader_agent[tab_name]
+         headers: Headers = [str(sheet.cell(row=1, column=col).value)
+                             for col in self._all_cols(sheet)]
+         self.headers_by_tab_name[sheet.title] = headers
+         return headers
+
+     def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
+         ignored(tab_name)
+         return {headers[i]: self.parse_cell_value(row_datum)
+                 for i, row_datum in enumerate(row_data)}
+
+
+ class SchemaAutoloadMixin(AbstractTableSetManager):
+
+     SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
+     CACHE_SCHEMAS = True  # Controls whether we're doing caching at all
+     AUTOLOAD_SCHEMAS_DEFAULT = True
+
+     def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
+                  portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs):
+         # This setup must be in place before the class initialization is done (via the super call).
+         self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas
+         if self.autoload_schemas:  # If autoload_schemas is False, we don't care about doing this defaulting.
+             if portal_env is None and portal_vapp is None:
+                 portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
+                 PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
+         self.portal_env: Optional[str] = portal_env
+         self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp
+         super().__init__(filename=filename, **kwargs)
+
+     def fetch_relevant_schemas(self, schema_names: List[str]):
+         # The schema_names argument is not normally given, but it is there for easier testing
+         def fetch_schema(schema_name):
+             schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
+             return schema_name, schema
+         if self.autoload_schemas and (self.portal_env or self.portal_vapp):
+             autoloaded = {tab_name: schema
+                           for tab_name, schema in pmap(fetch_schema, schema_names)}
+             return autoloaded
+         else:
+             return {}
+
+     @classmethod
+     def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None,
+                      portal_vapp: Optional[AbstractVirtualApp] = None):
+         def just_fetch_it():
+             return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp)
+         if cls.CACHE_SCHEMAS:
+             schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name)
+             if schema is None:
+                 cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it()
+             return schema
+         else:
+             return just_fetch_it()
+
+     @classmethod
+     def clear_schema_cache(cls):
+         for key in list(cls.SCHEMA_CACHE.keys()):  # important to get the list of keys as a separate object first
+             cls.SCHEMA_CACHE.pop(key, None)
+
+
+ class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager):
+     """
+     This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows
+     get handled like Items instead of just flat table rows.
+     """
+
+     def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs):
+         super().__init__(filename=filename, **kwargs)
+         self.patch_prototypes_by_tab_name: Dict[str, Dict] = {}
+         self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {}
+         self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {}
+         self._schemas = schemas
+         self._instaguid_context_table: Dict[str, str] = {}
+
+     @property
+     def schemas(self):
+         schemas = self._schemas
+         if schemas is None:
+             self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names)
+         return schemas
+
+     def sheet_patch_prototype(self, tab_name: str) -> Dict:
+         return self.patch_prototypes_by_tab_name[tab_name]
+
+     def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders:
+         return self.parsed_headers_by_tab_name[tab_name]
+
+     def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints:
+         return self.type_hints_by_tab_name[tab_name]
+
+     class SheetState:
+
+         def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints):
+             self.parsed_headers = parsed_headers
+             self.type_hints = type_hints
+
+     def _compile_type_hints(self, tab_name: str):
+         parsed_headers = self.sheet_parsed_headers(tab_name)
+         schema = self.schemas.get(tab_name)
+         with deferred_problems() as note_problem:
+             for required_header in self._schema_required_headers(schema):
+                 if required_header not in parsed_headers:
+                     note_problem(f"Missing required header: {required_header}")
+         type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None
+                       for parsed_header in parsed_headers]
+         self.type_hints_by_tab_name[tab_name] = type_hints
+
+     @classmethod
+     def _schema_required_headers(cls, schema):
+         ignored(schema)
+         return []  # TODO: Make this compute a list of required headers (in parsed header form)
+
+     def _compile_sheet_headers(self, tab_name: str):
+         headers = self.headers_by_tab_name[tab_name]
+         parsed_headers = ItemTools.parse_sheet_headers(headers)
+         self.parsed_headers_by_tab_name[tab_name] = parsed_headers
+         prototype = ItemTools.compute_patch_prototype(parsed_headers)
+         self.patch_prototypes_by_tab_name[tab_name] = prototype
+
+     def _create_tab_processor_state(self, tab_name: str) -> SheetState:
+         super()._create_tab_processor_state(tab_name)
+         # This will create state that allows us to efficiently assign values in the right place on each row
+         # by setting up a prototype we can copy and then drop values into.
+         self._compile_sheet_headers(tab_name)
+         self._compile_type_hints(tab_name)
+         return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name),
+                                type_hints=self.sheet_type_hints(tab_name))
+
+     def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData:
+         parsed_headers = state.parsed_headers
+         type_hints = state.type_hints
+         patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name))
+         for i, value in enumerate(row_data):
+             parsed_value = self.parse_cell_value(value)
+             type_hint = type_hints[i]
+             if type_hint:
+                 parsed_value = type_hint.apply_hint(parsed_value)
+             ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value)
+         return patch_item
+
+     def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
+         return ItemTools.parse_item_value(value, context=self._instaguid_context_table)
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class XlsxItemManager(ItemManagerMixin, XlsxManager):
+     """
+     This layers item-style row processing functionality on an XLSX file.
+     """
+     pass
+
+
+ class SingleTableMixin(AbstractTableSetManager):
+
+     def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
+         self._tab_name = tab_name or ItemTools.infer_tab_name(filename)
+         super().__init__(filename=filename, **kwargs)
+
+     @property
+     def tab_names(self) -> List[str]:
+         return [self._tab_name]
+
+
+ class InsertsManager(BasicTableSetManager):  # ItemManagerMixin isn't really appropriate here
+
+     ALLOWED_FILE_EXTENSIONS = []
+
+     def _parse_inserts_data(self, filename: str) -> AnyJsonData:
+         raise NotImplementedError(f"._parse_inserts_data(...) is not implemented for {self.__class__.__name__}.")  # noQA
+
+     def _load_inserts_data(self, filename: str) -> TabbedSheetData:
+         data: AnyJsonData = self._parse_inserts_data(filename)
+         tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data)
+         if (not isinstance(tabbed_inserts, dict)
+                 or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys())
+                 or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content)
+                            for content in tabbed_inserts.values())):
+             raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).")
+         tabbed_inserts: TabbedSheetData  # we've just checked that
+         return tabbed_inserts
+
+     @classmethod
+     def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData:
+         ignored(filename)
+         return data
+
+     @property
+     def tab_names(self) -> List[str]:
+         return list(self.content_by_tab_name.keys())
+
+     def _get_reader_agent(self) -> Any:
+         return self
+
+     def load_content(self) -> Dict[str, AnyJsonData]:
+         data = self._load_inserts_data(self.filename)
+         for tab_name, tab_content in data.items():
+             self.content_by_tab_name[tab_name] = tab_content
+             if not tab_content:
+                 self.headers_by_tab_name[tab_name] = []
+             else:
+                 self.headers_by_tab_name[tab_name] = list(tab_content[0].keys())
+         return self.content_by_tab_name
+
+
+ class SimpleInsertsMixin(SingleTableMixin):
+
+     def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData:
+         if (not isinstance(data, list)
+                 or not all(isinstance(item, dict) for item in data)):
+             raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).")
+         return {self._tab_name: data}
+
+
+ class JsonInsertsMixin:
+
+     @classmethod
+     def _parse_inserts_data(cls, filename: str) -> AnyJsonData:
+         return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = [".tabs.json"]  # If you want them all in one family, use this extension
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = [".json"]
+
+
+ class YamlInsertsMixin:
+
+     def _parse_inserts_data(self, filename) -> AnyJsonData:
+         return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"]
+
+     def _parse_inserts_data(self, filename) -> AnyJsonData:
+         return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = [".yaml"]
+
+
+ class InsertsItemMixin(AbstractItemManager):  # ItemManagerMixin isn't really appropriate here
+     """
+     This class is used for inserts directories and other JSON-like data that will be literally used as an Item
+     without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness
+     but instead assumed to have been checked by other means.
+     """
+
+     AUTOLOAD_SCHEMAS_DEFAULT = False  # Has no effect, but someone might inspect the value.
+
+     def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
+                  portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None,
+                  **kwargs):
+         ignored(portal_env, portal_vapp)  # Would only be used if autoload_schemas was true, and we don't allow that.
+         if schemas not in [None, {}]:
+             raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.")
+         if autoload_schemas not in [None, False]:
+             raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.")
+         super().__init__(filename=filename, **kwargs)
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager):
+     pass
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager):
+     pass
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager):
+     pass
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager):
+     pass
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = [".jsonl"]
+
+     def _parse_inserts_data(self, filename: str) -> AnyJsonData:
+         return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))]
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager):
+     pass
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
+ class InsertsDirectoryManager(InsertsManager):
+
+     ALLOWED_FILE_EXTENSIONS = []
+
+     def _parse_inserts_data(self, filename: str) -> AnyJsonData:
+         if not os.path.isdir(filename):
+             raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.")
+         tab_files = glob.glob(os.path.join(filename, "*.json"))
+         data = {}
+         for tab_file in tab_files:
+             tab_content = json.load(open_unicode_text_input_file_respecting_byte_order_mark(tab_file))
+             # Here we don't use os.path.splitext because we want to split on the first dot.
+             # e.g., for foo.bar.baz, return just foo
+             # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes
+             # for things unrelated to the tab name.
+             tab_name = os.path.basename(tab_file).split('.')[0]
+             data[tab_name] = tab_content
+         return data
+
+
+ @ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
+ class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager):
+     pass
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class CsvManager(SingleTableMixin, SemanticTableSetManager):
+     """
+     This implements the mechanism to get a series of rows out of the sheet in a csv file,
+     returning a result that still looks like there could have been multiple tabs.
+     """
+
+     ALLOWED_FILE_EXTENSIONS = ['.csv']
+
+     def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
+         super().__init__(filename=filename, **kwargs)
+         self.escaping: bool = escaping or False
+
+     def _get_reader_agent(self) -> CsvReader:
+         return self._get_reader_agent_for_filename(self.filename)
+
+     @classmethod
+     def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
+         return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename))
+
+     PAD_TRAILING_TABS = True
+
+     def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
+         headers = self.tab_headers(tab_name)
+         n_headers = len(headers)
+         for row_data in self.reader_agent:
+             if self.PAD_TRAILING_TABS:
+                 row_data = pad_to(n_headers, row_data, padding='')
+             yield row_data
+
+     def _create_tab_processor_state(self, tab_name: str) -> Headers:
+         headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name)
+         if headers is None:
+             self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__()
+         return headers
+
+     @classmethod
+     def _escape_cell_text(cls, cell_text):
+         if '\\' in cell_text:
+             return expand_string_escape_sequences(cell_text)
+         else:
+             return cell_text
+
+     def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
+         ignored(tab_name)
+         if self.escaping:
+             return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text))
+                     for i, cell_text in enumerate(row_data)}
+         else:
+             return {headers[i]: self.parse_cell_value(cell_text)
+                     for i, cell_text in enumerate(row_data)}
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class CsvItemManager(ItemManagerMixin, CsvManager):
+     """
+     This layers item-style row processing functionality on a CSV file.
+     """
+     pass
+
+
+ @TABLE_SET_MANAGER_REGISTRY.register()
+ class TsvManager(CsvManager):
+     """
+     TSV files are just CSV files with tabs instead of commas as separators.
+     (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.)
+     """
+     ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt']
+
+     @classmethod
+     def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
+         return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')
+
+
+ @ITEM_MANAGER_REGISTRY.register()
+ class TsvItemManager(ItemManagerMixin, TsvManager):
+     """
+     This layers item-style row processing functionality on a TSV file.
+     """
+     pass
+
+
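A sketch of the escaping option: with escaping=True, backslash sequences in cells are expanded before cell values are parsed:

    assert CsvManager._escape_cell_text(r"a\tb") == "a\tb"   # expanded
    assert CsvManager._escape_cell_text("plain") == "plain"  # untouched (no backslash)
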
+ def _do_shell_command(command, cwd=None):
+     # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023
+     subprocess.check_output(command, cwd=cwd)
+
+
+ @contextlib.contextmanager
+ def maybe_unpack(filename):  # Maybe move to another module
+     """
+     If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not).
+     """
+     unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip']
+     ext = None
+     for unpackable in unpackables:
+         if filename.endswith(unpackable):
+             ext = unpackable
+             break
+     if not ext:
+         yield filename
+         return
+     if not os.path.exists(filename):
+         # We don't bother to raise this error if we're not planning to do any unpacking.
+         # The caller can decide if/when such errors are needed in that case.
+         # But if we are going to have to move bits around, they'll need to actually be there.
+         # -kmp 12-Sep-2023
+         raise ValueError(f"The file {filename!r} does not exist.")
+     target_base_part = remove_suffix(ext, os.path.basename(filename), required=True)
+     target_ext = '.tar.gz' if ext == '.tgz' else ext
+     with TemporaryDirectory() as temp_dir:
+         temp_base = os.path.join(temp_dir, target_base_part)
+         temp_filename = temp_base + target_ext
+         _do_shell_command(['cp', filename, temp_filename])
+         if temp_filename.endswith('.gz'):
+             _do_shell_command(['gunzip', temp_filename], cwd=temp_dir)
+             temp_filename = remove_suffix('.gz', temp_filename)
+         elif temp_filename.endswith(".zip"):
+             _do_shell_command(['unzip', temp_filename], cwd=temp_dir)
+             temp_filename = remove_suffix('.zip', temp_filename)
+         if temp_filename.endswith(".tar"):
+             _do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir)
+             tar_file = temp_filename
+             temp_filename = remove_suffix(".tar", temp_filename, required=True)
+             if not os.path.isdir(temp_filename):
+                 raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}")
+             # print(f"Unpacked {filename} to {temp_filename}")
+         yield temp_filename
+
+
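A sketch of maybe_unpack (the filenames here are hypothetical): packed files are copied and unpacked into a temporary directory; anything else is yielded unchanged:

    with maybe_unpack("metadata.csv") as name:
        assert name == "metadata.csv"  # nothing to unpack

    with maybe_unpack("bundle.tar.gz") as name:
        print(name)  # e.g. /tmp/.../bundle, the directory the tarball unpacked to
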
+ class TableSetManager(AbstractTableSetManager):
+     """
+     This class will open a .xlsx or .csv file and load its content in our standard format.
+     (See more detailed description in AbstractTableSetManager.)
+     """
+
+     @classmethod
+     def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager:
+         reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename)
+         if issubclass(reader_agent_class, AbstractItemManager):
+             raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.")
+         reader_agent = reader_agent_class(filename=filename, **kwargs)
+         return reader_agent
+
+     @classmethod
+     def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+              **kwargs) -> TabbedSheetData:
+         """
+         Given a filename and various options, loads the content of the table set associated with that filename.
+         """
+         with maybe_unpack(filename) as filename:
+             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
+                                                         **kwargs)
+             return manager.load_content()
+
+
+ class ItemManager(AbstractTableSetManager):
+     """
+     This class will open a .xlsx or .csv file and load its content in our standard format.
+     (See more detailed description in AbstractTableSetManager.)
+     """
+
+     @classmethod
+     def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager:
+         reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename)
+         if not issubclass(reader_agent_class, AbstractItemManager):
+             raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.")
+         reader_agent_class: Type[AbstractItemManager]
+         reader_agent = reader_agent_class(filename=filename, **kwargs)
+         return reader_agent
+
+     @classmethod
+     def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+              schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None,
+              portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
+              **kwargs) -> TabbedSheetData:
+         """
+         Given a filename and various options, loads the items associated with that filename.
+
+         :param filename: The name of the file to load.
+         :param tab_name: For files that lack multiple tabs (such as .csv or .tsv),
+             the tab name to associate with the data.
+         :param escaping: Whether to perform escape processing on backslashes.
+         :param schemas: A set of schemas to use instead of trying to load them.
+         :param autoload_schemas: Whether to try autoloading schemas.
+         :param portal_env: A portal to consult to find schemas (usually if calling from outside a portal).
+         :param portal_vapp: A vapp to use (usually if calling from within a portal).
+         """
+
+         with maybe_unpack(filename) as filename:
+             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
+                                                         schemas=schemas, autoload_schemas=autoload_schemas,
+                                                         portal_env=portal_env, portal_vapp=portal_vapp,
+                                                         **kwargs)
+             return manager.load_content()
+
+
+ load_table_set = TableSetManager.load
+ load_items = ItemManager.load
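
Putting it together (hypothetical filenames and output): load_table_set returns raw tab data, while load_items layers item-style parsing and, optionally, schema-driven type hints on top:

    tabs = load_table_set("workbook.xlsx")
    # {'Sheet1': [{'name': 'alpha', 'count': 17}, ...], ...}

    items = load_items("experiments.csv", tab_name="Experiment",
                       schemas={"Experiment": {}}, autoload_schemas=False)
    # {'Experiment': [{'accession': 'EXP001', 'lab': {'name': 'some-lab'}, ...}, ...]}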
{dcicutils-7.11.0.dist-info → dcicutils-7.11.0.1b9.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dcicutils
- Version: 7.11.0
+ Version: 7.11.0.1b9
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
  Home-page: https://github.com/4dn-dcic/utils
  License: MIT
@@ -25,9 +25,11 @@ Requires-Dist: PyYAML (>=5.1,<5.5)
  Requires-Dist: aws-requests-auth (>=0.4.2,<1)
  Requires-Dist: boto3 (>=1.17.39,<2.0.0)
  Requires-Dist: botocore (>=1.20.39,<2.0.0)
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
  Requires-Dist: docker (>=4.4.4,<5.0.0)
  Requires-Dist: elasticsearch (==7.13.4)
  Requires-Dist: gitpython (>=3.1.2,<4.0.0)
+ Requires-Dist: openpyxl (>=3.1.2,<4.0.0)
  Requires-Dist: opensearch-py (>=2.0.1,<3.0.0)
  Requires-Dist: pyOpenSSL (>=23.1.1,<24.0.0)
  Requires-Dist: pytz (>=2020.4)
{dcicutils-7.11.0.dist-info → dcicutils-7.11.0.1b9.dist-info}/RECORD RENAMED
@@ -32,7 +32,7 @@ dcicutils/kibana/readme.md,sha256=3KmHF9FH6A6xwYsNxRFLw27q0XzHYnjZOlYUnn3VkQQ,21
  dcicutils/lang_utils.py,sha256=cVLRUGyYeSPJAq3z_RJjA6miajHrXoi6baxF8HzHmLc,27797
  dcicutils/license_utils.py,sha256=OhOfTXFivvb6Y3tiJAb1b9Is-OTpBfZjC18M-RvqBqk,40456
  dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
- dcicutils/misc_utils.py,sha256=sXJ7ChrMyXZooaCnUtLxWHOmFIqxrxJKGJ6Ayd5i2Gk,91032
+ dcicutils/misc_utils.py,sha256=XisEQGMkHI7k5RiK-k4yeG8Zw00H8b-v9o2Y7mZyKb8,94548
  dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
  dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
  dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
@@ -43,13 +43,14 @@ dcicutils/redis_utils.py,sha256=VJ-7g8pOZqR1ZCtdcjKz3-6as2DMUcs1b1zG6wSprH4,6462
  dcicutils/s3_utils.py,sha256=a9eU3Flh8Asc8xPWLGP16A6UQ_FVwhoFQNqm4ZYgSQ4,28852
  dcicutils/scripts/publish_to_pypi.py,sha256=qmWyjrg5bNQNfpNKFTZdyMXpRmrECnRV9VmNQddUPQA,13576
  dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19745
+ dcicutils/sheet_utils.py,sha256=bnnefjeTUL4ES7gtqThISXJKeli1AIFryu4h7Dt9dxw,47040
  dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
  dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
  dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
  dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
- dcicutils-7.11.0.dist-info/LICENSE.txt,sha256=t0_-jIjqxNnymZoNJe-OltRIuuF8qfhN0ATlHyrUJPk,1102
- dcicutils-7.11.0.dist-info/METADATA,sha256=ShBT5l8cU5eq4WKOCVNrfzoE5tPwOt7vh5WAXqgfM4Q,2999
- dcicutils-7.11.0.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- dcicutils-7.11.0.dist-info/entry_points.txt,sha256=Z3vezbXsTpTIY4N2F33c5e-WDVQxgz_Vsk1oV_JBN7A,146
- dcicutils-7.11.0.dist-info/RECORD,,
+ dcicutils-7.11.0.1b9.dist-info/LICENSE.txt,sha256=t0_-jIjqxNnymZoNJe-OltRIuuF8qfhN0ATlHyrUJPk,1102
+ dcicutils-7.11.0.1b9.dist-info/METADATA,sha256=MER7N-gDAB5nz6YT51jT7aIu8_rHT2x65FBF5x3DN70,3084
+ dcicutils-7.11.0.1b9.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ dcicutils-7.11.0.1b9.dist-info/entry_points.txt,sha256=Z3vezbXsTpTIY4N2F33c5e-WDVQxgz_Vsk1oV_JBN7A,146
+ dcicutils-7.11.0.1b9.dist-info/RECORD,,