dcicutils 7.11.0__py3-none-any.whl → 7.11.0.1b9__py3-none-any.whl


dcicutils/misc_utils.py CHANGED
@@ -9,6 +9,7 @@ import hashlib
9
9
  import inspect
10
10
  import math
11
11
  import io
12
+ import json
12
13
  import os
13
14
  import logging
14
15
  import pytz
@@ -191,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp):
191
192
  pass
192
193
 
193
194
 
194
- class VirtualApp:
195
+ class AbstractVirtualApp:
196
+ pass
197
+
198
+
199
+ class VirtualApp(AbstractVirtualApp):
195
200
  """
196
201
  Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
197
202
  simulating a number of conditions, including permissions.
@@ -1352,6 +1357,25 @@ def capitalize1(s):
1352
1357
  return s[:1].upper() + s[1:]
1353
1358
 
1354
1359
 
1360
+ """
1361
+ Python's UUID ignores all dashes, whereas Postgres is more strict
1362
+ See http://www.postgresql.org/docs/9.2/static/datatype-uuid.html
1364
+ And, anyway, this pattern is what our portals have been doing
1365
+ for quite a while, so it's the most stable choice for us now.
1366
+ """
1367
+
1368
+ uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?')
1369
+
1370
+
1371
+ def is_uuid(instance):
1372
+ """
1373
+ Predicate returns true for any group of 32 hex characters with optional hyphens every four characters.
1374
+ Matching is case-insensitive and tolerates surrounding curly braces. See other notes on this design choice above.
1375
+ """
1376
+ return bool(uuid_re.match(instance))
1377
+
1378
+
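For illustration, a minimal sketch of how the new is_uuid predicate behaves (the sample values are made up for the example):

    from dcicutils.misc_utils import is_uuid

    is_uuid("123e4567-e89b-12d3-a456-426614174000")  # True (hyphenated form)
    is_uuid("123e4567e89b12d3a456426614174000")      # True (hyphens are optional)
    is_uuid("not-a-uuid")                            # False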
1355
1379
  def string_list(s):
1356
1380
  """
1357
1381
  Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls.
@@ -2313,3 +2337,73 @@ def parse_in_radix(text: str, *, radix: int):
2313
2337
  except Exception:
2314
2338
  pass
2315
2339
  raise ValueError(f"Unable to parse: {text!r}")
2340
+
2341
+
2342
+ def pad_to(target_size: int, data: list, *, padding=None):
2343
+ """
2344
+ Pads a list that may be shorter than the given target size up to that size, using the given padding value.
2345
+ e.g., pad_to(3, [1, 2]) will return [1, 2, None]
2346
+ """
2347
+ actual_size = len(data)
2348
+ if actual_size < target_size:
2349
+ data = data + [padding] * (target_size - actual_size)
2350
+ return data
2351
+
2352
+
2353
+ class JsonLinesReader:
2354
+
2355
+ def __init__(self, fp, padded=False, padding=None):
2356
+ """
2357
+ Given an fp (the conventional name for a "file pointer", i.e., the thing a call to io.open returns),
2358
+ this creates an object that can be used to iterate across the lines in the JSON lines file
2359
+ that the fp is reading from.
2360
+
2361
+ There are two possible formats that this will return.
2362
+
2363
+ For files that contain a series of dictionaries, such as:
2364
+ {"something": 1, "else": "a"}
2365
+ {"something": 2, "else": "b"}
2366
+ ...etc
2367
+ this will just return those dictionaries one-by-one when iterated over.
2368
+
2369
+ The same set of dictionaries will also be yielded by a file containing:
2370
+ ["something", "else"]
2371
+ [1, "a"]
2372
+ [2, "b"]
2373
+ ...etc
2374
+ where the first list is treated as headers and each later list is zipped with those headers to build the dictionaries.
2375
+
2376
+ NOTES:
2377
+
2378
+ * In the second case, shorter lists on subsequent lines return only partial dictionaries.
2379
+ * In the second case, longer lists on subsequent lines will quietly drop any extra elements.
2380
+ """
2381
+
2382
+ self.fp = fp
2383
+ self.padded: bool = padded
2384
+ self.padding = padding
2385
+ self.headers = None # Might change after we see first line
2386
+
2387
+ def __iter__(self):
2388
+ first_line = True
2389
+ n_headers = 0
2390
+ for raw_line in self.fp:
2391
+ line = json.loads(raw_line)
2392
+ if first_line:
2393
+ first_line = False
2394
+ if isinstance(line, list):
2395
+ self.headers = line
2396
+ n_headers = len(line)
2397
+ continue
2398
+ # If length of line is more than we expect, ignore it. Let user put comments beyond our table
2399
+ # But if length of line is less than we expect, extend the line with None
2400
+ if self.headers:
2401
+ if not isinstance(line, list):
2402
+ raise Exception("If the first line is a list, all lines must be.")
2403
+ if self.padded and len(line) < n_headers:
2404
+ line = pad_to(n_headers, line, padding=self.padding)
2405
+ yield dict(zip(self.headers, line))
2406
+ elif isinstance(line, dict):
2407
+ yield line
2408
+ else:
2409
+ raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
@@ -0,0 +1,1131 @@
1
+ import chardet
2
+ import contextlib
3
+ import copy
4
+ import csv
5
+ import glob
6
+ import io
7
+ import json
8
+ import openpyxl
9
+ import os
10
+ import re
11
+ import subprocess
12
+ import uuid
13
+ import yaml
14
+
15
+ from openpyxl.worksheet.worksheet import Worksheet
16
+ from openpyxl.workbook.workbook import Workbook
17
+ from tempfile import TemporaryFile, TemporaryDirectory
18
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
19
+ from .common import AnyJsonData
20
+ from .env_utils import public_env_name, EnvUtils
21
+ from .ff_utils import get_schema
22
+ from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are
23
+ from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix
24
+ from .task_utils import pmap
25
+
26
+
27
+ Header = str
28
+ Headers = List[str]
29
+ ParsedHeader = List[Union[str, int]]
30
+ ParsedHeaders = List[ParsedHeader]
31
+ SheetCellValue = Union[int, float, str]
32
+ SheetRow = List[SheetCellValue]
33
+ CsvReader = type(csv.reader(TemporaryFile()))
34
+ SheetData = List[dict]
35
+ TabbedSheetData = Dict[str, SheetData]
36
+ Regexp = type(re.compile("sample"))
37
+
38
+
39
+ class LoadFailure(Exception):
40
+ """
41
+ In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail,
42
+ but some errors are so confusing or so problematic that we need to just fail the load right away.
43
+ """
44
+ pass
45
+
46
+
47
+ class LoadArgumentsError(LoadFailure):
48
+ """
49
+ Errors of this class represent situations where we can't get started because
50
+ there's a problem with the given arguments.
51
+ """
52
+ pass
53
+
54
+
55
+ class LoadTableError(LoadFailure):
56
+ """
57
+ Errors of this class represent situations where we can't get started because
58
+ there's a problem with some table's syntax, for example headers that don't make sense.
59
+ """
60
+ pass
61
+
62
+
63
+ @contextlib.contextmanager
64
+ def deferred_problems():
65
+ problems = []
66
+
67
+ def note_problems(problem):
68
+ problems.append(problem)
69
+
70
+ yield note_problems
71
+
72
+ if problems:
73
+ for problem in problems:
74
+ PRINT(f"Problem: {problem}")
75
+ raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False))
76
+
77
+
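A minimal sketch of how deferred_problems is meant to be used: problems are collected while the body runs, then reported together when the context exits.

    with deferred_problems() as note_problem:
        note_problem("first thing that went wrong")
        note_problem("second thing that went wrong")
    # On exit, each problem is PRINTed and a single summary Exception is raised.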
78
+ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False):
79
+ if kwargs:
80
+ unwanted = [f"{argname}={value!r}" if detailed else argname
81
+ for argname, value in kwargs.items()
82
+ if value is not None]
83
+ if unwanted:
84
+ does_not = "don't" if context_plural else "doesn't"
85
+ raise LoadArgumentsError(f"{context} {does_not} use"
86
+ f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")
87
+
88
+
89
+ def prefer_number(value: SheetCellValue):
90
+ if isinstance(value, str): # the given value might be an int or float, in which case just fall through
91
+ if not value:
92
+ return None
93
+ value = value
94
+ ch0 = value[0]
95
+ if ch0 == '+' or ch0 == '-' or ch0.isdigit():
96
+ try:
97
+ return int(value)
98
+ except Exception:
99
+ pass
100
+ try:
101
+ return float(value)
102
+ except Exception:
103
+ pass
104
+ # If we couldn't parse it as an int or float, fall through to returning the original value
105
+ pass
106
+ return value
107
+
108
+
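For example, prefer_number converts numeric-looking strings and leaves everything else alone; a sketch:

    prefer_number("123")    # 123 (int)
    prefer_number("1.5")    # 1.5 (float)
    prefer_number("")       # None
    prefer_number("abc")    # "abc" (unchanged)
    prefer_number(7)        # 7 (non-strings fall through untouched)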
109
+ def expand_string_escape_sequences(text: str) -> str:
110
+ s = io.StringIO()
111
+ escaping = False
112
+ for ch in text:
113
+ if escaping:
114
+ if ch == 'r':
115
+ s.write('\r')
116
+ elif ch == 't':
117
+ s.write('\t')
118
+ elif ch == 'n':
119
+ s.write('\n')
120
+ elif ch == '\\':
121
+ s.write('\\')
122
+ else:
123
+ # Rather than err, just leave other sequences as-is.
124
+ s.write(f"\\{ch}")
125
+ escaping = False
126
+ elif ch == '\\':
127
+ escaping = True
128
+ else:
129
+ s.write(ch)
130
+ return s.getvalue()
131
+
132
+
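A quick sketch of the escape expansion above; only \r, \t, \n, and \\ are translated, and anything else is left as written:

    expand_string_escape_sequences("a\\tb")   # "a" + TAB + "b"
    expand_string_escape_sequences("a\\nb")   # "a" + NEWLINE + "b"
    expand_string_escape_sequences("a\\\\b")  # "a" + backslash + "b"
    expand_string_escape_sequences("a\\qb")   # "a\\qb" (unknown sequence kept as-is)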
133
+ def open_unicode_text_input_file_respecting_byte_order_mark(filename):
134
+ """
135
+ Opens a file for text input, respecting a byte-order mark (BOM).
136
+ """
137
+ with io.open(filename, 'rb') as fp:
138
+ leading_bytes = fp.read(4 * 8) # 32 bytes is more than enough to detect a BOM
139
+ bom_info = chardet.detect(leading_bytes, should_rename_legacy=True)
140
+ detected_encoding = bom_info and bom_info.get('encoding') # tread lightly
141
+ use_encoding = 'utf-8' if detected_encoding == 'ascii' else detected_encoding
142
+ return io.open(filename, 'r', encoding=use_encoding)
143
+
144
+
145
+ class TypeHint:
146
+ def apply_hint(self, value):
147
+ return value
148
+
149
+ def __str__(self):
150
+ return f"<{self.__class__.__name__}>"
151
+
152
+ def __repr__(self):
153
+ return self.__str__()
154
+
155
+
156
+ class BoolHint(TypeHint):
157
+
158
+ def apply_hint(self, value):
159
+ if isinstance(value, str) and value:
160
+ if 'true'.startswith(value.lower()):
161
+ return True
162
+ elif 'false'.startswith(value.lower()):
163
+ return False
164
+ return super().apply_hint(value)
165
+
166
+
167
+ class EnumHint(TypeHint):
168
+
169
+ def __str__(self):
170
+ return f"<EnumHint {','.join(f'{key}={val}' for key, val in self.value_map.items())}>"
171
+
172
+ def __init__(self, value_map):
173
+ self.value_map = value_map
174
+
175
+ def apply_hint(self, value):
176
+ if isinstance(value, str):
177
+ if value in self.value_map:
178
+ result = self.value_map[value]
179
+ return result
180
+ else:
181
+ lvalue = value.lower()
182
+ found = []
183
+ for lkey, key in self.value_map.items():
184
+ if lkey.startswith(lvalue):
185
+ found.append(lkey)
186
+ if len(found) == 1:
187
+ [only_found] = found
188
+ result = self.value_map[only_found]
189
+ return result
190
+ return super().apply_hint(value)
191
+
192
+
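A brief sketch of how these hints coerce spreadsheet text (the value map below is made up for the example):

    BoolHint().apply_hint("tr")       # True ("tr" is a prefix of "true")
    BoolHint().apply_hint("FALSE")    # False (case-insensitive)
    BoolHint().apply_hint("maybe")    # "maybe" (unrecognized, passed through)

    hint = EnumHint({"released": "released", "retired": "retired"})
    hint.apply_hint("released")       # "released" (exact match)
    hint.apply_hint("REL")            # "released" (unique prefix, case-insensitive)
    hint.apply_hint("r")              # "r" (ambiguous prefix, passed through)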
193
+ OptionalTypeHints = List[Optional[TypeHint]]
194
+
195
+
196
+ class ItemTools:
197
+ """
198
+ Implements operations on table-related data without pre-supposing the specific representation of the table.
199
+ It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
200
+ it does not presuppose the source of the data nor where it will be written to.
201
+
202
+ For the purpose of this class:
203
+
204
+ * a 'header' is a string representing the top of a column.
205
+
206
+ * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
207
+ "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
208
+ each numeric token as an int instead of a string.
209
+
210
+ * a 'headers' object is just a list of strings, each of which is a 'header'.
211
+
212
+ * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
213
+ e.g., the headers ["a.b.c", "x.y#0"] are represented as parsed headers [["a", "b", "c"], ["x", "y", 0]].
214
+
215
+ """
216
+
217
+ @classmethod
218
+ def parse_sheet_header(cls, header: Header) -> ParsedHeader:
219
+ result = []
220
+ token = ""
221
+ for i in range(len(header)):
222
+ ch = header[i]
223
+ if ch == '.' or ch == '#':
224
+ if token:
225
+ result.append(int(token) if token.isdigit() else token)
226
+ token = ""
227
+ else:
228
+ token += ch
229
+ if token:
230
+ result.append(int(token) if token.isdigit() else token)
231
+ return result
232
+
233
+ @classmethod
234
+ def parse_sheet_headers(cls, headers: Headers):
235
+ return [cls.parse_sheet_header(header)
236
+ for header in headers]
237
+
238
+ @classmethod
239
+ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
240
+ prototype = {}
241
+ for parsed_header in parsed_headers:
242
+ parsed_header0 = parsed_header[0]
243
+ if isinstance(parsed_header0, int):
244
+ raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}")
245
+ cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
246
+ return prototype
247
+
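To make the header conventions above concrete, a small worked sketch:

    headers = ["a.b.c", "x.y#0"]
    parsed = ItemTools.parse_sheet_headers(headers)
    # parsed == [["a", "b", "c"], ["x", "y", 0]]

    prototype = ItemTools.compute_patch_prototype(parsed)
    # prototype == {"a": {"b": {"c": None}}, "x": {"y": [None]}}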
248
+ @classmethod
249
+ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
250
+ [key0, *more_keys] = keys
251
+ key1 = more_keys[0] if more_keys else None
252
+ if isinstance(key1, int):
253
+ placeholder = []
254
+ elif isinstance(key1, str):
255
+ placeholder = {}
256
+ else:
257
+ placeholder = None
258
+ if isinstance(key0, int):
259
+ n = len(parent)
260
+ if key0 == n:
261
+ parent.append(placeholder)
262
+ elif key0 > n:
263
+ raise LoadTableError("Numeric items must occur sequentially.")
264
+ elif isinstance(key0, str):
265
+ if key0 not in parent:
266
+ parent[key0] = placeholder
267
+ if key1 is not None:
268
+ cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
269
+ return parent
270
+
271
+ INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default
272
+
273
+ @classmethod
274
+ def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData:
275
+ # TODO: Remodularize this for easier testing and more Schema-driven effect
276
+ # Doug asks that this be broken up into different mechanisms, more modular and separately testable.
277
+ # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired.
278
+ if isinstance(value, str):
279
+ lvalue = value.lower()
280
+ # TODO: We could consult a schema to make this less heuristic, but this may do for now
281
+ if lvalue == 'true':
282
+ return True
283
+ elif lvalue == 'false':
284
+ return False
285
+ elif lvalue == 'null' or lvalue == '':
286
+ return None
287
+ elif '|' in value:
288
+ if value == '|': # Use '|' for []
289
+ return []
290
+ else:
291
+ if value.endswith("|"): # Use 'foo|' for ['foo']
292
+ value = value[:-1]
293
+ return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')]
294
+ elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'):
295
+ # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid
296
+ return cls.get_instaguid(value, context=context)
297
+ else:
298
+ # Doug points out that the schema might not agree, might want a string representation of a number.
299
+ # At this semantic layer, this might be a bad choice.
300
+ return prefer_number(value)
301
+ else: # presumably a number (int or float)
302
+ return value
303
+
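A sketch of the cell-value parsing rules above:

    ItemTools.parse_item_value("true")     # True
    ItemTools.parse_item_value("null")     # None (as is "")
    ItemTools.parse_item_value("3")        # 3
    ItemTools.parse_item_value("a|b|c")    # ["a", "b", "c"]
    ItemTools.parse_item_value("foo|")     # ["foo"]
    ItemTools.parse_item_value("|")        # []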
304
+ @classmethod
305
+ def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None):
306
+ if context is None:
307
+ return guid_placeholder
308
+ else:
309
+ referent = context.get(guid_placeholder)
310
+ if not referent:
311
+ context[guid_placeholder] = referent = str(uuid.uuid4())
312
+ return referent
313
+
314
+ @classmethod
315
+ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
316
+ if (value is None or value == '') and not force:
317
+ return
318
+ [key, *more_path] = path
319
+ if not more_path:
320
+ datum[key] = value
321
+ else:
322
+ cls.set_path_value(datum[key], more_path, value)
323
+
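Continuing the prototype example, set_path_value drops a parsed value into the right slot of a (copied) prototype:

    datum = {"a": {"b": {"c": None}}, "x": {"y": [None]}}
    ItemTools.set_path_value(datum, ["a", "b", "c"], "hello")
    ItemTools.set_path_value(datum, ["x", "y", 0], 17)
    # datum == {"a": {"b": {"c": "hello"}}, "x": {"y": [17]}}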
324
+ @classmethod
325
+ def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any):
326
+
327
+ def finder(subheader, subschema):
328
+ if not parsed_header:
329
+ return None
330
+ else:
331
+ [key1, *other_headers] = subheader
332
+ if isinstance(key1, str) and isinstance(subschema, dict):
333
+ if subschema.get('type') == 'object':
334
+ def1 = subschema.get('properties', {}).get(key1)
335
+ if not other_headers:
336
+ if def1 is not None:
337
+ t = def1.get('type')
338
+ if t == 'string':
339
+ enum = def1.get('enum')
340
+ if enum:
341
+ mapping = {e.lower(): e for e in enum}
342
+ return EnumHint(mapping)
343
+ elif t == 'boolean':
344
+ return BoolHint()
345
+ else:
346
+ pass # fall through to asking super()
347
+ else:
348
+ pass # fall through to asking super()
349
+ else:
350
+ return finder(subheader=other_headers, subschema=def1)
351
+
352
+ return finder(subheader=parsed_header, subschema=schema)
353
+
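A sketch of find_type_hint against a fragment of a schema (the schema shown is made up for the example):

    schema = {
        "type": "object",
        "properties": {
            "status": {"type": "string", "enum": ["released", "in review"]},
            "public": {"type": "boolean"},
        },
    }
    ItemTools.find_type_hint(["status"], schema)   # an EnumHint over the two enum values
    ItemTools.find_type_hint(["public"], schema)   # a BoolHint
    ItemTools.find_type_hint(["title"], schema)    # None (no hint available)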
354
+ @classmethod
355
+ def infer_tab_name(cls, filename):
356
+ return os.path.basename(filename).split('.')[0]
357
+
358
+
359
+ # TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
360
+ #
361
+ # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
362
+ # I am less certain but open to discussion. Among other things, as implemented now,
363
+ # the __init__ method here needs to run and the documentation says that ABC's won't appear
364
+ # in the method resolution order. -kmp 17-Aug-2023
365
+ # See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
366
+ class AbstractTableSetManager:
367
+ """
368
+ The TableSetManager is the spanning class of anything that wants to be able to load a table set,
369
+ regardless of what it wants to load it from. To do this, it must support a load method
370
+ that takes a filename and returns the file content in the form:
371
+ {
372
+ "Sheet1": [
373
+ {...representation of row1 as some kind of dict...},
374
+ {...representation of row2 as some kind of dict...}
375
+ ],
376
+ "Sheet2": [...],
377
+ ...,
378
+ }
379
+ It also needs some implementation of the .tab_names property.
380
+ Note that at this level of abstraction, we take no position on what form of representation is used
381
+ for the rows, as long as it is JSON data of some kind. It might be
382
+ {"col1": "val1", "col2": "val2", ...}
383
+ or it might be something more structured like
384
+ {"something": "val1", {"something_else": ["val2"]}}
385
+ Additionally, the values stored might be altered as well. In particular, the most likely alteration
386
+ is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations
387
+ happen is not constrained by this class.
388
+ """
389
+
390
+ ALLOWED_FILE_EXTENSIONS: List[str] = []
391
+
392
+ def __init__(self, filename: str, **kwargs):
393
+ self.filename: str = filename
394
+ unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs)
395
+
396
+ # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
397
+ @classmethod
398
+ def load(cls, filename: str, **kwargs) -> TabbedSheetData:
399
+ """
400
+ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
401
+ For more information, see documentation of AbstractTableSetManager.
402
+ """
403
+ raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA
404
+
405
+ @property
406
+ def tab_names(self) -> List[str]:
407
+ raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA
408
+
409
+ def load_content(self) -> Any:
410
+ raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA
411
+
412
+
413
+ class BasicTableSetManager(AbstractTableSetManager):
414
+ """
415
+ A BasicTableSetManager provides some structure that most kinds of parsers will need.
416
+ In particular, everything will likely need some way of storing headers and some way of storing content
417
+ of each sheet. Even a csv file, which doesn't have multiple tabs, can be seen as the degenerate case
418
+ of this where there's only one set of headers and only one block of content.
419
+ """
420
+
421
+ def __init__(self, filename: str, **kwargs):
422
+ super().__init__(filename=filename, **kwargs)
423
+ self.headers_by_tab_name: Dict[str, Headers] = {}
424
+ self.content_by_tab_name: Dict[str, SheetData] = {}
425
+ self.reader_agent: Any = self._get_reader_agent()
426
+
427
+ def tab_headers(self, tab_name: str) -> Headers:
428
+ return self.headers_by_tab_name[tab_name]
429
+
430
+ def tab_content(self, tab_name: str) -> List[AnyJsonData]:
431
+ return self.content_by_tab_name[tab_name]
432
+
433
+ @classmethod
434
+ def _create_tab_processor_state(cls, tab_name: str) -> Any:
435
+ """
436
+ This method provides for the possibility that some parsers will want auxiliary state,
437
+ (such as parsed headers or a line count or a table of temporary names for objects to cross-link
438
+ or some other such feature) that it carries with it as it moves from line to line parsing things.
439
+ Subclasses might therefore want to make this do something more interesting.
440
+ """
441
+ ignored(tab_name) # subclasses might need this, but we don't
442
+ return None
443
+
444
+ def _get_reader_agent(self) -> Any:
445
+ """This function is responsible for opening the workbook and returning a workbook object."""
446
+ raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA
447
+
448
+
449
+ class SemanticTableSetManager(BasicTableSetManager):
450
+ """
451
+ This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing.
452
+ Those may be:
453
+ * Excel workbook readers (.xlsx)
454
+ * Comma-separated file readers (.csv)
455
+ * Tab-separarated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright
456
+ refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
457
+ There are two levels to each of these: a class that is not semantically interpreted,
458
+ and a class that is semantically interpreted as an "item".
459
+
460
+ This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing
461
+ were already done (in part so that they can be used to test the results of other formats):
462
+ * Json files
463
+ * Yaml files
464
+ * Inserts directories
465
+ * JsonLines files
466
+ """
467
+
468
+ @classmethod
469
+ def load(cls, filename: str, **kwargs) -> AnyJsonData:
470
+ if cls.ALLOWED_FILE_EXTENSIONS:
471
+ if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
472
+ raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
473
+ f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")
474
+
475
+ table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs)
476
+ return table_set_manager.load_content()
477
+
478
+ def __init__(self, filename: str, **kwargs):
479
+ super().__init__(filename=filename, **kwargs)
480
+
481
+ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
482
+ """
483
+ Given a tab_name and a state (returned by _sheet_loader_state), return a generator for a set of row values.
484
+ """
485
+ raise NotImplementedError(f"._rows_for_tab_name(...) is not implemented for {self.__class__.__name__}.") # noQA
486
+
487
+ def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
488
+ """
489
+ This needs to take a state and whatever represents a row and
490
+ must return a list of objects representing column values.
491
+ What constitutes a processed row is up to the class, but other than that the result must be a JSON dictionary.
492
+ """
493
+ raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA
494
+
495
+ def load_content(self) -> AnyJsonData:
496
+ for tab_name in self.tab_names:
497
+ sheet_content = []
498
+ state = self._create_tab_processor_state(tab_name)
499
+ for row_data in self._raw_row_generator_for_tab_name(tab_name):
500
+ processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data)
501
+ sheet_content.append(processed_row_data)
502
+ self.content_by_tab_name[tab_name] = sheet_content
503
+ return self.content_by_tab_name
504
+
505
+ @classmethod
506
+ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
507
+ return prefer_number(value)
508
+
509
+
510
+ class AbstractItemManager(AbstractTableSetManager):
511
+
512
+ pass
513
+
514
+
515
+ class TableSetManagerRegistry:
516
+
517
+ def __init__(self):
518
+ self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {}
519
+ self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = []
520
+
521
+ def register(self, regexp: Optional[str] = None):
522
+ def _wrapped_register(class_to_register: Type[AbstractTableSetManager]):
523
+ if regexp:
524
+ self.regexp_mappings.append((re.compile(regexp), class_to_register))
525
+ for ext in class_to_register.ALLOWED_FILE_EXTENSIONS:
526
+ existing = self.manager_table.get(ext)
527
+ if existing:
528
+ raise Exception(f"Tried to define {class_to_register} to extension {ext},"
529
+ f" but {existing} already claimed that.")
530
+ self.manager_table[ext] = class_to_register
531
+ return class_to_register
532
+ return _wrapped_register
533
+
534
+ register1 = register
535
+
536
+ def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]:
537
+ base: str = os.path.basename(filename)
538
+ suffix_parts = base.split('.')[1:]
539
+ if suffix_parts:
540
+ for i in range(0, len(suffix_parts)):
541
+ suffix = f".{'.'.join(suffix_parts[i:])}"
542
+ found: Optional[Type[AbstractTableSetManager]] = self.manager_table.get(suffix)
543
+ if found:
544
+ return found
545
+ else:
546
+ special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename)
547
+ if special_case:
548
+ return special_case
549
+ raise LoadArgumentsError(f"Unknown file type: {filename}")
550
+
551
+ def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]:
552
+ for pattern, manager_class in self.regexp_mappings:
553
+ if pattern.match(filename):
554
+ return manager_class
555
+ return None
556
+
557
+
558
+ TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry()
559
+ ITEM_MANAGER_REGISTRY = TableSetManagerRegistry()
560
+
561
+
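For orientation, a sketch of how the registry resolves file names once the managers below have registered themselves (file names are hypothetical):

    TABLE_SET_MANAGER_REGISTRY.manager_for_filename("book.xlsx")        # XlsxManager
    TABLE_SET_MANAGER_REGISTRY.manager_for_filename("rows.tabs.json")   # TabbedJsonInsertsManager
    ITEM_MANAGER_REGISTRY.manager_for_filename("my-test-inserts/")      # InsertsDirectoryItemManager (regexp match)
    TABLE_SET_MANAGER_REGISTRY.manager_for_filename("data.unknown")     # raises LoadArgumentsError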
562
+ @TABLE_SET_MANAGER_REGISTRY.register()
563
+ class XlsxManager(SemanticTableSetManager):
564
+ """
565
+ This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
566
+ """
567
+
568
+ ALLOWED_FILE_EXTENSIONS = ['.xlsx']
569
+
570
+ @classmethod
571
+ def _all_rows(cls, sheet: Worksheet):
572
+ row_max = sheet.max_row
573
+ for row in range(2, row_max + 1):
574
+ yield row
575
+
576
+ @classmethod
577
+ def _all_cols(cls, sheet: Worksheet):
578
+ col_max = sheet.max_column
579
+ for col in range(1, col_max + 1):
580
+ yield col
581
+
582
+ @property
583
+ def tab_names(self) -> List[str]:
584
+ return self.reader_agent.sheetnames
585
+
586
+ def _get_reader_agent(self) -> Workbook:
587
+ return openpyxl.load_workbook(self.filename)
588
+
589
+ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
590
+ sheet = self.reader_agent[tab_name]
591
+ return (self._get_raw_row_content_tuple(sheet, row)
592
+ for row in self._all_rows(sheet))
593
+
594
+ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
595
+ return [sheet.cell(row=row, column=col).value
596
+ for col in self._all_cols(sheet)]
597
+
598
+ def _create_tab_processor_state(self, tab_name: str) -> Headers:
599
+ sheet = self.reader_agent[tab_name]
600
+ headers: Headers = [str(sheet.cell(row=1, column=col).value)
601
+ for col in self._all_cols(sheet)]
602
+ self.headers_by_tab_name[sheet.title] = headers
603
+ return headers
604
+
605
+ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
606
+ ignored(tab_name)
607
+ return {headers[i]: self.parse_cell_value(row_datum)
608
+ for i, row_datum in enumerate(row_data)}
609
+
610
+
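A sketch of reading a workbook with the plain (non-item) manager; the filename and sheet contents are hypothetical:

    tabs = XlsxManager.load("experiments.xlsx")
    # {"Sheet1": [{"name": "exp1", "count": 3}, ...], "Sheet2": [...], ...}
    # Cell values pass through parse_cell_value, so a text cell "3" arrives as the int 3.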
611
+ class SchemaAutoloadMixin(AbstractTableSetManager):
612
+
613
+ SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
614
+ CACHE_SCHEMAS = True # Controls whether we're doing caching at all
615
+ AUTOLOAD_SCHEMAS_DEFAULT = True
616
+
617
+ def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
618
+ portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs):
619
+ # This setup must be in place before the class initialization is done (via the super call).
620
+ self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas
621
+ if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting.
622
+ if portal_env is None and portal_vapp is None:
623
+ portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
624
+ PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
625
+ self.portal_env: Optional[str] = portal_env
626
+ self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp
627
+ super().__init__(filename=filename, **kwargs)
628
+
629
+ def fetch_relevant_schemas(self, schema_names: List[str]):
630
+ # The schema_names argument is not normally given, but it is there for easier testing
631
+ def fetch_schema(schema_name):
632
+ schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
633
+ return schema_name, schema
634
+ if self.autoload_schemas and (self.portal_env or self.portal_vapp):
635
+ autoloaded = {tab_name: schema
636
+ for tab_name, schema in pmap(fetch_schema, schema_names)}
637
+ return autoloaded
638
+ else:
639
+ return {}
640
+
641
+ @classmethod
642
+ def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None,
643
+ portal_vapp: Optional[AbstractVirtualApp] = None):
644
+ def just_fetch_it():
645
+ return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp)
646
+ if cls.CACHE_SCHEMAS:
647
+ schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name)
648
+ if schema is None:
649
+ cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it()
650
+ return schema
651
+ else:
652
+ return just_fetch_it()
653
+
654
+ @classmethod
655
+ def clear_schema_cache(cls):
656
+ for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first
657
+ cls.SCHEMA_CACHE.pop(key, None)
658
+
659
+
660
+ class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager):
661
+ """
662
+ This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows
663
+ get handled like Items instead of just flat table rows.
664
+ """
665
+
666
+ def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs):
667
+ super().__init__(filename=filename, **kwargs)
668
+ self.patch_prototypes_by_tab_name: Dict[str, Dict] = {}
669
+ self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {}
670
+ self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {}
671
+ self._schemas = schemas
672
+ self._instaguid_context_table: Dict[str, str] = {}
673
+
674
+ @property
675
+ def schemas(self):
676
+ schemas = self._schemas
677
+ if schemas is None:
678
+ self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names)
679
+ return schemas
680
+
681
+ def sheet_patch_prototype(self, tab_name: str) -> Dict:
682
+ return self.patch_prototypes_by_tab_name[tab_name]
683
+
684
+ def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders:
685
+ return self.parsed_headers_by_tab_name[tab_name]
686
+
687
+ def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints:
688
+ return self.type_hints_by_tab_name[tab_name]
689
+
690
+ class SheetState:
691
+
692
+ def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints):
693
+ self.parsed_headers = parsed_headers
694
+ self.type_hints = type_hints
695
+
696
+ def _compile_type_hints(self, tab_name: str):
697
+ parsed_headers = self.sheet_parsed_headers(tab_name)
698
+ schema = self.schemas.get(tab_name)
699
+ with deferred_problems() as note_problem:
700
+ for required_header in self._schema_required_headers(schema):
701
+ if required_header not in parsed_headers:
702
+ note_problem("Missing required header")
703
+ type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None
704
+ for parsed_header in parsed_headers]
705
+ self.type_hints_by_tab_name[tab_name] = type_hints
706
+
707
+ @classmethod
708
+ def _schema_required_headers(cls, schema):
709
+ ignored(schema)
710
+ return [] # TODO: Make this compute a list of required headers (in parsed header form)
711
+
712
+ def _compile_sheet_headers(self, tab_name: str):
713
+ headers = self.headers_by_tab_name[tab_name]
714
+ parsed_headers = ItemTools.parse_sheet_headers(headers)
715
+ self.parsed_headers_by_tab_name[tab_name] = parsed_headers
716
+ prototype = ItemTools.compute_patch_prototype(parsed_headers)
717
+ self.patch_prototypes_by_tab_name[tab_name] = prototype
718
+
719
+ def _create_tab_processor_state(self, tab_name: str) -> SheetState:
720
+ super()._create_tab_processor_state(tab_name)
721
+ # This will create state that allows us to efficiently assign values in the right place on each row
722
+ # by setting up a prototype we can copy and then drop values into.
723
+ self._compile_sheet_headers(tab_name)
724
+ self._compile_type_hints(tab_name)
725
+ return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name),
726
+ type_hints=self.sheet_type_hints(tab_name))
727
+
728
+ def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData:
729
+ parsed_headers = state.parsed_headers
730
+ type_hints = state.type_hints
731
+ patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name))
732
+ for i, value in enumerate(row_data):
733
+ parsed_value = self.parse_cell_value(value)
734
+ type_hint = type_hints[i]
735
+ if type_hint:
736
+ parsed_value = type_hint.apply_hint(parsed_value)
737
+ ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value)
738
+ return patch_item
739
+
740
+ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
741
+ return ItemTools.parse_item_value(value, context=self._instaguid_context_table)
742
+
743
+
744
+ @ITEM_MANAGER_REGISTRY.register()
745
+ class XlsxItemManager(ItemManagerMixin, XlsxManager):
746
+ """
747
+ This layers item-style row processing functionality on an XLSX file.
748
+ """
749
+ pass
750
+
751
+
752
+ class SingleTableMixin(AbstractTableSetManager):
753
+
754
+ def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
755
+ self._tab_name = tab_name or ItemTools.infer_tab_name(filename)
756
+ super().__init__(filename=filename, **kwargs)
757
+
758
+ @property
759
+ def tab_names(self) -> List[str]:
760
+ return [self._tab_name]
761
+
762
+
763
+ class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really appropriate here
764
+
765
+ ALLOWED_FILE_EXTENSIONS = []
766
+
767
+ def _parse_inserts_data(self, filename: str) -> AnyJsonData:
768
+ raise NotImplementedError(f"._parse_inserts_dataa(...) is not implemented for {self.__class__.__name__}.") # noQA
769
+
770
+ def _load_inserts_data(self, filename: str) -> TabbedSheetData:
771
+ data: AnyJsonData = self._parse_inserts_data(filename)
772
+ tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data)
773
+ if (not isinstance(tabbed_inserts, dict)
774
+ or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys())
775
+ or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content)
776
+ for content in tabbed_inserts.values())):
777
+ raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).")
778
+ tabbed_inserts: TabbedSheetData # we've just checked that
779
+ return tabbed_inserts
780
+
781
+ @classmethod
782
+ def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData:
783
+ ignored(filename)
784
+ return data
785
+
786
+ @property
787
+ def tab_names(self) -> List[str]:
788
+ return list(self.content_by_tab_name.keys())
789
+
790
+ def _get_reader_agent(self) -> Any:
791
+ return self
792
+
793
+ def load_content(self) -> Dict[str, AnyJsonData]:
794
+ data = self._load_inserts_data(self.filename)
795
+ for tab_name, tab_content in data.items():
796
+ self.content_by_tab_name[tab_name] = tab_content
797
+ if not tab_content:
798
+ self.headers_by_tab_name[tab_name] = []
799
+ else:
800
+ self.headers_by_tab_name[tab_name] = list(tab_content[0].keys())
801
+ return self.content_by_tab_name
802
+
803
+
804
+ class SimpleInsertsMixin(SingleTableMixin):
805
+
806
+ def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData:
807
+ if (not isinstance(data, list)
808
+ or not all(isinstance(item, dict) for item in data)):
809
+ raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).")
810
+ return {self._tab_name: data}
811
+
812
+
813
+ class JsonInsertsMixin:
814
+
815
+ @classmethod
816
+ def _parse_inserts_data(cls, filename: str) -> AnyJsonData:
817
+ return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
818
+
819
+
820
+ @TABLE_SET_MANAGER_REGISTRY.register()
821
+ class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager):
822
+
823
+ ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension
824
+
825
+
826
+ @TABLE_SET_MANAGER_REGISTRY.register()
827
+ class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager):
828
+
829
+ ALLOWED_FILE_EXTENSIONS = [".json"]
830
+
831
+
832
+ class YamlInsertsMixin:
833
+
834
+ def _parse_inserts_data(self, filename) -> AnyJsonData:
835
+ return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
836
+
837
+
838
+ @TABLE_SET_MANAGER_REGISTRY.register()
839
+ class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager):
840
+
841
+ ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"]
842
+
843
+ def _parse_inserts_data(self, filename) -> AnyJsonData:
844
+ return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename))
845
+
846
+
847
+ @TABLE_SET_MANAGER_REGISTRY.register()
848
+ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager):
849
+
850
+ ALLOWED_FILE_EXTENSIONS = [".yaml"]
851
+
852
+
853
+ class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here
854
+ """
855
+ This class is used for inserts directories and other JSON-like data that will be literally used as an Item
856
+ without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness
857
+ but instead assumed to have been checked by other means.
858
+ """
859
+
860
+ AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value.
861
+
862
+ def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
863
+ portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None,
864
+ **kwargs):
865
+ ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that.
866
+ if schemas not in [None, {}]:
867
+ raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.")
868
+ if autoload_schemas not in [None, False]:
869
+ raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.")
870
+ super().__init__(filename=filename, **kwargs)
871
+
872
+
873
+ @ITEM_MANAGER_REGISTRY.register()
874
+ class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager):
875
+ pass
876
+
877
+
878
+ @ITEM_MANAGER_REGISTRY.register()
879
+ class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager):
880
+ pass
881
+
882
+
883
+ @ITEM_MANAGER_REGISTRY.register()
884
+ class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager):
885
+ pass
886
+
887
+
888
+ @ITEM_MANAGER_REGISTRY.register()
889
+ class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager):
890
+ pass
891
+
892
+
893
+ @TABLE_SET_MANAGER_REGISTRY.register()
894
+ class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager):
895
+
896
+ ALLOWED_FILE_EXTENSIONS = [".jsonl"]
897
+
898
+ def _parse_inserts_data(self, filename: str) -> AnyJsonData:
899
+ return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))]
900
+
901
+
902
+ @ITEM_MANAGER_REGISTRY.register()
903
+ class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager):
904
+ pass
905
+
906
+
907
+ @TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
908
+ class InsertsDirectoryManager(InsertsManager):
909
+
910
+ ALLOWED_FILE_EXTENSIONS = []
911
+
912
+ def _parse_inserts_data(self, filename: str) -> AnyJsonData:
913
+ if not os.path.isdir(filename):
914
+ raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.")
915
+ tab_files = glob.glob(os.path.join(filename, "*.json"))
916
+ data = {}
917
+ for tab_file in tab_files:
918
+ tab_content = json.load(open_unicode_text_input_file_respecting_byte_order_mark(tab_file))
919
+ # Here we don't use os.path.splitext because we want to split on the first dot.
920
+ # e.g., for foo.bar.baz, return just foo
921
+ # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes
922
+ # for things unrelated to the tab name.
923
+ tab_name = os.path.basename(tab_file).split('.')[0]
924
+ data[tab_name] = tab_content
925
+ return data
926
+
927
+
928
+ @ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$")
929
+ class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager):
930
+ pass
931
+
932
+
933
+ @TABLE_SET_MANAGER_REGISTRY.register()
934
+ class CsvManager(SingleTableMixin, SemanticTableSetManager):
935
+ """
936
+ This implements the mechanism to get a series of rows out of the sheet in a csv file,
937
+ returning a result that still looks like there could have been multiple tabs.
938
+ """
939
+
940
+ ALLOWED_FILE_EXTENSIONS = ['.csv']
941
+
942
+ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
943
+ super().__init__(filename=filename, **kwargs)
944
+ self.escaping: bool = escaping or False
945
+
946
+ def _get_reader_agent(self) -> CsvReader:
947
+ return self._get_reader_agent_for_filename(self.filename)
948
+
949
+ @classmethod
950
+ def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
951
+ return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename))
952
+
953
+ PAD_TRAILING_TABS = True
954
+
955
+ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
956
+ headers = self.tab_headers(tab_name)
957
+ n_headers = len(headers)
958
+ for row_data in self.reader_agent:
959
+ if self.PAD_TRAILING_TABS:
960
+ row_data = pad_to(n_headers, row_data, padding='')
961
+ yield row_data
962
+
963
+ def _create_tab_processor_state(self, tab_name: str) -> Headers:
964
+ headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name)
965
+ if headers is None:
966
+ self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__()
967
+ return headers
968
+
969
+ @classmethod
970
+ def _escape_cell_text(cls, cell_text):
971
+ if '\\' in cell_text:
972
+ return expand_string_escape_sequences(cell_text)
973
+ else:
974
+ return cell_text
975
+
976
+ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
977
+ ignored(tab_name)
978
+ if self.escaping:
979
+ return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text))
980
+ for i, cell_text in enumerate(row_data)}
981
+ else:
982
+ return {headers[i]: self.parse_cell_value(cell_text)
983
+ for i, cell_text in enumerate(row_data)}
984
+
985
+
986
+ @ITEM_MANAGER_REGISTRY.register()
987
+ class CsvItemManager(ItemManagerMixin, CsvManager):
988
+ """
989
+ This layers item-style row processing functionality on a CSV file.
990
+ """
991
+ pass
992
+
993
+
994
+ @TABLE_SET_MANAGER_REGISTRY.register()
995
+ class TsvManager(CsvManager):
996
+ """
997
+ TSV files are just CSV files with tabs instead of commas as separators.
998
+ (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.)
999
+ """
1000
+ ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt']
1001
+
1002
+ @classmethod
1003
+ def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
1004
+ return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')
1005
+
1006
+
1007
+ @ITEM_MANAGER_REGISTRY.register()
1008
+ class TsvItemManager(ItemManagerMixin, TsvManager):
1009
+ """
1010
+ This layers item-style row processing functionality on a TSV file.
1011
+ """
1012
+ pass
1013
+
1014
+
1015
+ def _do_shell_command(command, cwd=None):
1016
+ # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023
1017
+ subprocess.check_output(command, cwd=cwd)
1018
+
1019
+
1020
+ @contextlib.contextmanager
1021
+ def maybe_unpack(filename): # Maybe move to another module
1022
+ """
1023
+ If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not).
1024
+ """
1025
+ unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip']
1026
+ ext = None
1027
+ for unpackable in unpackables:
1028
+ if filename.endswith(unpackable):
1029
+ ext = unpackable
1030
+ break
1031
+ if not ext:
1032
+ yield filename
1033
+ return
1034
+ if not os.path.exists(filename):
1035
+ # We don't bother to raise this error if we're not planning to do any unpacking.
1036
+ # The caller can decide if/when such errors are needed in that case.
1037
+ # But if we are going to have to move bits around, they'll need to actually be there.
1038
+ # -kmp 12-Sep-2023
1039
+ raise ValueError(f"The file {filename!r} does not exist.")
1040
+ target_base_part = remove_suffix(ext, os.path.basename(filename), required=True)
1041
+ target_ext = '.tar.gz' if ext == '.tgz' else ext
1042
+ with TemporaryDirectory() as temp_dir:
1043
+ temp_base = os.path.join(temp_dir, target_base_part)
1044
+ temp_filename = temp_base + target_ext
1045
+ _do_shell_command(['cp', filename, temp_filename])
1046
+ if temp_filename.endswith('.gz'):
1047
+ _do_shell_command(['gunzip', temp_filename], cwd=temp_dir)
1048
+ temp_filename = remove_suffix('.gz', temp_filename)
1049
+ elif temp_filename.endswith(".zip"):
1050
+ _do_shell_command(['unzip', temp_filename], cwd=temp_dir)
1051
+ temp_filename = remove_suffix('.zip', temp_filename)
1052
+ if temp_filename.endswith(".tar"):
1053
+ _do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir)
1054
+ tar_file = temp_filename
1055
+ temp_filename = remove_suffix(".tar", temp_filename, required=True)
1056
+ if not os.path.isdir(temp_filename):
1057
+ raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}")
1058
+ # print(f"Unpacked {filename} to {temp_filename}")
1059
+ yield temp_filename
1060
+
1061
+
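A sketch of maybe_unpack; the archive name is hypothetical. Packed inputs are copied into a temporary directory and unpacked there, while already-unpacked names are yielded untouched:

    with maybe_unpack("metadata.csv.gz") as unpacked:
        print(unpacked)   # path to metadata.csv inside a temporary directory
    # A plain "metadata.csv" (no archive suffix) is yielded back unchanged.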
1062
+ class TableSetManager(AbstractTableSetManager):
1063
+ """
1064
+ This class will open a .xlsx or .csv file and load its content in our standard format.
1065
+ (See the more detailed description in AbstractTableSetManager.)
1066
+ """
1067
+
1068
+ @classmethod
1069
+ def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager:
1070
+ reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename)
1071
+ if issubclass(reader_agent_class, AbstractItemManager):
1072
+ raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.")
1073
+ reader_agent = reader_agent_class(filename=filename, **kwargs)
1074
+ return reader_agent
1075
+
1076
+ @classmethod
1077
+ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
1078
+ **kwargs) -> TabbedSheetData:
1079
+ """
1080
+ Given a filename and various options, loads the table set associated with that filename and returns its content.
1081
+ """
1082
+ with maybe_unpack(filename) as filename:
1083
+ manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
1084
+ **kwargs)
1085
+ return manager.load_content()
1086
+
1087
+
1088
+ class ItemManager(AbstractTableSetManager):
1089
+ """
1090
+ This class will open a .xlsx or .csv file and load its content in our standard format.
1091
+ (See the more detailed description in AbstractTableSetManager.)
1092
+ """
1093
+
1094
+ @classmethod
1095
+ def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager:
1096
+ reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename)
1097
+ if not issubclass(reader_agent_class, AbstractItemManager):
1098
+ raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.")
1099
+ reader_agent_class: Type[AbstractItemManager]
1100
+ reader_agent = reader_agent_class(filename=filename, **kwargs)
1101
+ return reader_agent
1102
+
1103
+ @classmethod
1104
+ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
1105
+ schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None,
1106
+ portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
1107
+ **kwargs) -> TabbedSheetData:
1108
+ """
1109
+ Given a filename and various options, loads the items associated with that filename.
1110
+
1111
+ :param filename: The name of the file to load.
1112
+ :param tab_name: For files that lack multiple tabs (such as .csv or .tsv),
1113
+ the tab name to associate with the data.
1114
+ :param escaping: Whether to perform escape processing on backslashes.
1115
+ :param schemas: A set of schemas to use instead of trying to load them.
1116
+ :param autoload_schemas: Whether to try autoloading schemas.
1117
+ :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal).
1118
+ :param portal_vapp: A vapp to use (usually if calling from within a portal).
1119
+ """
1120
+
1121
+ with maybe_unpack(filename) as filename:
1122
+
1123
+ manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
1124
+ schemas=schemas, autoload_schemas=autoload_schemas,
1125
+ portal_env=portal_env, portal_vapp=portal_vapp,
1126
+ **kwargs)
1127
+ return manager.load_content()
1128
+
1129
+
1130
+ load_table_set = TableSetManager.load
1131
+ load_items = ItemManager.load
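Finally, a usage sketch of the two module-level entry points (file names are hypothetical):

    from dcicutils.sheet_utils import load_table_set, load_items

    tabs = load_table_set("experiments.xlsx")
    # {"Sheet1": [{"col1": "val1", ...}, ...], ...} with no item-level interpretation

    items = load_items("experiments.csv", tab_name="Experiment", autoload_schemas=False)
    # Rows are parsed into item-shaped dictionaries, keyed off the dotted/indexed headers.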
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 7.11.0
3
+ Version: 7.11.0.1b9
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -25,9 +25,11 @@ Requires-Dist: PyYAML (>=5.1,<5.5)
25
25
  Requires-Dist: aws-requests-auth (>=0.4.2,<1)
26
26
  Requires-Dist: boto3 (>=1.17.39,<2.0.0)
27
27
  Requires-Dist: botocore (>=1.20.39,<2.0.0)
28
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
28
29
  Requires-Dist: docker (>=4.4.4,<5.0.0)
29
30
  Requires-Dist: elasticsearch (==7.13.4)
30
31
  Requires-Dist: gitpython (>=3.1.2,<4.0.0)
32
+ Requires-Dist: openpyxl (>=3.1.2,<4.0.0)
31
33
  Requires-Dist: opensearch-py (>=2.0.1,<3.0.0)
32
34
  Requires-Dist: pyOpenSSL (>=23.1.1,<24.0.0)
33
35
  Requires-Dist: pytz (>=2020.4)
@@ -32,7 +32,7 @@ dcicutils/kibana/readme.md,sha256=3KmHF9FH6A6xwYsNxRFLw27q0XzHYnjZOlYUnn3VkQQ,21
32
32
  dcicutils/lang_utils.py,sha256=cVLRUGyYeSPJAq3z_RJjA6miajHrXoi6baxF8HzHmLc,27797
33
33
  dcicutils/license_utils.py,sha256=OhOfTXFivvb6Y3tiJAb1b9Is-OTpBfZjC18M-RvqBqk,40456
34
34
  dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
35
- dcicutils/misc_utils.py,sha256=sXJ7ChrMyXZooaCnUtLxWHOmFIqxrxJKGJ6Ayd5i2Gk,91032
35
+ dcicutils/misc_utils.py,sha256=XisEQGMkHI7k5RiK-k4yeG8Zw00H8b-v9o2Y7mZyKb8,94548
36
36
  dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
37
37
  dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
38
38
  dcicutils/project_utils.py,sha256=qPdCaFmWUVBJw4rw342iUytwdQC0P-XKpK4mhyIulMM,31250
@@ -43,13 +43,14 @@ dcicutils/redis_utils.py,sha256=VJ-7g8pOZqR1ZCtdcjKz3-6as2DMUcs1b1zG6wSprH4,6462
43
43
  dcicutils/s3_utils.py,sha256=a9eU3Flh8Asc8xPWLGP16A6UQ_FVwhoFQNqm4ZYgSQ4,28852
44
44
  dcicutils/scripts/publish_to_pypi.py,sha256=qmWyjrg5bNQNfpNKFTZdyMXpRmrECnRV9VmNQddUPQA,13576
45
45
  dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19745
46
+ dcicutils/sheet_utils.py,sha256=bnnefjeTUL4ES7gtqThISXJKeli1AIFryu4h7Dt9dxw,47040
46
47
  dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
47
48
  dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
48
49
  dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
49
50
  dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
50
51
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
51
- dcicutils-7.11.0.dist-info/LICENSE.txt,sha256=t0_-jIjqxNnymZoNJe-OltRIuuF8qfhN0ATlHyrUJPk,1102
52
- dcicutils-7.11.0.dist-info/METADATA,sha256=ShBT5l8cU5eq4WKOCVNrfzoE5tPwOt7vh5WAXqgfM4Q,2999
53
- dcicutils-7.11.0.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
54
- dcicutils-7.11.0.dist-info/entry_points.txt,sha256=Z3vezbXsTpTIY4N2F33c5e-WDVQxgz_Vsk1oV_JBN7A,146
55
- dcicutils-7.11.0.dist-info/RECORD,,
52
+ dcicutils-7.11.0.1b9.dist-info/LICENSE.txt,sha256=t0_-jIjqxNnymZoNJe-OltRIuuF8qfhN0ATlHyrUJPk,1102
53
+ dcicutils-7.11.0.1b9.dist-info/METADATA,sha256=MER7N-gDAB5nz6YT51jT7aIu8_rHT2x65FBF5x3DN70,3084
54
+ dcicutils-7.11.0.1b9.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
55
+ dcicutils-7.11.0.1b9.dist-info/entry_points.txt,sha256=Z3vezbXsTpTIY4N2F33c5e-WDVQxgz_Vsk1oV_JBN7A,146
56
+ dcicutils-7.11.0.1b9.dist-info/RECORD,,