dcicutils 7.7.2.1b0__tar.gz → 7.8.1.1b1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of dcicutils might be problematic. Click here for more details.

Files changed (56) hide show
  1. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/PKG-INFO +2 -1
  2. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ff_utils.py +1 -1
  3. dcicutils-7.8.1.1b1/dcicutils/sheet_utils.py +471 -0
  4. dcicutils-7.8.1.1b1/dcicutils/variant_utils.py +92 -0
  5. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/pyproject.toml +2 -1
  6. dcicutils-7.7.2.1b0/dcicutils/sheet_utils.py +0 -191
  7. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/LICENSE.txt +0 -0
  8. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/README.rst +0 -0
  9. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/__init__.py +0 -0
  10. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/base.py +0 -0
  11. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/beanstalk_utils.py +0 -0
  12. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/cloudformation_utils.py +0 -0
  13. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/codebuild_utils.py +0 -0
  14. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/command_utils.py +0 -0
  15. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/common.py +0 -0
  16. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/contribution_scripts.py +0 -0
  17. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/contribution_utils.py +0 -0
  18. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/creds_utils.py +0 -0
  19. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/data_utils.py +0 -0
  20. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/deployment_utils.py +0 -0
  21. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/diff_utils.py +0 -0
  22. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/docker_utils.py +0 -0
  23. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ecr_scripts.py +0 -0
  24. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ecr_utils.py +0 -0
  25. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ecs_utils.py +0 -0
  26. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/env_base.py +0 -0
  27. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/env_manager.py +0 -0
  28. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/env_scripts.py +0 -0
  29. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/env_utils.py +0 -0
  30. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/env_utils_legacy.py +0 -0
  31. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/es_utils.py +0 -0
  32. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/exceptions.py +0 -0
  33. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ff_mocks.py +0 -0
  34. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/function_cache_decorator.py +0 -0
  35. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/glacier_utils.py +0 -0
  36. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/jh_utils.py +0 -0
  37. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/kibana/dashboards.json +0 -0
  38. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/kibana/readme.md +0 -0
  39. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/lang_utils.py +0 -0
  40. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/license_utils.py +0 -0
  41. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/log_utils.py +0 -0
  42. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/misc_utils.py +0 -0
  43. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/obfuscation_utils.py +0 -0
  44. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/opensearch_utils.py +0 -0
  45. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/project_utils.py +0 -0
  46. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/qa_checkers.py +0 -0
  47. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/qa_utils.py +0 -0
  48. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/redis_tools.py +0 -0
  49. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/redis_utils.py +0 -0
  50. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/s3_utils.py +0 -0
  51. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/scripts/publish_to_pypi.py +0 -0
  52. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/secrets_utils.py +0 -0
  53. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/snapshot_utils.py +0 -0
  54. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/ssl_certificate_utils.py +0 -0
  55. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/task_utils.py +0 -0
  56. {dcicutils-7.7.2.1b0 → dcicutils-7.8.1.1b1}/dcicutils/trace_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 7.7.2.1b0
3
+ Version: 7.8.1.1b1
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: PyYAML (>=5.1,<5.5)
25
25
  Requires-Dist: aws-requests-auth (>=0.4.2,<1)
26
26
  Requires-Dist: boto3 (>=1.17.39,<2.0.0)
27
27
  Requires-Dist: botocore (>=1.20.39,<2.0.0)
28
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
28
29
  Requires-Dist: docker (>=4.4.4,<5.0.0)
29
30
  Requires-Dist: elasticsearch (==7.13.4)
30
31
  Requires-Dist: gitpython (>=3.1.2,<4.0.0)
@@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None):
961
961
  if value.get('isAbstract') is True:
962
962
  continue
963
963
  # some test schemas in local don't have the id field
964
- schema_filename = value.get('id')
964
+ schema_filename = value.get('$id')
965
965
  if schema_filename:
966
966
  schema_name[key] = schema_filename.split('/')[-1][:-5]
967
967
  return schema_name
@@ -0,0 +1,471 @@
1
+ import chardet
2
+ import copy
3
+ import csv
4
+ import io
5
+ import openpyxl
6
+
7
+ from dcicutils.common import AnyJsonData
8
+ from dcicutils.misc_utils import ignored
9
+ from openpyxl.worksheet.worksheet import Worksheet
10
+ from openpyxl.workbook.workbook import Workbook
11
+ from tempfile import TemporaryFile
12
+ from typing import Any, Dict, Iterable, List, Union
13
+
14
+
15
+ Header = str
16
+ Headers = List[str]
17
+ ParsedHeader = List[Union[str, int]]
18
+ ParsedHeaders = List[ParsedHeader]
19
+ SheetCellValue = Union[int, float, str]
20
+ SheetRow = List[SheetCellValue]
21
+ CsvReader = type(csv.reader(TemporaryFile()))
22
+
23
+
24
def prefer_number(value: SheetCellValue):
    """
    Returns a numeric interpretation of a cell value when one is available.

    * A value that is not a string (e.g., an int or float) is returned unchanged.
    * An empty string is returned as None.
    * A string starting with '+', '-' or a digit is tried first as an int and then as a float;
      the first conversion that succeeds is returned.
    * Any other string, or one for which both conversions fail, is returned unchanged.
    """
    if not isinstance(value, str):
        # Already an int or float (or something else we do not interpret), so just hand it back.
        return value
    if not value:
        return None
    ch0 = value[0]
    if ch0 == '+' or ch0 == '-' or ch0.isdigit():
        # int is tried before float so that "3" stays an int rather than becoming 3.0.
        for converter in (int, float):
            try:
                return converter(value)
            except ValueError:
                # Not parseable this way; try the next converter, or fall through to the original value.
                continue
    return value
42
+
43
+
44
def open_text_input_file_respecting_byte_order_mark(filename):
    """
    Opens a file for text input, respecting a byte-order mark (BOM).

    The leading bytes of the file are read in binary mode and handed to chardet.detect. The file is then
    reopened in text mode using the encoding that was reported. If no encoding was reported,
    encoding=None is passed, so io.open falls back to its own default.

    The returned file object is open; closing it is the caller's responsibility.
    """
    with io.open(filename, 'rb') as fp:
        # A BOM itself is at most 4 bytes long; 32 bytes are sampled.
        leading_bytes = fp.read(4 * 8)
    bom_info = chardet.detect(leading_bytes)
    detected_encoding = bom_info and bom_info.get('encoding')  # tread lightly
    if detected_encoding and detected_encoding.lower() == 'ascii':
        # Only a short prefix was sampled, so an 'ascii' verdict says nothing about the rest of the file.
        # UTF-8 decodes pure ASCII identically, but does not fail if non-ASCII UTF-8 text appears later.
        detected_encoding = 'utf-8'
    return io.open(filename, 'r', encoding=detected_encoding)
54
+
55
+
56
class ItemTools:
    """
    Implements operations on table-related data without pre-supposing the specific representation of the table.
    It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
    it does not presuppose the source of the data nor where it will be written to.

    For the purpose of this class:

    * a 'header' is a string representing the top of a column.

    * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
      "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
      each numeric token as an int instead of a string.

    * a 'headers' object is just a list of strings, each of which is a 'header'.

    * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
      e.g., the headers ["a.b.c", "x.y#0"] is represented as parsed headers [["a", "b", "c"], ["x", "y", 0]].

    """

    @classmethod
    def parse_sheet_header(cls, header: Header) -> ParsedHeader:
        """
        Splits a header at each '.' or '#' and returns the non-empty tokens as a list,
        with each all-digit token converted to an int.
        """
        result = []
        token = ""
        for ch in header:
            if ch == '.' or ch == '#':
                # A separator ends the current token. Empty tokens (e.g., from "a..b") are dropped.
                if token:
                    result.append(int(token) if token.isdigit() else token)
                    token = ""
            else:
                token += ch
        if token:
            # Flush whatever followed the final separator.
            result.append(int(token) if token.isdigit() else token)
        return result

    @classmethod
    def parse_sheet_headers(cls, headers: Headers):
        """Returns a list containing the parsed form of each of the given headers, in order."""
        return [cls.parse_sheet_header(header)
                for header in headers]

    @classmethod
    def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
        """
        Builds a nested dict/list skeleton with a None placeholder at the position named by each parsed header.

        Raises ValueError if any parsed header begins with an int, since the top level is a dictionary.
        """
        prototype = {}
        for parsed_header in parsed_headers:
            parsed_header0 = parsed_header[0]
            if isinstance(parsed_header0, int):
                raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}")
            cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
        return prototype

    @classmethod
    def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
        """
        Makes sure that the given parent has a slot for the path described by keys, creating what is missing.

        The kind of the key that follows decides what goes in a newly created slot: a list if the next key
        is an int, a dict if it is a str, and None if there is no next key. Existing slots are left alone.

        Raises Exception if an int key would skip past the end of the parent list.
        Returns the (mutated) parent.
        """
        [key0, *more_keys] = keys
        key1 = more_keys[0] if more_keys else None
        if isinstance(key1, int):
            placeholder = []
        elif isinstance(key1, str):
            placeholder = {}
        else:
            placeholder = None
        if isinstance(key0, int):
            n = len(parent)
            if key0 == n:
                parent.append(placeholder)
            elif key0 > n:
                # Indices may only extend the list by one position at a time.
                raise Exception("Numeric items must occur sequentially.")
        elif isinstance(key0, str):
            if key0 not in parent:
                parent[key0] = placeholder
        if key1 is not None:
            cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
        return parent

    @classmethod
    def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData:
        """
        Heuristically converts a cell value to JSON data.

        For strings, compared case-insensitively: 'true' and 'false' become booleans, and 'null' and ''
        become None. A string containing '|' becomes a list of its separately parsed parts. Any other
        string is passed to prefer_number. Non-string values are returned unchanged.
        """
        if isinstance(value, str):
            lvalue = value.lower()
            # TODO: We could consult a schema to make this less heuristic, but this may do for now
            if lvalue == 'true':
                return True
            elif lvalue == 'false':
                return False
            elif lvalue == 'null' or lvalue == '':
                return None
            elif '|' in value:
                return [cls.parse_item_value(subvalue) for subvalue in value.split('|')]
            else:
                return prefer_number(value)
        else:  # presumably a number (int or float)
            return value

    @classmethod
    def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
        """
        Stores value in datum at the position reached by following path.

        Unless force is true, a value of None or '' is not stored, leaving in place whatever was there.
        """
        if (value is None or value == '') and not force:
            return
        [key, *more_path] = path
        if not more_path:
            datum[key] = value
        else:
            # force must travel with the recursion, or a forced None/'' would be dropped one level down.
            cls.set_path_value(datum[key], more_path, value, force=force)
158
+
159
+
160
+ # TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
161
+ #
162
+ # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
163
+ # I am less certain but open to discussion. Among other things, as implemented now,
164
+ # the __init__ method here needs to run and the documentation says that ABC's won't appear
165
+ # in the method resolution order. -kmp 17-Aug-2023
166
+ # See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
167
class AbstractTableSetManager:
    """
    The root class for anything able to load a table set, whatever it is loaded from.

    Such a class must offer a load method that takes a filename and returns the file content as a dictionary
    that maps each tab name to a list of rows:
        {
            "Sheet1": [
                {...representation of row1 as some kind of dict...},
                {...representation of row2 as some kind of dict...}
            ],
            "Sheet2": [...],
            ...,
        }
    This level of abstraction takes no position on how a row is represented, as long as it is JSON data
    of some kind. A row might be flat, such as
        {"col1": "val1", "col2": "val2", ...}
    or it might be more structured, with dictionaries and lists nested inside it.
    The stored values might be altered as well, most likely by turning "123" into 123 or "" into None,
    but whether and how such transformations happen is not constrained by this class.
    """

    def __init__(self, **kwargs):
        # Keyword arguments that reach this class were not consumed by any subclass, so they are unexpected.
        if not kwargs:
            return
        raise ValueError(f"Got unexpected keywords: {kwargs}")

    # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
    @classmethod
    def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
        """
        Reads the named file and returns a dictionary mapping tab names to rows of dictionary data,
        in the form described in the class documentation. This implementation always raises
        NotImplementedError.
        """
        class_name = cls.__name__
        raise NotImplementedError(f".load(...) is not implemented for {class_name}.")
202
+
203
+
204
class BasicTableSetManager(AbstractTableSetManager):
    """
    Supplies the structure that most kinds of parsers will need: a place to keep the headers and a place
    to keep the content of each tab. A csv file, which has no tabs, is treated as the degenerate case
    in which there is a single set of headers and a single block of content.
    """

    def __init__(self, filename: str, **kwargs):
        super().__init__(**kwargs)
        self.filename: str = filename
        # Both tables are keyed by tab name.
        self.headers_by_tabname: Dict[str, List[str]] = {}
        self.content_by_tabname: Dict[str, List[AnyJsonData]] = {}
        # Obtained last, after the other attributes have been assigned.
        self.reader_agent: Any = self._get_reader_agent()

    def tab_headers(self, tabname: str) -> List[str]:
        """Returns the headers recorded for the given tab. Raises KeyError if none are recorded."""
        headers_table = self.headers_by_tabname
        return headers_table[tabname]

    def tab_content(self, tabname: str) -> List[AnyJsonData]:
        """Returns the content recorded for the given tab. Raises KeyError if none is recorded."""
        content_table = self.content_by_tabname
        return content_table[tabname]

    def _create_tab_processor_state(self, tabname: str) -> Any:
        """
        Returns the auxiliary state to be carried from row to row while a tab is processed.

        Some parsers may want such state (parsed headers, a line count, a table of temporary names
        for cross-linking objects, and so on). This implementation keeps none and returns None,
        so subclasses may want to override it.
        """
        ignored(tabname)  # subclasses might need this, but we don't
        return None

    def _get_reader_agent(self) -> Any:
        """This function is responsible for opening the workbook and returning a workbook object."""
        class_name = self.__class__.__name__
        raise NotImplementedError(f"._get_reader_agent() is not implemented for {class_name}.")

    def load_content(self) -> Any:
        """Subclasses implement the actual loading here. This implementation always raises NotImplementedError."""
        class_name = self.__class__.__name__
        raise NotImplementedError(f".load_content() is not implemented for {class_name}.")
241
+
242
+
243
class TableSetManager(BasicTableSetManager):
    """
    Drives the tab-by-tab, row-by-row loading loop. The format-specific parts (the tab names,
    how raw rows are obtained and how a raw row is processed) are left to subclasses.
    """

    @classmethod
    def load(cls, filename: str) -> AnyJsonData:
        """Creates an instance of this class for the given filename and returns the result of its load_content()."""
        table_set_manager: TableSetManager = cls(filename)
        return table_set_manager.load_content()

    def __init__(self, filename: str):
        super().__init__(filename=filename)

    @property
    def tabnames(self) -> List[str]:
        """The names of the tabs to load. Subclasses must implement this."""
        raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}.")

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """
        Given a tabname, returns an iterable of the raw rows of that tab. Subclasses must implement this.
        """
        raise NotImplementedError(f"._raw_row_generator_for_tabname(...) is not implemented"
                                  f" for {self.__class__.__name__}.")

    def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
        """
        Takes the state returned by _create_tab_processor_state and a raw row (a list of column values),
        and returns the processed row. What constitutes processing is up to the class, but the result
        must be JSON data. Subclasses must implement this.
        """
        raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")

    def load_content(self) -> AnyJsonData:
        """
        Processes every row of every tab, records the rows in content_by_tabname and returns that dictionary.
        """
        for tabname in self.tabnames:
            sheet_content = []
            # The state is created before the rows are requested; a subclass may rely on that order.
            state = self._create_tab_processor_state(tabname)
            for row_data in self._raw_row_generator_for_tabname(tabname):
                processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data)
                sheet_content.append(processed_row_data)
            self.content_by_tabname[tabname] = sheet_content
        return self.content_by_tabname

    @classmethod
    def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
        """Converts a raw cell value by passing it to prefer_number."""
        return prefer_number(value)
284
+
285
+
286
class XlsxManager(TableSetManager):
    """
    Provides the rows of each sheet of an XLSX file, read with openpyxl.
    """

    @classmethod
    def _all_rows(cls, sheet: Worksheet):
        # Row 1 holds the headers, so the data rows are 2 through max_row inclusive.
        yield from range(2, sheet.max_row + 1)

    @classmethod
    def _all_cols(cls, sheet: Worksheet):
        # Columns are numbered 1 through max_column inclusive.
        yield from range(1, sheet.max_column + 1)

    @property
    def tabnames(self) -> List[str]:
        """The sheet names of the workbook."""
        workbook = self.reader_agent
        return workbook.sheetnames

    def _get_reader_agent(self) -> Workbook:
        """Loads the workbook named by self.filename."""
        workbook_filename = self.filename
        return openpyxl.load_workbook(workbook_filename)

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """Returns a lazy iterable of the cell values of each data row of the named sheet."""
        sheet = self.reader_agent[tabname]
        return map(lambda row_number: self._get_raw_row_content_tuple(sheet, row_number),
                   self._all_rows(sheet))

    def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
        """Returns a list of the value of each cell in the given row of the sheet."""
        cell_values = []
        for col in self._all_cols(sheet):
            cell_values.append(sheet.cell(row=row, column=col).value)
        return cell_values

    def _create_tab_processor_state(self, tabname: str) -> Headers:
        """Reads row 1 of the named sheet as strings, records it as the headers of the sheet and returns it."""
        sheet = self.reader_agent[tabname]
        headers: List[str] = []
        for col in self._all_cols(sheet):
            header_cell = sheet.cell(row=1, column=col)
            headers.append(str(header_cell.value))
        self.headers_by_tabname[sheet.title] = headers
        return headers

    def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """Returns a dictionary that maps each header to the parsed value found in the same column of the row."""
        ignored(tabname)
        processed_row = {}
        for i, row_datum in enumerate(row_data):
            processed_row[headers[i]] = self.parse_cell_value(row_datum)
        return processed_row
330
+
331
+
332
class ItemManagerMixin(BasicTableSetManager):
    """
    A mixin for a reader such as an XlsxManager or a CsvManager that makes its rows come out
    as nested items instead of flat table rows.
    """

    def __init__(self, filename: str, **kwargs):
        super().__init__(filename=filename, **kwargs)
        # Both tables are keyed by tab name and are filled in by _compile_sheet_headers.
        self.patch_prototypes_by_tabname: Dict[str, Dict] = {}
        self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {}

    def sheet_patch_prototype(self, tabname: str) -> Dict:
        """Returns the patch prototype compiled for the given tab. Raises KeyError if there is none."""
        prototypes = self.patch_prototypes_by_tabname
        return prototypes[tabname]

    def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]:
        """Returns the parsed headers compiled for the given tab. Raises KeyError if there are none."""
        all_parsed_headers = self.parsed_headers_by_tabname
        return all_parsed_headers[tabname]

    def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders:
        """
        Has the next class in line set up the tab, then compiles the headers of the tab and returns
        their parsed form. The compiled prototype can be copied for each row and have values dropped
        into it, which lets values be assigned to the right place efficiently.
        """
        super()._create_tab_processor_state(tabname)
        self._compile_sheet_headers(tabname)
        return self.sheet_parsed_headers(tabname)

    def _compile_sheet_headers(self, tabname: str):
        """Parses the recorded headers of the tab, storing both the parsed headers and the patch prototype."""
        parsed_headers = ItemTools.parse_sheet_headers(self.headers_by_tabname[tabname])
        self.parsed_headers_by_tabname[tabname] = parsed_headers
        self.patch_prototypes_by_tabname[tabname] = ItemTools.compute_patch_prototype(parsed_headers)

    def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData:
        """Returns a fresh copy of the patch prototype of the tab with the parsed values of the row stored in it."""
        # A deep copy is used so that the nested containers of the prototype are not shared between rows.
        item = copy.deepcopy(self.sheet_patch_prototype(tabname))
        for column_index, raw_value in enumerate(row_data):
            item_value = self.parse_cell_value(raw_value)
            path = parsed_headers[column_index]
            ItemTools.set_path_value(item, path, item_value)
        return item

    @classmethod
    def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
        """Converts a raw cell value by passing it to ItemTools.parse_item_value."""
        return ItemTools.parse_item_value(value)
373
+
374
+
375
class ItemXlsxManager(ItemManagerMixin, XlsxManager):
    """
    An XlsxManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
380
+
381
+
382
class CsvManager(TableSetManager):
    """
    This implements the mechanism to get a series of rows out of the sheet in a csv file,
    returning a result that still looks like there could have been multiple tabs.
    """

    DEFAULT_TAB_NAME = 'Sheet1'

    def __init__(self, filename: str, tab_name=None):
        """
        :param filename: the name of the csv file to read
        :param tab_name: the name under which the single tab is reported (DEFAULT_TAB_NAME if not given)
        """
        super().__init__(filename=filename)
        self.tab_name = tab_name or self.DEFAULT_TAB_NAME

    @property
    def tabnames(self) -> List[str]:
        """A one-element list holding the name of the single tab."""
        return [self.tab_name]

    def _get_reader_agent(self) -> CsvReader:
        """Returns a csv reader on self.filename."""
        return self._get_csv_reader(self.filename)

    @classmethod
    def _get_csv_reader(cls, filename) -> CsvReader:
        """Returns a comma-separated csv reader on the named file."""
        # NOTE(review): the file opened here is never explicitly closed in this class.
        return csv.reader(open_text_input_file_respecting_byte_order_mark(filename))

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """Returns the reader itself, which yields whatever rows have not yet been consumed."""
        return self.reader_agent

    def _create_tab_processor_state(self, tabname: str) -> Headers:
        """
        Returns the headers of the tab. The first time this is called for a tab, the next row
        is consumed from the reader and recorded as the headers of that tab.
        """
        headers: Headers = self.headers_by_tabname.get(tabname)
        if headers is None:
            # A file with no rows at all is treated as having no headers (and so no content),
            # rather than letting StopIteration escape from the reader.
            self.headers_by_tabname[tabname] = headers = next(self.reader_agent, [])
        return headers

    def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """Returns a dictionary that maps each header to the parsed value found in the same column of the row."""
        ignored(tabname)
        return {headers[i]: self.parse_cell_value(row_datum)
                for i, row_datum in enumerate(row_data)}
419
+
420
+
421
class ItemCsvManager(ItemManagerMixin, CsvManager):
    """
    A CsvManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
426
+
427
+
428
class TsvManager(CsvManager):
    """
    A CsvManager for TSV files, which are read as CSV files whose separator is a tab rather than a comma.
    (No escaping of strange characters is handled at present. Backslash escaping may need to be added.)
    """

    @classmethod
    def _get_csv_reader(cls, filename) -> CsvReader:
        """Returns a tab-separated csv reader on the named file."""
        text_stream = open_text_input_file_respecting_byte_order_mark(filename)
        return csv.reader(text_stream, delimiter='\t')
436
+
437
+
438
class ItemTsvManager(ItemManagerMixin, TsvManager):
    """
    A TsvManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
443
+
444
+
445
class ItemManager(AbstractTableSetManager):
    """
    Opens a .xlsx, .csv or .tsv file and loads its content in our standard format.
    (See the more detailed description in AbstractTableSetManager.)
    """

    @classmethod
    def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager:
        """
        Returns the item-style manager that matches the suffix of the filename.

        Raises ValueError if the suffix is not .xlsx, .csv or .tsv, or if a tab_name is given
        for a .xlsx file.
        """
        if filename.endswith(".xlsx"):
            if tab_name is not None:
                raise ValueError(f".xlsx files don't need tab_name={tab_name!r}")
            return ItemXlsxManager(filename)
        if filename.endswith(".csv"):
            return ItemCsvManager(filename, tab_name=tab_name)
        if filename.endswith(".tsv"):
            return ItemTsvManager(filename, tab_name=tab_name)
        raise ValueError(f"Unknown file type: {filename}")

    @classmethod
    def load(cls, filename: str, tab_name=None) -> AnyJsonData:
        """Loads the named file with the manager chosen for it and returns the loaded content."""
        return cls.create_implementation_manager(filename, tab_name=tab_name).load_content()
469
+
470
+
471
+ load_items = ItemManager.load
@@ -0,0 +1,92 @@
1
+ import json
2
+ from dcicutils.ff_utils import get_metadata, search_metadata
3
+ from dcicutils.creds_utils import CGAPKeyManager
4
+
5
+
6
class VariantUtils:
    """
    Helpers for searching VariantSample items by gene and for summarizing the results.

    The credentials used for searching are looked up, by environment name, through a CGAPKeyManager.
    """

    # Search paths to which a gene name is appended.
    SEARCH_VARIANTS_BY_GENE = ('/search/?type=VariantSample&limit=1'
                               '&variant.genes.genes_most_severe_gene.display_title=')
    SEARCH_RARE_VARIANTS_BY_GENE = ('/search/?samplegeno.samplegeno_role=proband&type=VariantSample'
                                    '&variant.csq_gnomadg_af_popmax.from=0&variant.csq_gnomadg_af_popmax.to=0.001'
                                    '&variant.genes.genes_most_severe_gene.display_title=')

    def __init__(self, *, env_name) -> None:
        """
        :param env_name: the name of the environment whose key dictionary is to be used
        """
        self._key_manager = CGAPKeyManager()
        self.creds = self._key_manager.get_keydict_for_env(env=env_name)
        # Uncomment this if needed
        # self.health = get_health_page(key=self.creds)
        self.base_url = self.creds['server']

    def get_creds(self):
        """Returns the key dictionary obtained for the environment."""
        return self.creds

    def get_rare_variants_by_gene(self, *, gene, sort, addon=''):
        """
        Searches for rare variants on a particular gene.

        :param gene: the gene name appended to the search path
        :param sort: the field to sort on, in descending order
        :param addon: additional query text appended verbatim to the search
        """
        # The query is assembled from adjacent string literals. A backslash continuation inside a single
        # literal would embed the indentation of the continued line in the URL.
        # NOTE(review): the search path already begins with '/', so this yields '//' after base_url - confirm
        #               that search_metadata tolerates (or expects) that.
        search = (f'{self.base_url}/{self.SEARCH_RARE_VARIANTS_BY_GENE}{gene}'
                  f'&sort=-{sort}{addon}')
        return search_metadata(search, key=self.creds)

    def find_number_of_sample_ids(self, gene):
        """Returns the number of distinct CALL_INFO values among the rare variants found for the specified gene."""
        return len(set(variant.get('CALL_INFO')
                       for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID')))

    def get_total_result_count_from_search(self, gene):
        """Returns total number of variants associated with specified gene"""
        res = get_metadata(self.SEARCH_VARIANTS_BY_GENE + gene, key=self.creds)
        return res['total']

    @staticmethod
    def sort_dict_in_descending_order(unsorted_dict):
        """Returns a new dictionary with the same entries, ordered from the highest value to the lowest."""
        sorted_list = sorted(unsorted_dict.items(), key=lambda x: x[1], reverse=True)
        return dict(sorted_list)

    def create_dict_of_mutations(self, gene):
        """Creates dictionary of specified gene and mutations that occur 10+ times in database, in the form:
        {gene: {mutation1 pos: #variants, mutation2 pos: #variants, ...}}"""
        mutation_dict = {}
        for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID'):
            pos = variant['variant']['POS']
            # Count the occurrences of each position.
            mutation_dict[pos] = mutation_dict.get(pos, 0) + 1
        return {gene: self.sort_dict_in_descending_order({k: v for k, v in mutation_dict.items() if v >= 10})}

    @staticmethod
    def return_json(file_name):
        """Returns the data parsed from the specified json file."""
        with open(file_name, 'r') as f:
            # json.load reads from an open file; json.loads only accepts text or bytes.
            file_content = json.load(f)
        return file_content

    @staticmethod
    def create_dict_from_json_file(file_name):
        """Creates dictionary object from specified json file"""
        with open(file_name) as f:
            json_list = f.read()
        return json.loads(json_list)

    def create_list_of_msa_genes(self):
        """Creates list of genes relating to the brain or nervous system
        (determined by whether keywords 'neur' or 'nerv' in summary).
        The genes are read from the file gene.json in the current directory."""
        genes = self.return_json('gene.json')
        return [gene['gene_symbol'] for gene in genes
                if 'nerv' in gene.get('gene_summary', '')
                or 'neur' in gene.get('gene_summary', '')]

    def create_url(self, gene):
        """Returns a url to the variants at the most commonly mutated position of specified gene.
        The positions are read from the file 10+sorted_msa_genes_and_mutations.json in the current directory,
        and the first position listed there for the gene is the one used."""
        d = self.create_dict_from_json_file('10+sorted_msa_genes_and_mutations.json')
        pos = list(d[gene].keys())[0]
        return self.SEARCH_RARE_VARIANTS_BY_GENE + gene + f'&variant.POS.from={pos}&variant.POS.to={pos}&sort=-DP'

    def create_list_of_als_park_genes(self):
        """Creates list of genes relating to Parkinson's or ALS
        (determined by whether keywords 'Parkinson' or 'ALS' in summary).
        The genes are read from the file gene.json in the current directory."""
        genes = self.return_json('gene.json')
        return [gene['gene_symbol'] for gene in genes
                if 'Parkinson' in gene.get('gene_summary', '')
                or 'ALS' in gene.get('gene_summary', '')]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcicutils"
3
- version = "7.7.2.1b0" # to become "7.8.0"
3
+ version = "7.8.1.1b1" # to become "7.9.0"
4
4
  description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
5
5
  authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
6
6
  license = "MIT"
@@ -44,6 +44,7 @@ botocore = "^1.20.39"
44
44
  # This value is intentionally pinned and must not be changed casually.
45
45
  elasticsearch = "7.13.4"
46
46
  aws-requests-auth = ">=0.4.2,<1"
47
+ chardet = "^5.2.0"
47
48
  docker = "^4.4.4"
48
49
  gitpython = "^3.1.2"
49
50
  openpyxl = "^3.1.2"
@@ -1,191 +0,0 @@
1
- import copy
2
-
3
- from openpyxl import load_workbook
4
- from openpyxl.worksheet.worksheet import Worksheet
5
- from openpyxl.workbook.workbook import Workbook
6
- from typing import Any, Dict, List, Optional, Union
7
-
8
-
9
- class WorkbookManager:
10
-
11
- @classmethod
12
- def load_workbook(cls, filename: str):
13
- wb = cls(filename)
14
- return wb.load_content()
15
-
16
- def __init__(self, filename: str):
17
- self.filename: str = filename
18
- self.workbook: Optional[Workbook] = None
19
- self.headers_by_sheetname: Dict[List[str]] = {}
20
- self.content_by_sheetname: Dict[List[Any]] = {}
21
-
22
- def sheet_headers(self, sheet: Worksheet) -> List[str]:
23
- return self.headers_by_sheetname[sheet.title]
24
-
25
- def sheet_content(self, sheet: Worksheet) -> List[Any]:
26
- return self.content_by_sheetname[sheet.title]
27
-
28
- @classmethod
29
- def all_rows(cls, sheet: Worksheet):
30
- row_max = sheet.max_row
31
- for row in range(2, row_max + 1):
32
- yield row
33
-
34
- @classmethod
35
- def all_cols(cls, sheet: Worksheet):
36
- col_max = sheet.max_column
37
- for col in range(1, col_max + 1):
38
- yield col
39
-
40
- def load_headers(self, sheet: Worksheet):
41
- headers: List[str] = [str(sheet.cell(row=1, column=col).value)
42
- for col in self.all_cols(sheet)]
43
- self.headers_by_sheetname[sheet.title] = headers
44
-
45
- def load_content(self):
46
- workbook: Workbook = load_workbook(self.filename)
47
- self.workbook = workbook
48
- for sheetname in workbook.sheetnames:
49
- sheet: Worksheet = workbook[sheetname]
50
- self.load_headers(sheet)
51
- content = []
52
- for row in self.all_rows(sheet):
53
- row_dict = self.load_row(sheet=sheet, row=row)
54
- content.append(row_dict)
55
- self.content_by_sheetname[sheetname] = content
56
- return self.content_by_sheetname
57
-
58
- def load_row(self, *, sheet: Worksheet, row: int):
59
- headers = self.sheet_headers(sheet)
60
- row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value
61
- for col in self.all_cols(sheet)}
62
- return row_dict
63
-
64
-
65
- class ItemTools:
66
-
67
- @classmethod
68
- def compute_patch_prototype(cls, parsed_headers):
69
- prototype = {}
70
- for parsed_header in parsed_headers:
71
- parsed_header0 = parsed_header[0]
72
- if isinstance(parsed_header0, int):
73
- raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}")
74
- cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
75
- return prototype
76
-
77
- @classmethod
78
- def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[Union[int, str]]):
79
- [key0, *more_keys] = keys
80
- key1 = more_keys[0] if more_keys else None
81
- if isinstance(key1, int):
82
- placeholder = []
83
- elif isinstance(key1, str):
84
- placeholder = {}
85
- else:
86
- placeholder = None
87
- if isinstance(key0, int):
88
- n = len(parent)
89
- if key0 == n:
90
- parent.append(placeholder)
91
- elif key0 > n:
92
- raise Exception("Numeric items must occur sequentially.")
93
- elif isinstance(key0, str):
94
- if key0 not in parent:
95
- parent[key0] = placeholder
96
- if key1 is not None:
97
- cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
98
- return parent
99
-
100
- @classmethod
101
- def parse_sheet_headers(cls, headers):
102
- return [cls.parse_sheet_header(header)
103
- for header in headers]
104
-
105
- @classmethod
106
- def parse_sheet_header(cls, header) -> List[Union[int, str]]:
107
- result = []
108
- token = ""
109
- for i in range(len(header)):
110
- ch = header[i]
111
- if ch == '.' or ch == '#':
112
- if token:
113
- result.append(int(token) if token.isdigit() else token)
114
- token = ""
115
- else:
116
- token += ch
117
- if token:
118
- result.append(int(token) if token.isdigit() else token)
119
- return result
120
-
121
- @classmethod
122
- def set_path_value(cls, datum, path, value, force=False):
123
- if (value is None or value == '') and not force:
124
- return
125
- [key, *more_path] = path
126
- if not more_path:
127
- datum[key] = value
128
- else:
129
- cls.set_path_value(datum[key], more_path, value)
130
-
131
- @classmethod
132
- def parse_value(cls, value):
133
- if isinstance(value, str):
134
- lvalue = value.lower()
135
- # TODO: We could consult a schema to make this less heuristic, but this may do for now
136
- if lvalue == 'true':
137
- return True
138
- elif lvalue == 'false':
139
- return False
140
- elif lvalue == 'null' or lvalue == '':
141
- return None
142
- elif '|' in value:
143
- return [cls.parse_value(subvalue) for subvalue in value.split('|')]
144
- else:
145
- ch0 = value[0]
146
- if ch0 == '+' or ch0 == '-' or ch0.isdigit():
147
- try:
148
- return int(value)
149
- except Exception:
150
- pass
151
- try:
152
- return float(value)
153
- except Exception:
154
- pass
155
- return value
156
- else: # probably a number
157
- return value
158
-
159
-
160
- class ItemManager(ItemTools, WorkbookManager):
161
-
162
- def __init__(self, filename: str):
163
- super().__init__(filename=filename)
164
- self.patch_prototypes_by_sheetname: Dict[Dict] = {}
165
- self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {}
166
-
167
- def sheet_patch_prototype(self, sheet: Worksheet) -> Dict:
168
- return self.patch_prototypes_by_sheetname[sheet.title]
169
-
170
- def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]:
171
- return self.parsed_headers_by_sheetname[sheet.title]
172
-
173
- def load_headers(self, sheet: Worksheet):
174
- super().load_headers(sheet)
175
- self.compile_sheet_headers(sheet)
176
-
177
- def compile_sheet_headers(self, sheet: Worksheet):
178
- headers = self.headers_by_sheetname[sheet.title]
179
- parsed_headers = self.parse_sheet_headers(headers)
180
- self.parsed_headers_by_sheetname[sheet.title] = parsed_headers
181
- prototype = self.compute_patch_prototype(parsed_headers)
182
- self.patch_prototypes_by_sheetname[sheet.title] = prototype
183
-
184
- def load_row(self, *, sheet: Worksheet, row: int):
185
- parsed_headers = self.sheet_parsed_headers(sheet)
186
- patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet))
187
- for col in self.all_cols(sheet):
188
- value = sheet.cell(row=row, column=col).value
189
- parsed_value = self.parse_value(value)
190
- self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value)
191
- return patch_item
File without changes
File without changes