dcicutils 7.7.2.1b0__py3-none-any.whl → 7.8.1.1b1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of dcicutils might be problematic. Click here for more details.
- dcicutils/ff_utils.py +1 -1
- dcicutils/sheet_utils.py +405 -125
- dcicutils/variant_utils.py +92 -0
- {dcicutils-7.7.2.1b0.dist-info → dcicutils-7.8.1.1b1.dist-info}/METADATA +2 -1
- {dcicutils-7.7.2.1b0.dist-info → dcicutils-7.8.1.1b1.dist-info}/RECORD +8 -7
- {dcicutils-7.7.2.1b0.dist-info → dcicutils-7.8.1.1b1.dist-info}/LICENSE.txt +0 -0
- {dcicutils-7.7.2.1b0.dist-info → dcicutils-7.8.1.1b1.dist-info}/WHEEL +0 -0
- {dcicutils-7.7.2.1b0.dist-info → dcicutils-7.8.1.1b1.dist-info}/entry_points.txt +0 -0
dcicutils/ff_utils.py
CHANGED
@@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None):
|
|
961
961
|
if value.get('isAbstract') is True:
|
962
962
|
continue
|
963
963
|
# some test schemas in local don't have the id field
|
964
|
-
schema_filename = value.get('id')
|
964
|
+
schema_filename = value.get('$id')
|
965
965
|
if schema_filename:
|
966
966
|
schema_name[key] = schema_filename.split('/')[-1][:-5]
|
967
967
|
return schema_name
|
dcicutils/sheet_utils.py
CHANGED
@@ -1,71 +1,102 @@
|
|
1
|
+
import chardet
|
1
2
|
import copy
|
3
|
+
import csv
|
4
|
+
import io
|
5
|
+
import openpyxl
|
2
6
|
|
3
|
-
from
|
7
|
+
from dcicutils.common import AnyJsonData
|
8
|
+
from dcicutils.misc_utils import ignored
|
4
9
|
from openpyxl.worksheet.worksheet import Worksheet
|
5
10
|
from openpyxl.workbook.workbook import Workbook
|
6
|
-
from
|
11
|
+
from tempfile import TemporaryFile
|
12
|
+
from typing import Any, Dict, Iterable, List, Union
|
7
13
|
|
8
14
|
|
9
|
-
|
15
|
+
Header = str
|
16
|
+
Headers = List[str]
|
17
|
+
ParsedHeader = List[Union[str, int]]
|
18
|
+
ParsedHeaders = List[ParsedHeader]
|
19
|
+
SheetCellValue = Union[int, float, str]
|
20
|
+
SheetRow = List[SheetCellValue]
|
21
|
+
CsvReader = type(csv.reader(TemporaryFile()))
|
10
22
|
|
11
|
-
@classmethod
|
12
|
-
def load_workbook(cls, filename: str):
|
13
|
-
wb = cls(filename)
|
14
|
-
return wb.load_content()
|
15
23
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
def prefer_number(value: "SheetCellValue") -> "Union[SheetCellValue, None]":
    """
    Coerces a sheet cell value to a number when it is a string that looks like one.

    * A non-string value (e.g., an int or float) is returned unchanged.
    * The empty string is returned as None.
    * A string starting with '+', '-' or a digit is returned as an int if int() accepts it,
      otherwise as a float if float() accepts it.
    * Any other string, including one that fails both parses, is returned unchanged.
    """
    if not isinstance(value, str):
        return value  # already a number (or at least not text), so there is nothing to parse
    if not value:
        return None
    ch0 = value[0]
    if ch0 in '+-' or ch0.isdigit():
        # Try the stricter parser first so that "12" becomes 12 rather than 12.0.
        for parse in (int, float):
            try:
                return parse(value)
            except ValueError:
                continue
    # If we couldn't parse it as an int or float, fall through to returning the original value
    return value
|
21
42
|
|
22
|
-
def sheet_headers(self, sheet: Worksheet) -> List[str]:
|
23
|
-
return self.headers_by_sheetname[sheet.title]
|
24
43
|
|
25
|
-
|
26
|
-
|
44
|
+
def open_text_input_file_respecting_byte_order_mark(filename):
|
45
|
+
"""
|
46
|
+
Opens a file for text input, respecting a byte-order mark (BOM).
|
47
|
+
"""
|
48
|
+
with io.open(filename, 'rb') as fp:
|
49
|
+
leading_bytes = fp.read(4 * 8) # 4 bytes is all we need
|
50
|
+
bom_info = chardet.detect(leading_bytes)
|
51
|
+
detected_encoding = bom_info and bom_info.get('encoding') # tread lightly
|
27
52
|
|
28
|
-
|
29
|
-
def all_rows(cls, sheet: Worksheet):
|
30
|
-
row_max = sheet.max_row
|
31
|
-
for row in range(2, row_max + 1):
|
32
|
-
yield row
|
53
|
+
return io.open(filename, 'r', encoding=detected_encoding)
|
33
54
|
|
34
|
-
@classmethod
|
35
|
-
def all_cols(cls, sheet: Worksheet):
|
36
|
-
col_max = sheet.max_column
|
37
|
-
for col in range(1, col_max + 1):
|
38
|
-
yield col
|
39
55
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
def load_content(self):
|
46
|
-
workbook: Workbook = load_workbook(self.filename)
|
47
|
-
self.workbook = workbook
|
48
|
-
for sheetname in workbook.sheetnames:
|
49
|
-
sheet: Worksheet = workbook[sheetname]
|
50
|
-
self.load_headers(sheet)
|
51
|
-
content = []
|
52
|
-
for row in self.all_rows(sheet):
|
53
|
-
row_dict = self.load_row(sheet=sheet, row=row)
|
54
|
-
content.append(row_dict)
|
55
|
-
self.content_by_sheetname[sheetname] = content
|
56
|
-
return self.content_by_sheetname
|
57
|
-
|
58
|
-
def load_row(self, *, sheet: Worksheet, row: int):
|
59
|
-
headers = self.sheet_headers(sheet)
|
60
|
-
row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value
|
61
|
-
for col in self.all_cols(sheet)}
|
62
|
-
return row_dict
|
56
|
+
class ItemTools:
|
57
|
+
"""
|
58
|
+
Implements operations on table-related data without pre-supposing the specific representation of the table.
|
59
|
+
It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
|
60
|
+
it does not presuppose the source of the data nor where it will be written to.
|
63
61
|
|
62
|
+
For the purpose of this class:
|
64
63
|
|
65
|
-
|
64
|
+
* a 'header' is a string representing the top of a column.
|
65
|
+
|
66
|
+
* a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
|
67
|
+
"a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
|
68
|
+
each numeric token as an int instead of a string.
|
69
|
+
|
70
|
+
* a 'headers' object is just a list of strings, each of which is a 'header'.
|
71
|
+
|
72
|
+
* a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
|
73
|
+
e.g., the headers ["a.b.c", "x.y#0"] are represented as parsed headers [["a", "b", "c"], ["x", "y", 0]].
|
74
|
+
|
75
|
+
"""
|
66
76
|
|
67
77
|
@classmethod
|
68
|
-
def
|
78
|
+
def parse_sheet_header(cls, header: "Header") -> "ParsedHeader":
    """
    Splits a header at each '.' or '#' and returns the non-empty tokens as a list.

    Tokens consisting only of digits are returned as ints, so "a.b.c" becomes ["a", "b", "c"]
    and "x.y#0" becomes ["x", "y", 0]. Empty tokens (from leading, trailing or doubled
    separators) are dropped, so an empty header yields an empty list.
    """
    # '#' and '.' are equivalent separators, so normalize to one of them and split once.
    tokens = header.replace('#', '.').split('.')
    return [int(token) if token.isdigit() else token
            for token in tokens
            if token]
|
92
|
+
|
93
|
+
@classmethod
|
94
|
+
def parse_sheet_headers(cls, headers: Headers):
|
95
|
+
return [cls.parse_sheet_header(header)
|
96
|
+
for header in headers]
|
97
|
+
|
98
|
+
@classmethod
|
99
|
+
def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
|
69
100
|
prototype = {}
|
70
101
|
for parsed_header in parsed_headers:
|
71
102
|
parsed_header0 = parsed_header[0]
|
@@ -75,7 +106,7 @@ class ItemTools:
|
|
75
106
|
return prototype
|
76
107
|
|
77
108
|
@classmethod
|
78
|
-
def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys:
|
109
|
+
def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
|
79
110
|
[key0, *more_keys] = keys
|
80
111
|
key1 = more_keys[0] if more_keys else None
|
81
112
|
if isinstance(key1, int):
|
@@ -98,38 +129,7 @@ class ItemTools:
|
|
98
129
|
return parent
|
99
130
|
|
100
131
|
@classmethod
|
101
|
-
def
|
102
|
-
return [cls.parse_sheet_header(header)
|
103
|
-
for header in headers]
|
104
|
-
|
105
|
-
@classmethod
|
106
|
-
def parse_sheet_header(cls, header) -> List[Union[int, str]]:
|
107
|
-
result = []
|
108
|
-
token = ""
|
109
|
-
for i in range(len(header)):
|
110
|
-
ch = header[i]
|
111
|
-
if ch == '.' or ch == '#':
|
112
|
-
if token:
|
113
|
-
result.append(int(token) if token.isdigit() else token)
|
114
|
-
token = ""
|
115
|
-
else:
|
116
|
-
token += ch
|
117
|
-
if token:
|
118
|
-
result.append(int(token) if token.isdigit() else token)
|
119
|
-
return result
|
120
|
-
|
121
|
-
@classmethod
|
122
|
-
def set_path_value(cls, datum, path, value, force=False):
|
123
|
-
if (value is None or value == '') and not force:
|
124
|
-
return
|
125
|
-
[key, *more_path] = path
|
126
|
-
if not more_path:
|
127
|
-
datum[key] = value
|
128
|
-
else:
|
129
|
-
cls.set_path_value(datum[key], more_path, value)
|
130
|
-
|
131
|
-
@classmethod
|
132
|
-
def parse_value(cls, value):
|
132
|
+
def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData:
|
133
133
|
if isinstance(value, str):
|
134
134
|
lvalue = value.lower()
|
135
135
|
# TODO: We could consult a schema to make this less heuristic, but this may do for now
|
@@ -140,52 +140,332 @@ class ItemTools:
|
|
140
140
|
elif lvalue == 'null' or lvalue == '':
|
141
141
|
return None
|
142
142
|
elif '|' in value:
|
143
|
-
return [cls.
|
143
|
+
return [cls.parse_item_value(subvalue) for subvalue in value.split('|')]
|
144
144
|
else:
|
145
|
-
|
146
|
-
|
147
|
-
try:
|
148
|
-
return int(value)
|
149
|
-
except Exception:
|
150
|
-
pass
|
151
|
-
try:
|
152
|
-
return float(value)
|
153
|
-
except Exception:
|
154
|
-
pass
|
155
|
-
return value
|
156
|
-
else: # probably a number
|
145
|
+
return prefer_number(value)
|
146
|
+
else: # presumably a number (int or float)
|
157
147
|
return value
|
158
148
|
|
149
|
+
@classmethod
|
150
|
+
def set_path_value(cls, datum: Union[List, Dict], path: "ParsedHeader", value: Any, force: bool = False):
    """
    Stores value in datum at the location named by path, modifying datum in place.

    :param datum: a nested structure of dicts and/or lists that already contains every
        container named by the path except (for dicts) the final key.
    :param path: a non-empty list of dict keys and/or list indices leading to the target location.
    :param value: the value to store.
    :param force: if false (the default), a value of None or '' is ignored and nothing is stored.
        If true, the value is stored regardless.
    """
    if (value is None or value == '') and not force:
        return
    # Walk down to the container that holds the final key. Doing this iteratively (rather than
    # by a recursive call that omits 'force') means force=True is honored at any nesting depth.
    *parent_keys, final_key = path
    for key in parent_keys:
        datum = datum[key]
    datum[final_key] = value
|
158
|
+
|
159
|
+
|
160
|
+
# TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
|
161
|
+
#
|
162
|
+
# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
|
163
|
+
# I am less certain but open to discussion. Among other things, as implemented now,
|
164
|
+
# the __init__ method here needs to run and the documentation says that ABC's won't appear
|
165
|
+
# in the method resolution order. -kmp 17-Aug-2023
|
166
|
+
# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
|
167
|
+
class AbstractTableSetManager:
|
168
|
+
"""
|
169
|
+
The TableSetManager is the spanning class of anything that wants to be able to load a table set,
|
170
|
+
regardless of what it wants to load it from. To do this, it must support a load method
|
171
|
+
that takes a filename and returns the file content in the form:
|
172
|
+
{
|
173
|
+
"Sheet1": [
|
174
|
+
{...representation of row1 as some kind of dict...},
|
175
|
+
{...representation of row2 as some kind of dict...}
|
176
|
+
],
|
177
|
+
"Sheet2": [...],
|
178
|
+
...,
|
179
|
+
}
|
180
|
+
Note that at this level of abstraction, we take no position on what form of representation is used
|
181
|
+
for the rows, as long as it is JSON data of some kind. It might be
|
182
|
+
{"col1": "val1", "col2": "val2", ...}
|
183
|
+
or it might be something more structured like
|
184
|
+
{"something": "val1", {"something_else": ["val2"]}}
|
185
|
+
Additionally, the values stored might be altered as well. In particular, the most likely alteration
|
186
|
+
is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations
|
187
|
+
happen is not constrained by this class.
|
188
|
+
"""
|
189
|
+
|
190
|
+
def __init__(self, **kwargs):
|
191
|
+
if kwargs:
|
192
|
+
raise ValueError(f"Got unexpected keywords: {kwargs}")
|
193
|
+
|
194
|
+
# TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
|
195
|
+
@classmethod
|
196
|
+
def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
|
197
|
+
"""
|
198
|
+
Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
|
199
|
+
For more information, see documentation of AbstractTableSetManager.
|
200
|
+
"""
|
201
|
+
raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.")
|
159
202
|
|
160
|
-
|
203
|
+
|
204
|
+
class BasicTableSetManager(AbstractTableSetManager):
|
205
|
+
"""
|
206
|
+
A BasicTableManager provides some structure that most kinds of parsers will need.
|
207
|
+
In particular, everything will likely need some way of storing headers and some way of storing content
|
208
|
+
of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case
|
209
|
+
of this where there's only one set of headers and only one block of content.
|
210
|
+
"""
|
211
|
+
|
212
|
+
def __init__(self, filename: str, **kwargs):
|
213
|
+
super().__init__(**kwargs)
|
214
|
+
self.filename: str = filename
|
215
|
+
self.headers_by_tabname: Dict[str, List[str]] = {}
|
216
|
+
self.content_by_tabname: Dict[str, List[AnyJsonData]] = {}
|
217
|
+
self.reader_agent: Any = self._get_reader_agent()
|
218
|
+
|
219
|
+
def tab_headers(self, tabname: str) -> List[str]:
|
220
|
+
return self.headers_by_tabname[tabname]
|
221
|
+
|
222
|
+
def tab_content(self, tabname: str) -> List[AnyJsonData]:
|
223
|
+
return self.content_by_tabname[tabname]
|
224
|
+
|
225
|
+
def _create_tab_processor_state(self, tabname: str) -> Any:
|
226
|
+
"""
|
227
|
+
This method provides for the possibility that some parsers will want auxiliary state,
|
228
|
+
(such as parsed headers or a line count or a table of temporary names for objects to cross-link
|
229
|
+
or some other such feature) that it carries with it as it moves from line to line parsing things.
|
230
|
+
Subclasses might therefore want to make this do something more interesting.
|
231
|
+
"""
|
232
|
+
ignored(tabname) # subclasses might need this, but we don't
|
233
|
+
return None
|
234
|
+
|
235
|
+
def _get_reader_agent(self) -> Any:
|
236
|
+
"""This function is responsible for opening the workbook and returning a workbook object."""
|
237
|
+
raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.")
|
238
|
+
|
239
|
+
def load_content(self) -> Any:
|
240
|
+
raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.")
|
241
|
+
|
242
|
+
|
243
|
+
class TableSetManager(BasicTableSetManager):
|
244
|
+
|
245
|
+
@classmethod
|
246
|
+
def load(cls, filename: str) -> AnyJsonData:
|
247
|
+
table_set_manager: TableSetManager = cls(filename)
|
248
|
+
return table_set_manager.load_content()
|
161
249
|
|
162
250
|
def __init__(self, filename: str):
|
163
251
|
super().__init__(filename=filename)
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
return
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
self.
|
252
|
+
|
253
|
+
@property
|
254
|
+
def tabnames(self) -> List[str]:
|
255
|
+
raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..")
|
256
|
+
|
257
|
+
def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
|
258
|
+
"""
|
259
|
+
Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values.
|
260
|
+
"""
|
261
|
+
raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.")
|
262
|
+
|
263
|
+
def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
|
264
|
+
"""
|
265
|
+
This needs to take a state and whatever represents a row and
|
266
|
+
must return a list of objects representing column values.
|
267
|
+
What constitutes a processed row is up to the class, but other than that the result must be a JSON dictionary.
|
268
|
+
"""
|
269
|
+
raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")
|
270
|
+
|
271
|
+
def load_content(self) -> AnyJsonData:
|
272
|
+
for tabname in self.tabnames:
|
273
|
+
sheet_content = []
|
274
|
+
state = self._create_tab_processor_state(tabname)
|
275
|
+
for row_data in self._raw_row_generator_for_tabname(tabname):
|
276
|
+
processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data)
|
277
|
+
sheet_content.append(processed_row_data)
|
278
|
+
self.content_by_tabname[tabname] = sheet_content
|
279
|
+
return self.content_by_tabname
|
280
|
+
|
281
|
+
@classmethod
|
282
|
+
def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
|
283
|
+
return prefer_number(value)
|
284
|
+
|
285
|
+
|
286
|
+
class XlsxManager(TableSetManager):
|
287
|
+
"""
|
288
|
+
This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
|
289
|
+
"""
|
290
|
+
|
291
|
+
@classmethod
|
292
|
+
def _all_rows(cls, sheet: Worksheet):
|
293
|
+
row_max = sheet.max_row
|
294
|
+
for row in range(2, row_max + 1):
|
295
|
+
yield row
|
296
|
+
|
297
|
+
@classmethod
|
298
|
+
def _all_cols(cls, sheet: Worksheet):
|
299
|
+
col_max = sheet.max_column
|
300
|
+
for col in range(1, col_max + 1):
|
301
|
+
yield col
|
302
|
+
|
303
|
+
@property
|
304
|
+
def tabnames(self) -> List[str]:
|
305
|
+
return self.reader_agent.sheetnames
|
306
|
+
|
307
|
+
def _get_reader_agent(self) -> Workbook:
|
308
|
+
return openpyxl.load_workbook(self.filename)
|
309
|
+
|
310
|
+
def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
|
311
|
+
sheet = self.reader_agent[tabname]
|
312
|
+
return (self._get_raw_row_content_tuple(sheet, row)
|
313
|
+
for row in self._all_rows(sheet))
|
314
|
+
|
315
|
+
def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
|
316
|
+
return [sheet.cell(row=row, column=col).value
|
317
|
+
for col in self._all_cols(sheet)]
|
318
|
+
|
319
|
+
def _create_tab_processor_state(self, tabname: str) -> Headers:
|
320
|
+
sheet = self.reader_agent[tabname]
|
321
|
+
headers: List[str] = [str(sheet.cell(row=1, column=col).value)
|
322
|
+
for col in self._all_cols(sheet)]
|
323
|
+
self.headers_by_tabname[sheet.title] = headers
|
324
|
+
return headers
|
325
|
+
|
326
|
+
def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
|
327
|
+
ignored(tabname)
|
328
|
+
return {headers[i]: self.parse_cell_value(row_datum)
|
329
|
+
for i, row_datum in enumerate(row_data)}
|
330
|
+
|
331
|
+
|
332
|
+
class ItemManagerMixin(BasicTableSetManager):
|
333
|
+
"""
|
334
|
+
This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows
|
335
|
+
get handled like Items instead of just flat table rows.
|
336
|
+
"""
|
337
|
+
|
338
|
+
def __init__(self, filename: str, **kwargs):
|
339
|
+
super().__init__(filename=filename, **kwargs)
|
340
|
+
self.patch_prototypes_by_tabname: Dict[str, Dict] = {}
|
341
|
+
self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {}
|
342
|
+
|
343
|
+
def sheet_patch_prototype(self, tabname: str) -> Dict:
|
344
|
+
return self.patch_prototypes_by_tabname[tabname]
|
345
|
+
|
346
|
+
def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]:
|
347
|
+
return self.parsed_headers_by_tabname[tabname]
|
348
|
+
|
349
|
+
def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders:
|
350
|
+
super()._create_tab_processor_state(tabname)
|
351
|
+
# This will create state that allows us to efficiently assign values in the right place on each row
|
352
|
+
# by setting up a prototype we can copy and then drop values into.
|
353
|
+
self._compile_sheet_headers(tabname)
|
354
|
+
return self.sheet_parsed_headers(tabname)
|
355
|
+
|
356
|
+
def _compile_sheet_headers(self, tabname: str):
|
357
|
+
headers = self.headers_by_tabname[tabname]
|
358
|
+
parsed_headers = ItemTools.parse_sheet_headers(headers)
|
359
|
+
self.parsed_headers_by_tabname[tabname] = parsed_headers
|
360
|
+
prototype = ItemTools.compute_patch_prototype(parsed_headers)
|
361
|
+
self.patch_prototypes_by_tabname[tabname] = prototype
|
362
|
+
|
363
|
+
def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData:
|
364
|
+
patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname))
|
365
|
+
for i, value in enumerate(row_data):
|
366
|
+
parsed_value = self.parse_cell_value(value)
|
367
|
+
ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value)
|
191
368
|
return patch_item
|
369
|
+
|
370
|
+
@classmethod
|
371
|
+
def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
|
372
|
+
return ItemTools.parse_item_value(value)
|
373
|
+
|
374
|
+
|
375
|
+
class ItemXlsxManager(ItemManagerMixin, XlsxManager):
|
376
|
+
"""
|
377
|
+
This layers item-style row processing functionality on an XLSX file.
|
378
|
+
"""
|
379
|
+
pass
|
380
|
+
|
381
|
+
|
382
|
+
class CsvManager(TableSetManager):
    """
    This implements the mechanism to get a series of rows out of the sheet in a csv file,
    returning a result that still looks like there could have been multiple tabs.

    The single tab is named by the tab_name argument, or DEFAULT_TAB_NAME if none is given.
    """

    DEFAULT_TAB_NAME = 'Sheet1'

    def __init__(self, filename: str, tab_name=None):
        super().__init__(filename=filename)
        self.tab_name = tab_name or self.DEFAULT_TAB_NAME

    @property
    def tabnames(self) -> List[str]:
        """A csv file has exactly one tab, so this is always a one-element list."""
        return [self.tab_name]

    def _get_reader_agent(self) -> CsvReader:
        return self._get_csv_reader(self.filename)

    @classmethod
    def _get_csv_reader(cls, filename) -> CsvReader:
        # NOTE(review): the file object opened here is handed to csv.reader and is never
        # explicitly closed by this class.
        return csv.reader(open_text_input_file_respecting_byte_order_mark(filename))

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        # The reader is shared with _create_tab_processor_state, which consumes the header row,
        # so what remains to be iterated here is only the data rows.
        return self.reader_agent

    def _create_tab_processor_state(self, tabname: str) -> Headers:
        """
        Returns the headers for the given tab, reading them from the first row of the file
        the first time they are requested. An empty file yields an empty list of headers.
        """
        headers: Headers = self.headers_by_tabname.get(tabname)
        if headers is None:
            # The default avoids leaking StopIteration to callers when the file has no rows at all.
            headers = next(self.reader_agent, [])
            self.headers_by_tabname[tabname] = headers
        return headers

    def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """Returns a dictionary mapping each header to the parsed value of the corresponding cell."""
        ignored(tabname)
        return {headers[i]: self.parse_cell_value(row_datum)
                for i, row_datum in enumerate(row_data)}
|
419
|
+
|
420
|
+
|
421
|
+
class ItemCsvManager(ItemManagerMixin, CsvManager):
|
422
|
+
"""
|
423
|
+
This layers item-style row processing functionality on a CSV file.
|
424
|
+
"""
|
425
|
+
pass
|
426
|
+
|
427
|
+
|
428
|
+
class TsvManager(CsvManager):
|
429
|
+
"""
|
430
|
+
TSV files are just CSV files with tabs instead of commas as separators.
|
431
|
+
(We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.)
|
432
|
+
"""
|
433
|
+
@classmethod
|
434
|
+
def _get_csv_reader(cls, filename) -> CsvReader:
|
435
|
+
return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')
|
436
|
+
|
437
|
+
|
438
|
+
class ItemTsvManager(ItemManagerMixin, TsvManager):
|
439
|
+
"""
|
440
|
+
This layers item-style row processing functionality on a TSV file.
|
441
|
+
"""
|
442
|
+
pass
|
443
|
+
|
444
|
+
|
445
|
+
class ItemManager(AbstractTableSetManager):
    """
    This class will open a .xlsx, .csv or .tsv file and load its content in our standard format.
    (See more detailed description in AbstractTableSetManager.)
    """

    @classmethod
    def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager:
        """
        Returns a manager appropriate to the given filename, chosen by its extension.

        :param filename: the name of a .xlsx, .csv or .tsv file (the extension is matched
            without regard to case).
        :param tab_name: for .csv and .tsv files, the name to use for the file's only tab.
            This must not be given for .xlsx files, which supply their own tab names.
        :raises ValueError: if the extension is not recognized,
            or if tab_name is given for a .xlsx file.
        """
        extension_source = filename.lower()  # so that names like DATA.XLSX are also recognized
        if extension_source.endswith(".xlsx"):
            if tab_name is not None:
                raise ValueError(f".xlsx files don't need tab_name={tab_name!r}")
            return ItemXlsxManager(filename)
        if extension_source.endswith(".csv"):
            return ItemCsvManager(filename, tab_name=tab_name)
        if extension_source.endswith(".tsv"):
            return ItemTsvManager(filename, tab_name=tab_name)
        raise ValueError(f"Unknown file type: {filename}")

    @classmethod
    def load(cls, filename: str, tab_name=None) -> AnyJsonData:
        """
        Loads the content of the named file using the manager chosen by create_implementation_manager,
        returning whatever that manager's load_content() returns.
        """
        manager = cls.create_implementation_manager(filename, tab_name=tab_name)
        return manager.load_content()
|
469
|
+
|
470
|
+
|
471
|
+
load_items = ItemManager.load
|
@@ -0,0 +1,92 @@
|
|
1
|
+
import json
|
2
|
+
from dcicutils.ff_utils import get_metadata, search_metadata
|
3
|
+
from dcicutils.creds_utils import CGAPKeyManager
|
4
|
+
|
5
|
+
|
6
|
+
class VariantUtils:
    """
    Helpers for searching VariantSample items by gene and summarizing the results.

    An instance holds the credentials for one environment (looked up by env_name through
    a CGAPKeyManager) and the server URL taken from those credentials.
    """

    SEARCH_VARIANTS_BY_GENE = ('/search/?type=VariantSample&limit=1'
                               '&variant.genes.genes_most_severe_gene.display_title=')
    SEARCH_RARE_VARIANTS_BY_GENE = ('/search/?samplegeno.samplegeno_role=proband&type=VariantSample'
                                    '&variant.csq_gnomadg_af_popmax.from=0&variant.csq_gnomadg_af_popmax.to=0.001'
                                    '&variant.genes.genes_most_severe_gene.display_title=')

    def __init__(self, *, env_name) -> None:
        self._key_manager = CGAPKeyManager()
        self.creds = self._key_manager.get_keydict_for_env(env=env_name)
        self.base_url = self.creds['server']

    def get_creds(self):
        """Returns the key dictionary obtained for the environment given at construction."""
        return self.creds

    def get_rare_variants_by_gene(self, *, gene, sort, addon=''):
        """Searches for rare variants on a particular gene"""
        # The URL is assembled in a single expression so that no whitespace from source-code
        # line continuation can end up inside it.
        # NOTE(review): SEARCH_RARE_VARIANTS_BY_GENE already starts with '/', so this yields
        # '<server>//search/...'. That is how the URL was previously formed; confirm it is intended.
        url = f'{self.base_url}/{self.SEARCH_RARE_VARIANTS_BY_GENE}{gene}&sort=-{sort}{addon}'
        return search_metadata(url, key=self.creds)

    def find_number_of_sample_ids(self, gene):
        """Returns the number of samples that have a mutation on the specified gene"""
        return len({variant.get('CALL_INFO')
                    for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID')})

    def get_total_result_count_from_search(self, gene):
        """Returns total number of variants associated with specified gene"""
        res = get_metadata(self.SEARCH_VARIANTS_BY_GENE + gene, key=self.creds)
        return res['total']

    @staticmethod
    def sort_dict_in_descending_order(unsorted_dict):
        """Sorts dictionary in descending value order"""
        sorted_list = sorted(unsorted_dict.items(), key=lambda x: x[1], reverse=True)
        return dict(sorted_list)

    def create_dict_of_mutations(self, gene):
        """Creates dictionary of specified gene and mutations that occur 10+ times in database, in the form:
        {gene: {mutation1 pos: #variants, mutation2 pos: #variants, ...}"""
        counts_by_pos = {}
        for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID'):
            pos = variant['variant']['POS']
            counts_by_pos[pos] = counts_by_pos.get(pos, 0) + 1
        frequent = {pos: count for pos, count in counts_by_pos.items() if count >= 10}
        return {gene: self.sort_dict_in_descending_order(frequent)}

    @staticmethod
    def return_json(file_name):
        """Returns the parsed content of the specified json file"""
        with open(file_name, 'r') as f:
            # json.load reads from a file object; json.loads accepts only str/bytes
            # and raises TypeError when handed the file itself.
            return json.load(f)

    @staticmethod
    def create_dict_from_json_file(file_name):
        """Creates dictionary object from specified json file"""
        with open(file_name) as f:
            return json.load(f)

    def _gene_symbols_with_summary_keywords(self, keywords):
        """Returns the gene_symbol of each gene in 'gene.json' whose gene_summary contains any of the keywords"""
        genes = self.return_json('gene.json')
        return [gene['gene_symbol'] for gene in genes
                if any(keyword in gene.get('gene_summary', '') for keyword in keywords)]

    def create_list_of_msa_genes(self):
        """Creates list of genes relating to the brain or nervous system
        (determined by whether keywords 'neur' or 'nerv' in summary)"""
        return self._gene_symbols_with_summary_keywords(('nerv', 'neur'))

    def create_url(self, gene):
        """Returns a url to the variants at the most commonly mutated position of specified gene"""
        d = self.create_dict_from_json_file('10+sorted_msa_genes_and_mutations.json')
        # The first key is used as the position; this relies on the file's entries for the gene
        # having been written in descending order of count.
        pos = list(d[gene].keys())[0]
        return self.SEARCH_RARE_VARIANTS_BY_GENE + gene + f'&variant.POS.from={pos}&variant.POS.to={pos}&sort=-DP'

    def create_list_of_als_park_genes(self):
        """Creates list of genes relating to Parkinson's or ALS
        (determined by whether keywords 'Parkinson' or 'ALS' in summary)"""
        return self._gene_symbols_with_summary_keywords(('Parkinson', 'ALS'))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dcicutils
|
3
|
-
Version: 7.
|
3
|
+
Version: 7.8.1.1b1
|
4
4
|
Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
|
5
5
|
Home-page: https://github.com/4dn-dcic/utils
|
6
6
|
License: MIT
|
@@ -25,6 +25,7 @@ Requires-Dist: PyYAML (>=5.1,<5.5)
|
|
25
25
|
Requires-Dist: aws-requests-auth (>=0.4.2,<1)
|
26
26
|
Requires-Dist: boto3 (>=1.17.39,<2.0.0)
|
27
27
|
Requires-Dist: botocore (>=1.20.39,<2.0.0)
|
28
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
28
29
|
Requires-Dist: docker (>=4.4.4,<5.0.0)
|
29
30
|
Requires-Dist: elasticsearch (==7.13.4)
|
30
31
|
Requires-Dist: gitpython (>=3.1.2,<4.0.0)
|
@@ -23,7 +23,7 @@ dcicutils/env_utils_legacy.py,sha256=J81OAtJHN69o1beHO6q1j7_J6TeblSjnAHlS8VA5KSM
|
|
23
23
|
dcicutils/es_utils.py,sha256=ZksLh5ei7kRUfiFltk8sd2ZSfh15twbstrMzBr8HNw4,7541
|
24
24
|
dcicutils/exceptions.py,sha256=4giQGtpak-omQv7BP6Ckeu91XK5fnDosC8gfdmN_ccA,9931
|
25
25
|
dcicutils/ff_mocks.py,sha256=6RKS4eUiu_Wl8yP_8V0CaV75w4ZdWxdCuL1CVlnMrek,36918
|
26
|
-
dcicutils/ff_utils.py,sha256=
|
26
|
+
dcicutils/ff_utils.py,sha256=cQDboe7GuTaNW-LYqTCDsja6JfeF16DZJKsZDPgxnqw,66454
|
27
27
|
dcicutils/function_cache_decorator.py,sha256=XMyiEGODVr2WoAQ68vcoX_9_Xb9p8pZXdXl7keU8i2g,10026
|
28
28
|
dcicutils/glacier_utils.py,sha256=x4zRGeSBS9c3LeurjR2gvEr_ipDTVpULvRFsIMfOVrs,33704
|
29
29
|
dcicutils/jh_utils.py,sha256=Gpsxb9XEzggF_-Eq3ukjKvTnuyb9V1SCSUXkXsES4Kg,11502
|
@@ -43,13 +43,14 @@ dcicutils/redis_utils.py,sha256=VJ-7g8pOZqR1ZCtdcjKz3-6as2DMUcs1b1zG6wSprH4,6462
|
|
43
43
|
dcicutils/s3_utils.py,sha256=a9eU3Flh8Asc8xPWLGP16A6UQ_FVwhoFQNqm4ZYgSQ4,28852
|
44
44
|
dcicutils/scripts/publish_to_pypi.py,sha256=qmWyjrg5bNQNfpNKFTZdyMXpRmrECnRV9VmNQddUPQA,13576
|
45
45
|
dcicutils/secrets_utils.py,sha256=8dppXAsiHhJzI6NmOcvJV5ldvKkQZzh3Fl-cb8Wm7MI,19745
|
46
|
-
dcicutils/sheet_utils.py,sha256=
|
46
|
+
dcicutils/sheet_utils.py,sha256=J7p_bsn_nCGZZQ1oShZx4jGUVqhriAhV_AvmKvl1JQ4,18864
|
47
47
|
dcicutils/snapshot_utils.py,sha256=ymP7PXH6-yEiXAt75w0ldQFciGNqWBClNxC5gfX2FnY,22961
|
48
48
|
dcicutils/ssl_certificate_utils.py,sha256=F0ifz_wnRRN9dfrfsz7aCp4UDLgHEY8LaK7PjnNvrAQ,9707
|
49
49
|
dcicutils/task_utils.py,sha256=MF8ujmTD6-O2AC2gRGPHyGdUrVKgtr8epT5XU8WtNjk,8082
|
50
50
|
dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
|
51
|
-
dcicutils
|
52
|
-
dcicutils-7.
|
53
|
-
dcicutils-7.
|
54
|
-
dcicutils-7.
|
55
|
-
dcicutils-7.
|
51
|
+
dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
|
52
|
+
dcicutils-7.8.1.1b1.dist-info/LICENSE.txt,sha256=t0_-jIjqxNnymZoNJe-OltRIuuF8qfhN0ATlHyrUJPk,1102
|
53
|
+
dcicutils-7.8.1.1b1.dist-info/METADATA,sha256=ik6xODy1hDg8xBIHMdn4-lQUo4QhLYZNZ8wff66yKfQ,3083
|
54
|
+
dcicutils-7.8.1.1b1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
55
|
+
dcicutils-7.8.1.1b1.dist-info/entry_points.txt,sha256=Z3vezbXsTpTIY4N2F33c5e-WDVQxgz_Vsk1oV_JBN7A,146
|
56
|
+
dcicutils-7.8.1.1b1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|