dcicutils 7.8.0__tar.gz → 7.8.1.1b1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of dcicutils might be problematic. Click here for more details.
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/PKG-INFO +3 -1
- dcicutils-7.8.1.1b1/dcicutils/sheet_utils.py +471 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/pyproject.toml +9 -6
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/LICENSE.txt +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/README.rst +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/__init__.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/base.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/beanstalk_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/cloudformation_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/codebuild_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/command_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/common.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/contribution_scripts.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/contribution_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/creds_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/data_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/deployment_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/diff_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/docker_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ecr_scripts.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ecr_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ecs_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/env_base.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/env_manager.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/env_scripts.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/env_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/env_utils_legacy.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/es_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/exceptions.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ff_mocks.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ff_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/function_cache_decorator.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/glacier_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/jh_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/kibana/dashboards.json +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/kibana/readme.md +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/lang_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/license_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/log_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/misc_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/obfuscation_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/opensearch_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/project_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/qa_checkers.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/qa_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/redis_tools.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/redis_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/s3_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/scripts/publish_to_pypi.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/secrets_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/snapshot_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/ssl_certificate_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/task_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/trace_utils.py +0 -0
- {dcicutils-7.8.0 → dcicutils-7.8.1.1b1}/dcicutils/variant_utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dcicutils
|
3
|
-
Version: 7.8.0
|
3
|
+
Version: 7.8.1.1b1
|
4
4
|
Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
|
5
5
|
Home-page: https://github.com/4dn-dcic/utils
|
6
6
|
License: MIT
|
@@ -25,9 +25,11 @@ Requires-Dist: PyYAML (>=5.1,<5.5)
|
|
25
25
|
Requires-Dist: aws-requests-auth (>=0.4.2,<1)
|
26
26
|
Requires-Dist: boto3 (>=1.17.39,<2.0.0)
|
27
27
|
Requires-Dist: botocore (>=1.20.39,<2.0.0)
|
28
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
28
29
|
Requires-Dist: docker (>=4.4.4,<5.0.0)
|
29
30
|
Requires-Dist: elasticsearch (==7.13.4)
|
30
31
|
Requires-Dist: gitpython (>=3.1.2,<4.0.0)
|
32
|
+
Requires-Dist: openpyxl (>=3.1.2,<4.0.0)
|
31
33
|
Requires-Dist: opensearch-py (>=2.0.1,<3.0.0)
|
32
34
|
Requires-Dist: pyOpenSSL (>=23.1.1,<24.0.0)
|
33
35
|
Requires-Dist: pytz (>=2020.4)
|
@@ -0,0 +1,471 @@
|
|
1
|
+
import chardet
|
2
|
+
import copy
|
3
|
+
import csv
|
4
|
+
import io
|
5
|
+
import openpyxl
|
6
|
+
|
7
|
+
from dcicutils.common import AnyJsonData
|
8
|
+
from dcicutils.misc_utils import ignored
|
9
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
10
|
+
from openpyxl.workbook.workbook import Workbook
|
11
|
+
from tempfile import TemporaryFile
|
12
|
+
from typing import Any, Dict, Iterable, List, Union
|
13
|
+
|
14
|
+
|
15
|
+
# Type aliases used in this module's annotations.
Header = str
Headers = List[str]
ParsedHeader = List[Union[str, int]]
ParsedHeaders = List[ParsedHeader]
SheetCellValue = Union[int, float, str]
SheetRow = List[SheetCellValue]
# The csv module does not export its reader class, so recover it from a throwaway reader.
# An in-memory buffer is used for the probe so that importing this module does not create
# (and leave unclosed) a real temporary file on disk.
CsvReader = type(csv.reader(io.StringIO()))
|
22
|
+
|
23
|
+
|
24
|
+
def prefer_number(value: SheetCellValue):
    """
    Converts a string that looks like a number into an int or float, preferring int.

    * A non-string value is returned unchanged.
    * An empty string yields None.
    * A string is only considered for conversion if it starts with '+', '-' or a digit.
      If it cannot be parsed as an int or a float, the original string is returned.
    * Any other string is returned unchanged.
    """
    if not isinstance(value, str):  # the given value might already be an int or float, so just pass it through
        return value
    if not value:
        return None
    ch0 = value[0]
    if ch0 in '+-' or ch0.isdigit():
        # Try the narrower type first so that "12" becomes 12 rather than 12.0.
        for converter in (int, float):
            try:
                return converter(value)
            except ValueError:
                # Not parseable by this converter; try the next one, or give up below.
                continue
    # If we couldn't parse it as an int or float, return the original value.
    return value
|
42
|
+
|
43
|
+
|
44
|
+
def open_text_input_file_respecting_byte_order_mark(filename):
    """
    Opens a file for text input, respecting a byte-order mark (BOM).

    The leading bytes of the file are passed to chardet to pick an encoding, and the file is then
    reopened in text mode with that encoding. The caller is responsible for closing the returned file.
    """
    with io.open(filename, 'rb') as fp:
        leading_bytes = fp.read(4 * 8)  # reads 32 bytes; a BOM itself occupies at most 4 of them
        bom_info = chardet.detect(leading_bytes)
        detected_encoding = bom_info and bom_info.get('encoding')  # tread lightly
    # Only a short prefix was examined, so a verdict of 'ascii' says nothing about the rest of the file.
    # UTF-8 decodes every ASCII file identically, so prefer it rather than failing on a later non-ASCII byte.
    if detected_encoding and detected_encoding.lower() == 'ascii':
        use_encoding = 'utf-8'
    else:
        use_encoding = detected_encoding
    return io.open(filename, 'r', encoding=use_encoding)
|
54
|
+
|
55
|
+
|
56
|
+
class ItemTools:
    """
    Implements operations on table-related data without pre-supposing the specific representation of the table.
    It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because
    it does not presuppose the source of the data nor where it will be written to.

    For the purpose of this class:

    * a 'header' is a string representing the top of a column.

    * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
      "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing
      each numeric token as an int instead of a string.

    * a 'headers' object is just a list of strings, each of which is a 'header'.

    * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
      e.g., the headers ["a.b.c", "x.y#0"] is represented as parsed headers [["a", "b", "c"], ["x", "y", 0]].

    """

    @classmethod
    def parse_sheet_header(cls, header: Header) -> ParsedHeader:
        """
        Splits a header at each '.' or '#' and returns the non-empty tokens,
        with all-decimal tokens converted to ints.
        """
        tokens = []
        token = ""
        for ch in header:
            if ch in '.#':
                if token:  # consecutive or leading separators contribute no token
                    tokens.append(token)
                token = ""
            else:
                token += ch
        if token:
            tokens.append(token)
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
        # superscript digits, which int() rejects with a ValueError.
        return [int(t) if t.isdecimal() else t for t in tokens]

    @classmethod
    def parse_sheet_headers(cls, headers: Headers):
        """Returns the list of parsed headers corresponding to the given list of headers."""
        return [cls.parse_sheet_header(header)
                for header in headers]

    @classmethod
    def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
        """
        Builds a nested dict/list skeleton with a slot for every parsed header.

        Raises ValueError if any parsed header begins with an int.
        """
        prototype = {}
        for parsed_header in parsed_headers:
            parsed_header0 = parsed_header[0]
            if isinstance(parsed_header0, int):
                raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}")
            cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
        return prototype

    @classmethod
    def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
        """
        Makes sure parent has a slot for keys[0], then recurses into that slot for the remaining keys.

        A newly created slot is a list if the next key is an int, a dict if it is a str, and None
        if there is no next key. Returns parent.

        Raises ValueError if an int key would skip past the end of a list.
        """
        [key0, *more_keys] = keys
        key1 = more_keys[0] if more_keys else None
        if isinstance(key1, int):
            placeholder = []
        elif isinstance(key1, str):
            placeholder = {}
        else:
            placeholder = None
        if isinstance(key0, int):
            n = len(parent)
            if key0 == n:
                parent.append(placeholder)
            elif key0 > n:
                # ValueError for consistency with compute_patch_prototype; it is still an Exception.
                raise ValueError("Numeric items must occur sequentially.")
        elif isinstance(key0, str):
            if key0 not in parent:
                parent[key0] = placeholder
        if key1 is not None:
            cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
        return parent

    @classmethod
    def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData:
        """
        Heuristically converts a cell value to JSON data.

        For strings (compared case-insensitively): 'true'/'false' become booleans, 'null' and ''
        become None, a string containing '|' becomes a list of its recursively parsed parts,
        and anything else is passed to prefer_number. Non-strings are returned unchanged.
        """
        if isinstance(value, str):
            lvalue = value.lower()
            # TODO: We could consult a schema to make this less heuristic, but this may do for now
            if lvalue == 'true':
                return True
            elif lvalue == 'false':
                return False
            elif lvalue == 'null' or lvalue == '':
                return None
            elif '|' in value:
                return [cls.parse_item_value(subvalue) for subvalue in value.split('|')]
            else:
                return prefer_number(value)
        else:  # presumably a number (int or float)
            return value

    @classmethod
    def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
        """
        Stores value in datum at the location named by path, which must already exist in datum.

        A value of None or '' is skipped unless force is true.
        """
        if (value is None or value == '') and not force:
            return
        [key, *more_path] = path
        if not more_path:
            datum[key] = value
        else:
            # force must be passed down, or a forced None/'' would be dropped one level in.
            cls.set_path_value(datum[key], more_path, value, force=force)
|
158
|
+
|
159
|
+
|
160
|
+
# TODO: Consider whether this might want to be an abstract base class. Some change might be needed.
|
161
|
+
#
|
162
|
+
# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class.
|
163
|
+
# I am less certain but open to discussion. Among other things, as implemented now,
|
164
|
+
# the __init__ method here needs to run and the documentation says that ABC's won't appear
|
165
|
+
# in the method resolution order. -kmp 17-Aug-2023
|
166
|
+
# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535
|
167
|
+
class AbstractTableSetManager:
    """
    The root class for anything that can load a table set, whatever the source.

    A table set manager must support a load method that takes a filename and returns
    the file content in the form:
        {
            "Sheet1": [
                {...representation of row1 as some kind of dict...},
                {...representation of row2 as some kind of dict...}
            ],
            "Sheet2": [...],
            ...,
        }
    At this level of abstraction no position is taken on how rows are represented,
    as long as each is JSON data of some kind. A row might be flat, like
        {"col1": "val1", "col2": "val2", ...}
    or more structured, like
        {"something": "val1", "other": {"something_else": ["val2"]}}
    Values may also be transformed along the way (most likely "123" to 123 or "" to None),
    but whether and how that happens is not constrained by this class.
    """

    def __init__(self, **kwargs):
        # Keywords that reach this root class were not consumed by any subclass, so reject them.
        if not kwargs:
            return
        raise ValueError(f"Got unexpected keywords: {kwargs}")

    # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
    @classmethod
    def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
        """
        Reads the named file and returns a dictionary mapping sheet names to rows of dictionary data.
        This base implementation always raises NotImplementedError.
        """
        message = f".load(...) is not implemented for {cls.__name__}."
        raise NotImplementedError(message)
|
202
|
+
|
203
|
+
|
204
|
+
class BasicTableSetManager(AbstractTableSetManager):
    """
    Provides the structure most kinds of parsers need: somewhere to keep the headers
    and somewhere to keep the content of each sheet (tab).

    A csv file, which has no tabs, is treated as the degenerate case with a single
    set of headers and a single block of content.
    """

    def __init__(self, filename: str, **kwargs):
        super().__init__(**kwargs)
        self.filename: str = filename
        # Per-tab storage, keyed by tab name.
        self.content_by_tabname: Dict[str, List[AnyJsonData]] = {}
        self.headers_by_tabname: Dict[str, List[str]] = {}
        # Obtained last, after self.filename and the per-tab storage are in place.
        self.reader_agent: Any = self._get_reader_agent()

    def tab_headers(self, tabname: str) -> List[str]:
        """Returns the stored headers for the named tab (KeyError if none are stored)."""
        headers_table = self.headers_by_tabname
        return headers_table[tabname]

    def tab_content(self, tabname: str) -> List[AnyJsonData]:
        """Returns the stored content for the named tab (KeyError if none is stored)."""
        content_table = self.content_by_tabname
        return content_table[tabname]

    def _create_tab_processor_state(self, tabname: str) -> Any:
        """
        Hook for parsers that want auxiliary state (parsed headers, a line count, a table of
        temporary names for cross-linking, etc.) carried along while parsing a tab line by line.
        This default creates no state and returns None; subclasses may do something more interesting.
        """
        ignored(tabname)  # subclasses might need this, but we don't
        return None

    def _get_reader_agent(self) -> Any:
        """This function is responsible for opening the workbook and returning a workbook object."""
        class_name = self.__class__.__name__
        raise NotImplementedError(f"._get_reader_agent() is not implemented for {class_name}.")

    def load_content(self) -> Any:
        """Subclasses must override this; this default always raises NotImplementedError."""
        class_name = self.__class__.__name__
        raise NotImplementedError(f".load_content() is not implemented for {class_name}.")
|
241
|
+
|
242
|
+
|
243
|
+
class TableSetManager(BasicTableSetManager):
    """
    Implements load_content as a loop over tabs and rows, delegating to subclasses for the
    list of tab names, the raw rows of each tab, and the processing of each row.
    """

    @classmethod
    def load(cls, filename: str) -> AnyJsonData:
        """Creates a manager of this class for the given filename and returns its loaded content."""
        table_set_manager: TableSetManager = cls(filename)
        return table_set_manager.load_content()

    def __init__(self, filename: str):
        super().__init__(filename=filename)

    @property
    def tabnames(self) -> List[str]:
        """The names of the tabs to load. Subclasses must override this."""
        raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}.")

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """
        Given a tabname, return an iterable of raw rows (lists of cell values) for that tab.
        Subclasses must override this.
        """
        # The message names this method, so the error points at what actually needs overriding.
        raise NotImplementedError(f"._raw_row_generator_for_tabname(...) is not implemented"
                                  f" for {self.__class__.__name__}.")

    def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
        """
        Takes the state created by _create_tab_processor_state and a raw row, and returns
        the processed representation of that row.
        What constitutes processing is up to the subclass, but the result must be JSON data.
        Subclasses must override this.
        """
        raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")

    def load_content(self) -> AnyJsonData:
        """
        Processes every row of every tab, stores the results in content_by_tabname,
        and returns that dictionary.
        """
        for tabname in self.tabnames:
            sheet_content = []
            # The state must be created before rows are read (the csv reader takes its header row here).
            state = self._create_tab_processor_state(tabname)
            for row_data in self._raw_row_generator_for_tabname(tabname):
                processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data)
                sheet_content.append(processed_row_data)
            self.content_by_tabname[tabname] = sheet_content
        return self.content_by_tabname

    @classmethod
    def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
        """Converts a raw cell value using prefer_number."""
        return prefer_number(value)
|
284
|
+
|
285
|
+
|
286
|
+
class XlsxManager(TableSetManager):
    """
    Supplies the rows of each sheet of an XLSX workbook, read through openpyxl.
    Row 1 of each sheet is taken as the header row; data rows start at row 2.
    """

    @classmethod
    def _all_rows(cls, sheet: Worksheet):
        """Yields the 1-based indices of the data rows, i.e. row 2 through the sheet's max_row."""
        yield from range(2, sheet.max_row + 1)

    @classmethod
    def _all_cols(cls, sheet: Worksheet):
        """Yields the 1-based column indices from 1 through the sheet's max_column."""
        yield from range(1, sheet.max_column + 1)

    @property
    def tabnames(self) -> List[str]:
        """The sheet names of the open workbook."""
        workbook = self.reader_agent
        return workbook.sheetnames

    def _get_reader_agent(self) -> Workbook:
        """Opens the workbook named by self.filename."""
        return openpyxl.load_workbook(self.filename)

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """Returns a lazy iterable over the raw cell values of each data row of the named sheet."""
        worksheet = self.reader_agent[tabname]
        return (self._get_raw_row_content_tuple(worksheet, row_index)
                for row_index in self._all_rows(worksheet))

    def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
        """Returns a list of the cell values across all columns of the given row."""
        return [sheet.cell(row=row, column=col_index).value
                for col_index in self._all_cols(sheet)]

    def _create_tab_processor_state(self, tabname: str) -> Headers:
        """
        Reads row 1 of the named sheet as headers (each converted with str), records them
        in headers_by_tabname under the sheet's title, and returns them as the processing state.
        """
        worksheet = self.reader_agent[tabname]
        header_row: List[str] = [str(worksheet.cell(row=1, column=col_index).value)
                                 for col_index in self._all_cols(worksheet)]
        self.headers_by_tabname[worksheet.title] = header_row
        return header_row

    def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """Returns a dict mapping each header to the parsed value of the cell in the same position."""
        ignored(tabname)
        return {headers[position]: self.parse_cell_value(cell)
                for position, cell in enumerate(row_data)}
|
330
|
+
|
331
|
+
|
332
|
+
class ItemManagerMixin(BasicTableSetManager):
    """
    Mixed into a reader such as an XlsxManager or a CsvManager, this makes rows come out
    as Items (nested structures shaped by the headers) rather than flat table rows.
    """

    def __init__(self, filename: str, **kwargs):
        super().__init__(filename=filename, **kwargs)
        # Both tables are keyed by tab name and are filled in by _compile_sheet_headers.
        self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {}
        self.patch_prototypes_by_tabname: Dict[str, Dict] = {}

    def sheet_patch_prototype(self, tabname: str) -> Dict:
        """Returns the patch prototype compiled for the named tab (KeyError if not yet compiled)."""
        prototypes = self.patch_prototypes_by_tabname
        return prototypes[tabname]

    def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]:
        """Returns the parsed headers compiled for the named tab (KeyError if not yet compiled)."""
        parsed = self.parsed_headers_by_tabname
        return parsed[tabname]

    def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders:
        """
        Lets the next class in the method resolution order set up its state first, then
        compiles the tab's headers and returns the parsed headers as the processing state.
        """
        super()._create_tab_processor_state(tabname)
        # Compiling gives us a prototype we can copy for each row and then drop values into,
        # so that each value is efficiently assigned to the right place.
        self._compile_sheet_headers(tabname)
        return self.sheet_parsed_headers(tabname)

    def _compile_sheet_headers(self, tabname: str):
        """Parses the tab's stored headers and records both the parsed headers and the patch prototype."""
        raw_headers = self.headers_by_tabname[tabname]
        compiled_headers = ItemTools.parse_sheet_headers(raw_headers)
        self.parsed_headers_by_tabname[tabname] = compiled_headers
        self.patch_prototypes_by_tabname[tabname] = ItemTools.compute_patch_prototype(compiled_headers)

    def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData:
        """
        Returns a fresh copy of the tab's patch prototype with each parsed cell value stored
        at the path given by the parsed header in the same position.
        """
        item = copy.deepcopy(self.sheet_patch_prototype(tabname))
        for position, cell in enumerate(row_data):
            ItemTools.set_path_value(item, parsed_headers[position], self.parse_cell_value(cell))
        return item

    @classmethod
    def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
        """Converts a raw cell value using ItemTools.parse_item_value."""
        return ItemTools.parse_item_value(value)
|
373
|
+
|
374
|
+
|
375
|
+
class ItemXlsxManager(ItemManagerMixin, XlsxManager):
    """
    An XlsxManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
|
380
|
+
|
381
|
+
|
382
|
+
class CsvManager(TableSetManager):
    """
    This implements the mechanism to get a series of rows out of the sheet in a csv file,
    returning a result that still looks like there could have been multiple tabs.
    """

    DEFAULT_TAB_NAME = 'Sheet1'

    def __init__(self, filename: str, tab_name=None):
        super().__init__(filename=filename)
        # The single tab's name; a csv file has no tab names of its own.
        self.tab_name = tab_name or self.DEFAULT_TAB_NAME

    @property
    def tabnames(self) -> List[str]:
        """A one-element list holding the name of the single tab."""
        return [self.tab_name]

    def _get_reader_agent(self) -> CsvReader:
        """Returns a csv reader over the file named by self.filename."""
        return self._get_csv_reader(self.filename)

    @classmethod
    def _get_csv_reader(cls, filename) -> CsvReader:
        """Opens the named file (respecting any byte-order mark) and returns a csv reader on it."""
        return csv.reader(open_text_input_file_respecting_byte_order_mark(filename))

    def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
        """Returns the csv reader itself, which yields whatever rows have not yet been consumed."""
        return self.reader_agent

    def _create_tab_processor_state(self, tabname: str) -> Headers:
        """
        Takes the first row from the reader as the tab's headers (unless headers are already
        recorded for the tab), records them in headers_by_tabname, and returns them.
        An empty file yields an empty list of headers.
        """
        headers: Headers = self.headers_by_tabname.get(tabname)
        if headers is None:
            # The default keeps an empty file from surfacing as a bare StopIteration.
            self.headers_by_tabname[tabname] = headers = next(self.reader_agent, [])
        return headers

    def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
        """Returns a dict mapping each header to the parsed value of the cell in the same position."""
        ignored(tabname)
        return {headers[i]: self.parse_cell_value(row_datum)
                for i, row_datum in enumerate(row_data)}
|
419
|
+
|
420
|
+
|
421
|
+
class ItemCsvManager(ItemManagerMixin, CsvManager):
    """
    A CsvManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
|
426
|
+
|
427
|
+
|
428
|
+
class TsvManager(CsvManager):
    """
    Handles TSV files, which are treated as CSV files that use a tab as the separator.
    (No escaping of unusual characters is handled at present; backslash escaping may need to be added.)
    """

    @classmethod
    def _get_csv_reader(cls, filename) -> CsvReader:
        """Opens the named file (respecting any byte-order mark) and returns a tab-delimited csv reader on it."""
        text_stream = open_text_input_file_respecting_byte_order_mark(filename)
        return csv.reader(text_stream, delimiter='\t')
|
436
|
+
|
437
|
+
|
438
|
+
class ItemTsvManager(ItemManagerMixin, TsvManager):
    """
    A TsvManager whose rows are processed item-style, by way of ItemManagerMixin.
    """
|
443
|
+
|
444
|
+
|
445
|
+
class ItemManager(AbstractTableSetManager):
    """
    Opens a .xlsx, .csv or .tsv file, chosen by filename suffix, and loads its content
    in our standard format. (See the more detailed description in AbstractTableSetManager.)
    """

    @classmethod
    def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager:
        """
        Returns the item-style manager matching the filename's suffix.

        Raises ValueError for an unrecognized suffix, or if tab_name is given for a .xlsx file.
        """
        if filename.endswith(".xlsx"):
            if tab_name is not None:
                raise ValueError(f".xlsx files don't need tab_name={tab_name!r}")
            return ItemXlsxManager(filename)
        if filename.endswith(".csv"):
            return ItemCsvManager(filename, tab_name=tab_name)
        if filename.endswith(".tsv"):
            return ItemTsvManager(filename, tab_name=tab_name)
        raise ValueError(f"Unknown file type: {filename}")

    @classmethod
    def load(cls, filename: str, tab_name=None) -> AnyJsonData:
        """Creates the appropriate manager for the file and returns its loaded content."""
        implementation = cls.create_implementation_manager(filename, tab_name=tab_name)
        return implementation.load_content()
|
469
|
+
|
470
|
+
|
471
|
+
# Module-level alias, so that load_items(filename, tab_name=...) calls ItemManager.load.
load_items = ItemManager.load
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "dcicutils"
|
3
|
-
version = "7.8.0"
|
3
|
+
version = "7.8.1.1b1" # to become "7.9.0"
|
4
4
|
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
|
5
5
|
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
|
6
6
|
license = "MIT"
|
@@ -37,28 +37,31 @@ classifiers = [
|
|
37
37
|
|
38
38
|
[tool.poetry.dependencies]
|
39
39
|
python = ">=3.7,<3.10"
|
40
|
+
|
40
41
|
boto3 = "^1.17.39"
|
41
42
|
botocore = "^1.20.39"
|
42
43
|
# The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version.
|
43
44
|
# This value is intentionally pinned and must not be changed casually.
|
44
45
|
elasticsearch = "7.13.4"
|
45
46
|
aws-requests-auth = ">=0.4.2,<1"
|
47
|
+
chardet = "^5.2.0"
|
46
48
|
docker = "^4.4.4"
|
47
49
|
gitpython = "^3.1.2"
|
50
|
+
openpyxl = "^3.1.2"
|
51
|
+
opensearch-py = "^2.0.1"
|
52
|
+
pyOpenSSL = "^23.1.1"
|
53
|
+
PyJWT = "^2.6.0"
|
48
54
|
pytz = ">=2020.4"
|
49
55
|
PyYAML = ">=5.1,<5.5"
|
56
|
+
redis = "^4.5.1"
|
50
57
|
requests = "^2.21.0"
|
51
58
|
rfc3986 = "^1.4.0"
|
52
59
|
structlog = "^19.2.0"
|
53
60
|
toml = ">=0.10.1,<1"
|
61
|
+
tqdm = "^4.65.0"
|
54
62
|
typing-extensions = ">=3.8" # Fourfront uses 3.8
|
55
63
|
urllib3 = "^1.26.6"
|
56
64
|
webtest = "^2.0.34"
|
57
|
-
opensearch-py = "^2.0.1"
|
58
|
-
redis = "^4.5.1"
|
59
|
-
pyOpenSSL = "^23.1.1"
|
60
|
-
PyJWT = "^2.6.0"
|
61
|
-
tqdm = "^4.65.0"
|
62
65
|
|
63
66
|
|
64
67
|
[tool.poetry.dev-dependencies]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|