CytoTable 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/constants.py +0 -7
- cytotable/convert.py +309 -172
- cytotable/presets.py +56 -0
- cytotable/utils.py +155 -49
- {cytotable-0.0.9.dist-info → cytotable-0.0.11.dist-info}/METADATA +1 -1
- cytotable-0.0.11.dist-info/RECORD +11 -0
- {cytotable-0.0.9.dist-info → cytotable-0.0.11.dist-info}/WHEEL +1 -1
- cytotable-0.0.9.dist-info/RECORD +0 -11
- {cytotable-0.0.9.dist-info → cytotable-0.0.11.dist-info}/LICENSE +0 -0
cytotable/__init__.py
CHANGED
cytotable/constants.py
CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
     ],
 }
 
-# metadata column names and types for internal use within CytoTable
-CYOTABLE_META_COLUMN_TYPES = {
-    "cytotable_meta_source_path": "VARCHAR",
-    "cytotable_meta_offset": "BIGINT",
-    "cytotable_meta_rownum": "BIGINT",
-}
-
 CYTOTABLE_DEFAULT_PARQUET_METADATA = {
     "data-producer": "https://github.com/cytomining/CytoTable",
     "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py
CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.
 
 import itertools
 import logging
-import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
 
     Args:
         source: Dict[str, Any]
-            Contains
+            Contains source data details. Represents a single
             file or table of some kind.
         sort_output:
             Specifies whether to sort cytotable output or not.
@@ -43,10 +42,7 @@ def _get_table_columns_and_types(
             list of dictionaries which each include column level information
     """
 
-    import pathlib
-
     import duckdb
-    from cloudpathlib import AnyPath
 
     from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
 
@@ -89,7 +85,7 @@ def _get_table_columns_and_types(
     # with exception handling to read mixed-type data
     # using sqlite3 and special utility function
     try:
-        # isolate using new connection to read data
+        # isolate using new connection to read data based on pageset
         # and export directly to parquet via duckdb (avoiding need to return data to python)
         # perform the query and create a list of dictionaries with the column data for table
         with _duckdb_reader() as ddb_reader:
@@ -109,13 +105,8 @@ def _get_table_columns_and_types(
             arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
                 source_path=str(source["source_path"]),
                 table_name=str(source["table_name"]),
-
-
-                chunk_size=5,
-                # offset is set to 0 start at first row
-                # result from table
-                offset=0,
-                add_cytotable_meta=False,
+                page_key=source["page_key"],
+                pageset=source["pagesets"][0],
                 sort_output=sort_output,
             )
             with _duckdb_reader() as ddb_reader:
@@ -183,13 +174,114 @@ def _prep_cast_column_data_types(
 
 
 @python_app
-def _get_table_chunk_offsets(
+def _set_tablenumber(
+    sources: Dict[str, List[Dict[str, Any]]],
+    add_tablenumber: Optional[bool] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Gathers a "TableNumber" from the image table (if CSV) or
+    SQLite file (if SQLite source) which is a unique identifier
+    intended to help differentiate between imagenumbers
+    to create distinct records for single-cell profiles
+    referenced across multiple source data exports.
+    For example, ImageNumber column values from CellProfiler
+    will repeat across exports, meaning we may lose distinction
+    when combining multiple export files together through CytoTable.
+
+    Note:
+    - If using CSV data sources, the image.csv table is used for checksum.
+    - If using SQLite data sources, the entire SQLite database is used for checksum.
+
+    Args:
+        sources: Dict[str, List[Dict[str, Any]]]
+            Contains metadata about data tables and related contents.
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber.
+            Note: when False, adds None as the tablenumber
+
+    Returns:
+        List[Dict[str, Any]]
+            New source group with added TableNumber details.
+    """
+
+    from cloudpathlib import AnyPath
+
+    from cytotable.utils import _gather_tablenumber_checksum
+
+    image_table_groups = {
+        # create a data structure with the common parent for each dataset
+        # and the calculated checksum from the image table.
+        # note: the source_path parent is used for non-SQLite files
+        # whereas the direct source path is used for SQLite files.
+        (
+            str(source["source_path"].parent)
+            if source["source_path"].suffix != "sqlite"
+            else source["source_path"]
+        ): source["source_path"]
+        for source_group_name, source_group_vals in sources.items()
+        # use the image tables references only for the basis of the
+        # these calculations.
+        if any(
+            value in str(AnyPath(source_group_name).stem).lower()
+            for value in ["image", "per_image"]
+        )
+        for source in source_group_vals
+    }
+
+    # determine if we need to add tablenumber data
+    if (
+        # case for detecting multiple image tables which need to be differentiated
+        add_tablenumber is None
+        and (len(image_table_groups) <= 1)
+    ) or (
+        # case for explicitly set no tablenumbers
+        add_tablenumber
+        is False
+    ):
+        return {
+            source_group_name: [
+                dict(
+                    source,
+                    **{
+                        "tablenumber": None,
+                    },
+                )
+                for source in source_group_vals
+            ]
+            for source_group_name, source_group_vals in sources.items()
+        }
+
+    # gather the image table from the source_group
+    tablenumber_table = {
+        # create a data structure with the common parent for each dataset
+        # and the calculated checksum from the image table
+        group: _gather_tablenumber_checksum(path)
+        for group, path in image_table_groups.items()
+    }
+
+    # return a modified sources data structure with the tablenumber added
+    return {
+        source_group_name: [
+            dict(
+                source,
+                **{"tablenumber": tablenumber_table[str(source["source_path"].parent)]},
+            )
+            for source in source_group_vals
+            if str(source["source_path"].parent) in list(tablenumber_table.keys())
+        ]
+        for source_group_name, source_group_vals in sources.items()
+    }
+
+
+@python_app
+def _get_table_keyset_pagination_sets(
     chunk_size: int,
+    page_key: str,
     source: Optional[Dict[str, Any]] = None,
     sql_stmt: Optional[str] = None,
-) -> Union[List[int], None]:
+) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
     """
-    Get table data chunk
+    Get table data chunk keys for later use in capturing segments
     of values. This work also provides a chance to catch problematic
     input data which will be ignored with warnings.
 
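The new _set_tablenumber task above only calculates TableNumber values when they are actually needed. A minimal sketch of that decision logic, with invented inputs (not taken from the diff):

def should_add_tablenumber(add_tablenumber, image_table_group_count):
    # mirrors the two early-return cases in _set_tablenumber: a single image
    # table needs no differentiation, and an explicit False always disables it
    if (add_tablenumber is None and image_table_group_count <= 1) or add_tablenumber is False:
        return False
    return True

print(should_add_tablenumber(None, 1))   # False: one plate, nothing to distinguish
print(should_add_tablenumber(None, 3))   # True: several plates gain a TableNumber
print(should_add_tablenumber(False, 3))  # False: explicitly disabled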
@@ -199,21 +291,27 @@ def _get_table_chunk_offsets(
             file or table of some kind.
         chunk_size: int
             The size in rowcount of the chunks to create.
+        page_key: str
+            The column name to be used to identify pagination chunks.
+            Expected to be of numeric type (int, float) for ordering.
+        sql_stmt:
+            Optional sql statement to form the pagination set from.
+            Default behavior extracts pagination sets from the full
+            data source.
 
     Returns:
-        List[
-            List of
-            the data later on.
+        List[Any]
+            List of keys to use for reading the data later on.
     """
 
     import logging
-    import
+    import sqlite3
+    from contextlib import closing
 
     import duckdb
-    from cloudpathlib import AnyPath, CloudPath
 
     from cytotable.exceptions import NoInputDataException
-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _generate_pagesets
 
     logger = logging.getLogger(__name__)
 
@@ -223,18 +321,29 @@ def _get_table_chunk_offsets(
         source_type = str(source_path.suffix).lower()
 
         try:
-            # gather the total rowcount from csv or sqlite data input sources
             with _duckdb_reader() as ddb_reader:
-
-
-
-
-
-
-                ).
-
+                if source_type == ".csv":
+                    sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+                else:
+                    sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+                page_keys = [
+                    results[0] for results in ddb_reader.execute(sql_query).fetchall()
+                ]
+
+        # exception case for when we have mixed types
+        # (i.e. integer col with string and ints) in a sqlite column
+        except duckdb.TypeMismatchException:
+            with closing(sqlite3.connect(source_path)) as cx:
+                with cx:
+                    page_keys = [
+                        key[0]
+                        for key in cx.execute(
+                            f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+                        ).fetchall()
+                        if isinstance(key[0], (int, float))
+                    ]
 
-        # catch input errors which will result in skipped files
         except (
             duckdb.InvalidInputException,
             NoInputDataException,
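A self-contained sketch of the keyset-gathering step shown above (assumes the duckdb package is installed; the CSV file, column name, and chunk size are invented for illustration):

import pathlib
import tempfile

import duckdb

with tempfile.TemporaryDirectory() as tmp:
    csv_path = pathlib.Path(tmp) / "cells.csv"
    csv_path.write_text(
        "ObjectNumber,Area\n" + "\n".join(f"{i},{i * 10}" for i in range(1, 11))
    )

    # read only the pagination column, ordered, as _get_table_keyset_pagination_sets does
    page_keys = [
        row[0]
        for row in duckdb.connect().execute(
            f"SELECT ObjectNumber FROM read_csv_auto('{csv_path}', header=TRUE, delim=',') ORDER BY ObjectNumber"
        ).fetchall()
    ]

# cut the ordered keys into (start, end) ranges of roughly chunk_size keys
chunk_size = 4
pagesets = [
    (page_keys[i], page_keys[min(i + chunk_size, len(page_keys)) - 1])
    for i in range(0, len(page_keys), chunk_size)
]
print(pagesets)  # [(1, 4), (5, 8), (9, 10)]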
@@ -245,34 +354,20 @@ def _get_table_chunk_offsets(
 
         return None
 
-    # find chunk offsets from sql statement
     elif sql_stmt is not None:
-        # gather the total rowcount from csv or sqlite data input sources
         with _duckdb_reader() as ddb_reader:
-
-
-
-                    f"SELECT COUNT(*) FROM ({sql_stmt})"
-                ).fetchone()[0]
-            )
+            sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+            page_keys = ddb_reader.execute(sql_query).fetchall()
+            page_keys = [key[0] for key in page_keys]
 
-        return
-            range(
-                0,
-                # gather rowcount from table and use as maximum for range
-                rowcount,
-                # step through using chunk size
-                chunk_size,
-            )
-        )
+    return _generate_pagesets(page_keys, chunk_size)
 
 
 @python_app
-def _source_chunk_to_parquet(
+def _source_pageset_to_parquet(
     source_group_name: str,
     source: Dict[str, Any],
-
-    offset: int,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     dest_path: str,
     sort_output: bool,
 ) -> str:
@@ -285,10 +380,8 @@ def _source_chunk_to_parquet(
         source: Dict[str, Any]
             Contains the source data to be chunked. Represents a single
             file or table of some kind along with collected information about table.
-
-
-        offset: int
-            The offset for chunking the data from source.
+        pageset: Tuple[int, int]
+            The pageset for chunking the data from source.
         dest_path: str
             Path to store the output data.
         sort_output: bool
@@ -303,9 +396,7 @@ def _source_chunk_to_parquet(
 
     import duckdb
     from cloudpathlib import AnyPath
-    from pyarrow import parquet
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import (
         _duckdb_reader,
         _sqlite_mixed_type_query_to_parquet,
@@ -319,26 +410,18 @@ def _source_chunk_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
-
-
-
-
+    # build tablenumber segment addition (if necessary)
+    tablenumber_sql = (
+        # to become tablenumber in sql select later with bigint (8-byte integer)
+        # we cast here to bigint to avoid concat or join conflicts later due to
+        # misaligned automatic data typing.
+        f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
+        if source["tablenumber"] is not None
+        # don't introduce the column if we aren't supposed to add tablenumber
+        # as per parameter.
+        else ""
     )
-    # build the column selection block of query
 
-    # add cytotable metadata columns
-    cytotable_metadata_cols = [
-        (
-            f"CAST( '{source_path_str}' "
-            f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
-            ' AS "cytotable_meta_source_path"'
-        ),
-        f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
-        (
-            f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
-            ' AS "cytotable_meta_rownum"'
-        ),
-    ]
     # add source table columns
     casted_source_cols = [
         # here we cast the column to the specified type ensure the colname remains the same
@@ -346,10 +429,10 @@ def _source_chunk_to_parquet(
         for column in source["columns"]
     ]
 
-    # create selection statement from lists above
-    select_columns = ",".join(
+    # create selection statement from tablenumber_sql + lists above
+    select_columns = tablenumber_sql + ",".join(
         # if we should sort the output, add the metadata_cols
-
+        casted_source_cols
         if sort_output
        else casted_source_cols
    )
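A small, hypothetical illustration of how the optional TableNumber cast is prepended to the casted column list when building the SELECT (the column names and tablenumber value below are invented):

tablenumber = 1234567890
columns = [
    {"column_name": "ImageNumber", "column_dtype": "BIGINT"},
    {"column_name": "AreaShape_Area", "column_dtype": "DOUBLE"},
]

tablenumber_sql = (
    f"CAST({tablenumber} AS BIGINT) as TableNumber, " if tablenumber is not None else ""
)
casted_source_cols = [
    f"CAST(\"{col['column_name']}\" AS {col['column_dtype']}) AS \"{col['column_name']}\""
    for col in columns
]
select_columns = tablenumber_sql + ",".join(casted_source_cols)
print(f"SELECT {select_columns} FROM ...")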
@@ -364,7 +447,8 @@ def _source_chunk_to_parquet(
     base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
     result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
-
+    # form a filepath which indicates the pageset
+    result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"
 
     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -377,14 +461,9 @@ def _source_chunk_to_parquet(
                 table=ddb_reader.execute(
                     f"""
                     {base_query}
-
-
-
-                    """
-                    if sort_output
-                    else f"""
-                    {base_query}
-                    LIMIT {chunk_size} OFFSET {offset}
+                    WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+                    /* optional ordering per pageset */
+                    {"ORDER BY " + source['page_key'] if sort_output else ""};
                     """
                 ).arrow(),
                 where=result_filepath,
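For reference, the pageset filter replaces the old LIMIT/OFFSET pattern. A hedged sketch of the resulting SQL shape (the identifiers below are invented):

base_query = (
    "SELECT CAST(1 AS BIGINT) as TableNumber, ObjectNumber, AreaShape_Area "
    "FROM sqlite_scan('plate.sqlite', 'Per_Cells')"
)
page_key = "ObjectNumber"
pageset = (1, 1000)
sort_output = True

query = f"""
{base_query}
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
{"ORDER BY " + page_key if sort_output else ""};
"""
print(query)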
@@ -406,10 +485,10 @@ def _source_chunk_to_parquet(
                 table=_sqlite_mixed_type_query_to_parquet(
                     source_path=str(source["source_path"]),
                     table_name=str(source["table_name"]),
-
-
-                    add_cytotable_meta=True if sort_output else False,
+                    page_key=source["page_key"],
+                    pageset=pageset,
                     sort_output=sort_output,
+                    tablenumber=source["tablenumber"],
                 ),
                 where=result_filepath,
             )
@@ -458,10 +537,7 @@ def _prepend_column_name(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-    )
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     from cytotable.utils import _write_parquet_table_with_metadata
 
     logger = logging.getLogger(__name__)
@@ -472,7 +548,7 @@ def _prepend_column_name(
     if len(targets) == 0:
         logger.warning(
             msg=(
-                "Skipping column name prepend operations"
+                "Skipping column name prepend operations "
                 "because no compartments or metadata were provided."
             )
         )
@@ -509,10 +585,8 @@ def _prepend_column_name(
             # source_group_name_stem: 'Cells'
             # column_name: 'AreaShape_Area'
             # updated_column_name: 'Cells_AreaShape_Area'
-            if (
-
-                and not column_name.startswith(source_group_name_stem.capitalize())
-                and column_name not in CYOTABLE_META_COLUMN_TYPES
+            if column_name not in identifying_columns and not column_name.startswith(
+                source_group_name_stem.capitalize()
             ):
                 updated_column_names.append(f"{source_group_name_stem}_{column_name}")
             # if-condition for prepending 'Metadata_' to column name
@@ -574,6 +648,7 @@ def _concat_source_group(
     source_group: List[Dict[str, Any]],
     dest_path: str,
     common_schema: Optional[List[Tuple[str, str]]] = None,
+    sort_output: bool = True,
 ) -> List[Dict[str, Any]]:
     """
     Concatenate group of source data together as single file.
@@ -620,6 +695,8 @@ def _concat_source_group(
         common_schema: List[Tuple[str, str]] (Default value = None)
             Common schema to use for concatenation amongst arrow tables
             which may have slightly different but compatible schema.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
 
     Returns:
         List[Dict[str, Any]]
@@ -637,7 +714,7 @@ def _concat_source_group(
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -676,7 +753,10 @@ def _concat_source_group(
     # (all must be the same schema)
     with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
         for source in source_group:
-
+            tables = [table for table in source["table"]]
+            if sort_output:
+                tables = _natural_sort(tables)
+            for table in tables:
                 # if we haven't inferred the common schema
                 # check that our file matches the expected schema, otherwise raise an error
                 if common_schema is None and not writer_schema.equals(
@@ -720,7 +800,6 @@ def _concat_source_group(
 def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
     joins: str,
-    sort_output: bool,
 ) -> str:
     """
     Prepare join SQL statement with actual locations of data based on the sources.
@@ -741,8 +820,6 @@ def _prepare_join_sql(
     """
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
     # replace with real location of sources for join sql
     order_by_tables = []
     for key, val in sources.items():
@@ -754,25 +831,17 @@ def _prepare_join_sql(
         )
         order_by_tables.append(table_name)
 
-    # create order by statement with from all tables using cytotable metadata
-    order_by_sql = "ORDER BY " + ", ".join(
-        [
-            f"{table}.{meta_column}"
-            for table in order_by_tables
-            for meta_column in CYOTABLE_META_COLUMN_TYPES
-        ]
-    )
-
     # add the order by statements to the join
-    return joins
+    return joins
 
 
 @python_app
-def _join_source_chunk(
+def _join_source_pageset(
     dest_path: str,
     joins: str,
-
-
+    page_key: str,
+    pageset: Tuple[int, int],
+    sort_output: bool,
     drop_null: bool,
 ) -> str:
     """
@@ -798,31 +867,20 @@ def _join_source_chunk(
 
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
-    # Attempt to read the data to parquet file
-    # using duckdb for extraction and pyarrow for
-    # writing data to a parquet file.
-    # read data with chunk size + offset
-    # and export to parquet
-    exclude_meta_cols = [
-        f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
-    ]
-
     with _duckdb_reader() as ddb_reader:
         result = ddb_reader.execute(
             f"""
-
+                WITH joined AS (
                 {joins}
-
-
-
-
-
-
-
-                """
+                )
+                SELECT *
+                FROM joined
+                WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+                /* optional sorting per pagset */
+                {"ORDER BY " + page_key if sort_output else ""};
+                """
         ).arrow()
 
     # drop nulls if specified
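The join task follows the same keyset pattern by wrapping the configured join SQL in a CTE and filtering by the "join" pagination key. A hypothetical sketch of the generated statement (the join SQL and key below are invented):

joins = (
    "SELECT * FROM read_parquet('image.parquet') AS image "
    "JOIN read_parquet('cytoplasm.parquet') AS cytoplasm USING (ImageNumber)"
)
page_key = "Cytoplasm_Number_Object_Number"
pageset = (1, 1000)
sort_output = True

query = f"""
WITH joined AS (
    {joins}
)
SELECT *
FROM joined
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
{"ORDER BY " + page_key if sort_output else ""};
"""
print(query)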
@@ -847,10 +905,8 @@ def _join_source_chunk(
         f"{str(pathlib.Path(dest_path).parent)}/"
         # use the dest_path stem in the name
         f"{str(pathlib.Path(dest_path).stem)}-"
-        #
-
-        # and before they are brought together as one dataset
-        f"{str(uuid.uuid4().hex)}.parquet"
+        # add the pageset indication to the filename
+        f"{pageset[0]}-{pageset[1]}.parquet"
     )
 
     # write the result
@@ -867,6 +923,7 @@ def _concat_join_sources(
     sources: Dict[str, List[Dict[str, Any]]],
     dest_path: str,
     join_sources: List[str],
+    sort_output: bool = True,
 ) -> str:
     """
     Concatenate join sources from parquet-based chunks.
@@ -883,6 +940,8 @@ def _concat_join_sources(
     join_sources: List[str]:
         List of local filepath destination for join source chunks
         which will be concatenated.
+    sort_output: bool
+        Specifies whether to sort cytotable output or not.
 
     Returns:
         str
@@ -898,7 +957,7 @@ def _concat_join_sources(
         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -918,7 +977,11 @@ def _concat_join_sources(
             CYTOTABLE_DEFAULT_PARQUET_METADATA
         )
         with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
-            for table_path in
+            for table_path in (
+                join_sources
+                if not sort_output
+                else _natural_sort(list_to_sort=join_sources)
+            ):
                 writer.write_table(
                     parquet.read_table(
                         table_path,
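Natural sorting matters here because joined chunk files are now named by their pageset bounds; plain lexicographic ordering would interleave them incorrectly. A runnable sketch (file names invented):

import re

def natural_sort(items):
    # sort by alternating text/number components, as _natural_sort does
    return sorted(
        items,
        key=lambda key: [int(c) if c.isdigit() else c for c in re.split("([0-9]+)", str(key))],
    )

files = ["result-10-19.parquet", "result-2-9.parquet", "result-1-1.parquet"]
print(sorted(files))        # ['result-1-1.parquet', 'result-10-19.parquet', 'result-2-9.parquet']
print(natural_sort(files))  # ['result-1-1.parquet', 'result-2-9.parquet', 'result-10-19.parquet']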
@@ -1042,9 +1105,11 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     infer_common_schema: bool,
     drop_null: bool,
     sort_output: bool,
+    page_keys: Dict[str, str],
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
     **kwargs,
-) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
     """
     Export data to parquet.
 
@@ -1082,6 +1147,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             Whether to drop null results.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
+        page_keys: Dict[str, str]
+            A dictionary which defines which column names are used for keyset pagination
+            in order to perform data extraction.
         data_type_cast_map: Dict[str, str]
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
@@ -1112,16 +1180,35 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     # expand the destination path
     expanded_dest_path = _expand_path(path=dest_path)
 
-    #
-
+    # check that each source group name has a pagination key
+    for source_group_name in sources.keys():
+        matching_keys = [
+            key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+        ]
+        if not matching_keys:
+            raise CytoTableException(
+                f"No matching key found in page_keys for source_group_name: {source_group_name}."
+                "Please include a pagination key based on a column name from the table."
+            )
+
+    # prepare pagesets for chunked data export from source tables
+    pagesets_prepared = {
         source_group_name: [
             dict(
                 source,
                 **{
-                    "
+                    "page_key": (
+                        page_key := [
+                            value
+                            for key, value in page_keys.items()
+                            if key.lower() in source_group_name.lower()
+                        ][0]
+                    ),
+                    "pagesets": _get_table_keyset_pagination_sets(
                         source=source,
                         chunk_size=chunk_size,
-
+                        page_key=page_key,
+                    ),
                 },
             )
             for source in source_group_vals
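Pagination keys are matched to source groups by case-insensitive substring, as the check above shows. A brief, hypothetical example of that matching (the group name and keys are invented):

page_keys = {
    "image": "ImageNumber",
    "cells": "ObjectNumber",
    "join": "Cytoplasm_Number_Object_Number",
}
source_group_name = "Per_Cells.sqlite"

matching = [
    value for key, value in page_keys.items() if key.lower() in source_group_name.lower()
]
print(matching[0] if matching else None)  # ObjectNumber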
@@ -1129,17 +1216,17 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in sources.items()
     }
 
-    # if
+    # if pagesets is none and we haven't halted, remove the file as there
     # were input formatting errors which will create challenges downstream
     invalid_files_dropped = {
         source_group_name: [
-            # ensure we have
+            # ensure we have pagesets
             source
             for source in source_group_vals
-            if source["
+            if source["pagesets"] is not None
         ]
         for source_group_name, source_group_vals in evaluate_futures(
-
+            pagesets_prepared
         ).items()
         # ensure we have source_groups with at least one source table
         if len(source_group_vals) > 0
@@ -1164,6 +1251,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in invalid_files_dropped.items()
     }
 
+    # add tablenumber details, appending None if not add_tablenumber
+    tablenumber_prepared = _set_tablenumber(
+        sources=evaluate_futures(column_names_and_types_gathered),
+        add_tablenumber=add_tablenumber,
+    ).result()
+
     results = {
         source_group_name: [
             dict(
@@ -1172,12 +1265,11 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 "table": [
                     # perform column renaming and create potential return result
                     _prepend_column_name(
-                        # perform chunked data export to parquet using
-                        table_path=
+                        # perform chunked data export to parquet using pagesets
+                        table_path=_source_pageset_to_parquet(
                             source_group_name=source_group_name,
                             source=source,
-
-                            offset=offset,
+                            pageset=pageset,
                             dest_path=expanded_dest_path,
                             sort_output=sort_output,
                         ),
@@ -1186,14 +1278,14 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                         metadata=metadata,
                         compartments=compartments,
                     )
-                    for
+                    for pageset in source["pagesets"]
                 ]
             },
         )
         for source in source_group_vals
     ]
     for source_group_name, source_group_vals in evaluate_futures(
-
+        tablenumber_prepared
     ).items()
 }
 
@@ -1227,6 +1319,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 source_group=source_group_vals[0]["sources"],
                 dest_path=expanded_dest_path,
                 common_schema=source_group_vals[0]["common_schema"],
+                sort_output=sort_output,
             )
             for source_group_name, source_group_vals in evaluate_futures(
                 common_schema_determined
@@ -1240,39 +1333,50 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     evaluated_results = evaluate_futures(results)
 
     prepared_joins_sql = _prepare_join_sql(
-        sources=evaluated_results, joins=joins
+        sources=evaluated_results, joins=joins
     ).result()
 
+    page_key_join = [
+        value for key, value in page_keys.items() if key.lower() == "join"
+    ][0]
+
     # map joined results based on the join groups gathered above
     # note: after mapping we end up with a list of strings (task returns str)
     join_sources_result = [
-
+        _join_source_pageset(
             # gather the result of concatted sources prior to
             # join group merging as each mapped task run will need
             # full concat results
             dest_path=expanded_dest_path,
             joins=prepared_joins_sql,
-
-
+            page_key=page_key_join,
+            pageset=pageset,
+            sort_output=sort_output,
             drop_null=drop_null,
         )
         # create join group for querying the concatenated
         # data in order to perform memory-safe joining
         # per user chunk size specification.
-        for
+        for pageset in _get_table_keyset_pagination_sets(
             sql_stmt=prepared_joins_sql,
             chunk_size=chunk_size,
+            page_key=page_key_join,
         ).result()
     ]
 
-
-
-
-
-
-
-
-
+    if concat:
+        # concat our join chunks together as one cohesive dataset
+        # return results in common format which includes metadata
+        # for lineage and debugging
+        results = _concat_join_sources(
+            dest_path=expanded_dest_path,
+            join_sources=[join.result() for join in join_sources_result],
+            sources=evaluated_results,
+            sort_output=sort_output,
+        )
+    else:
+        # else we leave the joined chunks as-is and return them
+        return evaluate_futures(join_sources_result)
 
     # wrap the final result as a future and return
     return evaluate_futures(results)
@@ -1293,11 +1397,13 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
+    page_keys: Optional[Dict[str, str]] = None,
     sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
     **kwargs,
-) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
     """
     Convert file-based data from various sources to Pycytominer-compatible standards.
 
@@ -1341,6 +1447,17 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
             https://arrow.apache.org/docs/python/api/datatypes.html
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber which helps differentiate
+            various repeated values (such as ObjectNumber) within source data.
+            Useful for processing multiple SQLite or CSV data sources together
+            to retain distinction from each dataset.
+        page_keys: str:
+            The table and column names to be used for key pagination.
+            Uses the form: {"table_name":"column_name"}.
+            Expects columns to include numeric data (ints or floats).
+            Interacts with the `chunk_size` parameter to form
+            pages of `chunk_size`.
         sort_output: bool (Default value = True)
             Specifies whether to sort cytotable output or not.
         drop_null: bool (Default value = False)
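A hypothetical invocation showing the parameters introduced in this release (paths and the chosen preset are invented; parameter names are taken from the diff above; this is a sketch, not official usage documentation):

import cytotable

result = cytotable.convert(
    source_path="./plate_a.sqlite",
    dest_path="./plate_a.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    # keyset pagination columns per table, plus the special "join" key
    page_keys={
        "image": "ImageNumber",
        "cells": "Cells_Number_Object_Number",
        "nuclei": "Nuclei_Number_Object_Number",
        "cytoplasm": "Cytoplasm_Number_Object_Number",
        "join": "Cytoplasm_Number_Object_Number",
    },
    # add a checksum-based TableNumber to keep multiple exports distinct
    add_tablenumber=True,
)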
@@ -1440,6 +1557,24 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         if chunk_size is None
         else chunk_size
     )
+    page_keys = (
+        cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+        if page_keys is None
+        else page_keys
+    )
+
+    # Raise an exception for scenarios where one configures CytoTable to join
+    # but does not provide a pagination key for the joins.
+    if join and (page_keys is None or "join" not in page_keys.keys()):
+        raise CytoTableException(
+            (
+                "When using join=True one must pass a 'join' pagination key "
+                "in the page_keys parameter. The 'join' pagination key is a column "
+                "name found within the joined results based on the SQL provided from "
+                "the joins parameter. This special key is required as not all columns "
+                "from the source tables might not be included."
+            )
+        )
 
     # send sources to be written to parquet if selected
     if dest_datatype == "parquet":
@@ -1457,7 +1592,9 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             infer_common_schema=infer_common_schema,
             drop_null=drop_null,
             data_type_cast_map=data_type_cast_map,
+            add_tablenumber=add_tablenumber,
             sort_output=sort_output,
+            page_keys=cast(dict, page_keys),
             **kwargs,
         )
 
cytotable/presets.py
CHANGED
@@ -22,6 +22,16 @@ config = {
                 "Parent_Cells",
                 "Parent_Nuclei",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "image": "ImageNumber",
+                "cells": "ObjectNumber",
+                "nuclei": "ObjectNumber",
+                "cytoplasm": "ObjectNumber",
+                "join": "Cytoplasm_Number_Object_Number",
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
@@ -61,6 +71,16 @@ config = {
                 "Parent_Cells",
                 "Parent_Nuclei",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "image": "ImageNumber",
+                "cells": "Cells_Number_Object_Number",
+                "nuclei": "Nuclei_Number_Object_Number",
+                "cytoplasm": "Cytoplasm_Number_Object_Number",
+                "join": "Cytoplasm_Number_Object_Number",
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
@@ -104,6 +124,16 @@ config = {
                 "Parent_Cells",
                 "Parent_Nuclei",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "image": "ImageNumber",
+                "cells": "ObjectNumber",
+                "nuclei": "ObjectNumber",
+                "cytoplasm": "ObjectNumber",
+                "join": "Cytoplasm_Number_Object_Number",
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
@@ -155,6 +185,16 @@ config = {
                 "Cells_Number_Object_Number",
                 "Nuclei_Number_Object_Number",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "image": "ImageNumber",
+                "cells": "Cells_Number_Object_Number",
+                "nuclei": "Nuclei_Number_Object_Number",
+                "cytoplasm": "Cytoplasm_Number_Object_Number",
+                "join": "Cytoplasm_Number_Object_Number",
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
@@ -203,6 +243,16 @@ config = {
                 "Cells_ObjectNumber",
                 "Nuclei_ObjectNumber",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "image": "ImageNumber",
+                "cells": "ObjectNumber",
+                "nuclei": "ObjectNumber",
+                "cytoplasm": "ObjectNumber",
+                "join": "Cytoplasm_Number_Object_Number",
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
@@ -248,6 +298,12 @@ config = {
                 "Z",
                 "T",
             ),
+            # pagination keys for use with this data
+            # of the rough format "table" -> "column".
+            # note: page keys are expected to be numeric (int, float)
+            "CONFIG_PAGE_KEYS": {
+                "test": '"OBJECT ID"',
+            },
             # chunk size to use for join operations to help with possible performance issues
             # note: this number is an estimate and is may need changes contingent on data
             # and system used by this library.
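The per-preset pagination keys can be inspected directly from the presets module; the preset name shown below is the package default referenced by convert() in the diff above:

from cytotable.presets import config

print(config["cellprofiler_csv"]["CONFIG_PAGE_KEYS"])
# e.g. {'image': 'ImageNumber', 'cells': 'ObjectNumber', ..., 'join': 'Cytoplasm_Number_Object_Number'}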
cytotable/utils.py
CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
 import logging
 import os
 import pathlib
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import duckdb
 import parsl
@@ -166,6 +166,12 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
             https://duckdb.org/docs/sql/configuration#configuration-reference
             */
             PRAGMA preserve_insertion_order=FALSE;
+
+            /*
+            Disable progress bar from displaying (defaults to TRUE)
+            See earlier documentation references above for more information.
+            */
+            SET enable_progress_bar=FALSE;
             """,
         )
 
@@ -173,10 +179,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
 def _sqlite_mixed_type_query_to_parquet(
     source_path: str,
     table_name: str,
-
-
+    page_key: str,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     sort_output: bool,
-
+    tablenumber: Optional[int] = None,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -188,14 +194,17 @@ def _sqlite_mixed_type_query_to_parquet(
             A str which is a path to a SQLite database file.
         table_name: str:
             The name of the table being queried.
-
-
-
-            The
+        page_key: str:
+            The column name to be used to identify pagination chunks.
+        pageset: Tuple[int, int]:
+            The range for values used for paginating data from source.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
         add_cytotable_meta: bool, default=False:
             Whether to add CytoTable metadata fields or not
+        tablenumber: Optional[int], default=None:
+            An optional table number to append to the results.
+            Defaults to None.
 
     Returns:
         pyarrow.Table:
@@ -205,10 +214,7 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
-    )
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
 
     # open sqlite3 connection
@@ -254,9 +260,19 @@ def _sqlite_mixed_type_query_to_parquet(
             # return the translated type for use in SQLite
             return translated_type[0]
 
+    # build tablenumber segment addition (if necessary)
+    tablenumber_sql = (
+        # to become tablenumber in sql select later with integer
+        f"CAST({tablenumber} AS INTEGER) as TableNumber, "
+        if tablenumber is not None
+        # if we don't have a tablenumber value, don't introduce the column
+        else ""
+    )
+
     # create cases for mixed-type handling in each column discovered above
-    query_parts =
-
+    query_parts = tablenumber_sql + ", ".join(
+        [
+            f"""
             CASE
                 /* when the storage class type doesn't match the column, return nulltype */
                 WHEN typeof({col['column_name']}) !=
@@ -265,45 +281,18 @@ def _sqlite_mixed_type_query_to_parquet(
                 ELSE {col['column_name']}
             END AS {col['column_name']}
             """
-
-        ]
-
-    if add_cytotable_meta:
-        query_parts += [
-            (
-                f"CAST( '{f'{source_path}_table_{table_name}'}' "
-                f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
-                "AS cytotable_meta_source_path"
-            ),
-            (
-                f"CAST( {offset} "
-                f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
-                "AS cytotable_meta_offset"
-            ),
-            (
-                f"CAST( (ROW_NUMBER() OVER ()) AS "
-                f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
-                "AS cytotable_meta_rownum"
-            ),
+            for col in column_info
         ]
+    )
 
     # perform the select using the cases built above and using chunksize + offset
-    sql_stmt =
-        f"""
-        SELECT
-            {', '.join(query_parts)}
-        FROM {table_name}
-        ORDER BY {', '.join([col['column_name'] for col in column_info])}
-        LIMIT {chunk_size} OFFSET {offset};
-        """
-        if sort_output
-        else f"""
+    sql_stmt = f"""
         SELECT
-            {
+            {query_parts}
         FROM {table_name}
-
+        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+        {"ORDER BY " + page_key if sort_output else ""};
         """
-    )
 
     # execute the sql stmt
     cursor.execute(sql_stmt)
@@ -508,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
     )
 
 
+def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
+    """
+    Build and return a checksum for use as a unique identifier across datasets
+    referenced from cytominer-database:
+    https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129
+
+    Args:
+        pathname: str:
+            A path to a file with which to generate the checksum on.
+        buffer_size: int:
+            Buffer size to use for reading data.
+
+    Returns:
+        int
+            an integer representing the checksum of the pathname file.
+    """
+
+    import os
+    import zlib
+
+    # check whether the buffer size is larger than the file_size
+    file_size = os.path.getsize(pathname)
+    if file_size < buffer_size:
+        buffer_size = file_size
+
+    # open file
+    with open(str(pathname), "rb") as stream:
+        # begin result formation
+        result = zlib.crc32(bytes(0))
+        while True:
+            # read data from stream using buffer size
+            buffer = stream.read(buffer_size)
+            if not buffer:
+                # if we have no more data to use, break while loop
+                break
+            # use buffer read data to form checksum
+            result = zlib.crc32(buffer, result)
+
+    return result & 0xFFFFFFFF
+
+
 def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
     """
     Helper function to unwrap futures from values or return values
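A runnable sketch of the buffered CRC32 approach used by _gather_tablenumber_checksum (the temporary file and its contents are invented for illustration):

import tempfile
import zlib

def crc32_of_file(path: str, buffer_size: int = 1048576) -> int:
    # stream the file in buffer_size chunks, folding each into the running CRC32
    result = zlib.crc32(b"")
    with open(path, "rb") as stream:
        while True:
            buffer = stream.read(buffer_size)
            if not buffer:
                break
            result = zlib.crc32(buffer, result)
    # mask to an unsigned 32-bit value
    return result & 0xFFFFFFFF

with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
    handle.write(b"ImageNumber,Count_Cells\n1,100\n")
    path = handle.name

print(crc32_of_file(path))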
@@ -563,14 +593,16 @@ def _unwrap_source(
     return _unwrap_value(source)
 
 
-def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+def evaluate_futures(
+    sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
+) -> Any:
     """
     Evaluates any Parsl futures for use within other tasks.
     This enables a pattern of Parsl app usage as "tasks" and delayed
     future result evaluation for concurrency.
 
     Args:
-        sources: Union[Dict[str, List[Dict[str, Any]]], str]
+        sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
             Sources are an internal data structure used by CytoTable for
             processing and organizing data results. They may include futures
             which require asynchronous processing through Parsl, so we
@@ -600,3 +632,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any
         if isinstance(sources, dict)
         else _unwrap_value(sources)
     )
+
+
+def _generate_pagesets(
+    keys: List[Union[int, float]], chunk_size: int
+) -> List[Tuple[Union[int, float], Union[int, float]]]:
+    """
+    Generate a pageset (keyset pagination) from a list of keys.
+
+    Parameters:
+        keys List[Union[int, float]]:
+            List of keys to paginate.
+        chunk_size int:
+            Size of each chunk/page.
+
+    Returns:
+        List[Tuple[Union[int, float], Union[int, float]]]:
+            List of (start_key, end_key) tuples representing each page.
+    """
+
+    # Initialize an empty list to store the chunks/pages
+    chunks = []
+
+    # Start index for iteration through the keys
+    i = 0
+
+    while i < len(keys):
+        # Get the start key for the current chunk
+        start_key = keys[i]
+
+        # Calculate the end index for the current chunk
+        end_index = min(i + chunk_size, len(keys)) - 1
+
+        # Get the end key for the current chunk
+        end_key = keys[end_index]
+
+        # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+        while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+            end_index += 1
+
+        # Append the current chunk (start_key, end_key) to the list of chunks
+        chunks.append((start_key, end_key))
+
+        # Update the index to start from the next chunk
+        i = end_index + 1
+
+    # Return the list of chunks/pages
+    return chunks
+
+
+def _natural_sort(list_to_sort):
+    """
+    Sorts the given iterable using natural sort adapted from approach
+    provided by the following link:
+    https://stackoverflow.com/a/4836734
+
+    Args:
+        list_to_sort: List:
+            The list to sort.
+
+    Returns:
+        List: The sorted list.
+    """
+    import re
+
+    return sorted(
+        list_to_sort,
+        # use a custom key to sort the list
+        key=lambda key: [
+            # use integer of c if it's a digit, otherwise str
+            int(c) if c.isdigit() else c
+            # Split the key into parts, separating numbers from alphabetic characters
+            for c in re.split("([0-9]+)", str(key))
+        ],
+    )
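Behavior sketch for the pageset generation above (keys invented): duplicate keys at a chunk boundary are pulled into the same page, so a BETWEEN filter never splits rows that share a key.

keys = [1, 1, 2, 2, 2, 3, 4, 5]
chunk_size = 3

chunks = []
i = 0
while i < len(keys):
    start_key = keys[i]
    end_index = min(i + chunk_size, len(keys)) - 1
    end_key = keys[end_index]
    # extend the page while the next key repeats the end key
    while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
        end_index += 1
    chunks.append((start_key, end_key))
    i = end_index + 1

print(chunks)  # [(1, 2), (3, 5)]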
{cytotable-0.0.9.dist-info → cytotable-0.0.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.9
+Version: 0.0.11
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
cytotable-0.0.11.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=KSVr7xOOrpmQ_ybzcsZkblTAzPIYEq7_bm-Cjc874FM,316
+cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+cytotable/convert.py,sha256=5VHnw0eGdfXTbSfeEoPAPVa-dtobM6VHkIJwscLe68M,60651
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=tywZg1Gr78ebLlOp8R7trkiV7jsQ4iiZt4B6qG6SrxY,22578
+cytotable-0.0.11.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.11.dist-info/METADATA,sha256=sOvdWxld2Ryyjd5bluZt8Z78uElg1CyWG0UIRJn0F8E,3424
+cytotable-0.0.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+cytotable-0.0.11.dist-info/RECORD,,
cytotable-0.0.9.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
-cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
-cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
-cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
-cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
-cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
-cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.9.dist-info/RECORD,,
{cytotable-0.0.9.dist-info → cytotable-0.0.11.dist-info}/LICENSE
File without changes