CytoTable 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/convert.py +145 -14
- cytotable/utils.py +71 -7
- {cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/METADATA +1 -1
- cytotable-0.0.11.dist-info/RECORD +11 -0
- {cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/WHEEL +1 -1
- cytotable-0.0.10.dist-info/RECORD +0 -11
- {cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/LICENSE +0 -0
cytotable/__init__.py
CHANGED
cytotable/convert.py
CHANGED
@@ -173,6 +173,106 @@ def _prep_cast_column_data_types(
|
|
173
173
|
return columns
|
174
174
|
|
175
175
|
|
176
|
+
@python_app
def _set_tablenumber(
    sources: Dict[str, List[Dict[str, Any]]],
    add_tablenumber: Optional[bool] = None,
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Gathers a "TableNumber" from the image table (if CSV) or
    SQLite file (if SQLite source) which is a unique identifier
    intended to help differentiate between imagenumbers
    to create distinct records for single-cell profiles
    referenced across multiple source data exports.
    For example, ImageNumber column values from CellProfiler
    will repeat across exports, meaning we may lose distinction
    when combining multiple export files together through CytoTable.

    Note:
    - If using CSV data sources, the image.csv table is used for checksum.
    - If using SQLite data sources, the entire SQLite database is used for checksum.

    Args:
        sources: Dict[str, List[Dict[str, Any]]]
            Contains metadata about data tables and related contents.
            Every source dictionary is read through its "source_path" key.
        add_tablenumber: Optional[bool]
            Whether to add a calculated tablenumber.
            - None: add tablenumbers only when more than one dataset
              (image table grouping) is detected.
            - True: always attempt to add tablenumbers.
            - False: add None as the tablenumber.

    Returns:
        Dict[str, List[Dict[str, Any]]]
            New sources structure where every source dictionary has an
            added "tablenumber" key. When checksums are calculated, sources
            which cannot be matched to a detected image table dataset
            are left out of the result.
    """

    # imports are kept function-local, matching the convention used for
    # Parsl apps elsewhere in this module.
    from cloudpathlib import AnyPath

    from cytotable.utils import _gather_tablenumber_checksum

    def _dataset_key(source: Dict[str, Any]) -> str:
        """
        Build the identifier of the dataset a source belongs to.

        SQLite sources are identified by the database file itself whereas
        all other sources are identified by their common parent directory.
        """
        source_path = source["source_path"]
        # note: path suffixes include the leading dot (".sqlite"), so
        # comparing against "sqlite" would never match a SQLite file.
        if str(source_path.suffix).lower() == ".sqlite":
            return str(source_path)
        return str(source_path.parent)

    # map each dataset identifier to the file used for its checksum.
    # only image table references are used as the basis of these calculations.
    image_table_groups = {
        _dataset_key(source): source["source_path"]
        for source_group_name, source_group_vals in sources.items()
        if any(
            value in str(AnyPath(source_group_name).stem).lower()
            for value in ["image", "per_image"]
        )
        for source in source_group_vals
    }

    # determine if we need to add tablenumber data
    if (
        # case for a single (or no) image table: nothing to differentiate
        add_tablenumber is None
        and len(image_table_groups) <= 1
    ) or (
        # case for explicitly set no tablenumbers
        add_tablenumber is False
    ):
        return {
            source_group_name: [
                dict(source, **{"tablenumber": None})
                for source in source_group_vals
            ]
            for source_group_name, source_group_vals in sources.items()
        }

    # calculate one checksum per detected dataset
    tablenumber_table = {
        group: _gather_tablenumber_checksum(path)
        for group, path in image_table_groups.items()
    }

    # return a modified sources data structure with the tablenumber added.
    # the same dataset identifier is used for the lookup as was used to
    # build the checksum table so that both sides always agree.
    return {
        source_group_name: [
            dict(source, **{"tablenumber": tablenumber_table[_dataset_key(source)]})
            for source in source_group_vals
            if _dataset_key(source) in tablenumber_table
        ]
        for source_group_name, source_group_vals in sources.items()
    }
|
274
|
+
|
275
|
+
|
176
276
|
@python_app
|
177
277
|
def _get_table_keyset_pagination_sets(
|
178
278
|
chunk_size: int,
|
@@ -310,6 +410,18 @@ def _source_pageset_to_parquet(
|
|
310
410
|
)
|
311
411
|
pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
|
312
412
|
|
413
|
+
# build tablenumber segment addition (if necessary)
|
414
|
+
tablenumber_sql = (
|
415
|
+
# to become tablenumber in sql select later with bigint (8-byte integer)
|
416
|
+
# we cast here to bigint to avoid concat or join conflicts later due to
|
417
|
+
# misaligned automatic data typing.
|
418
|
+
f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
|
419
|
+
if source["tablenumber"] is not None
|
420
|
+
# don't introduce the column if we aren't supposed to add tablenumber
|
421
|
+
# as per parameter.
|
422
|
+
else ""
|
423
|
+
)
|
424
|
+
|
313
425
|
# add source table columns
|
314
426
|
casted_source_cols = [
|
315
427
|
# here we cast the column to the specified type ensure the colname remains the same
|
@@ -317,8 +429,8 @@ def _source_pageset_to_parquet(
|
|
317
429
|
for column in source["columns"]
|
318
430
|
]
|
319
431
|
|
320
|
-
# create selection statement from lists above
|
321
|
-
select_columns = ",".join(
|
432
|
+
# create selection statement from tablenumber_sql + lists above
|
433
|
+
select_columns = tablenumber_sql + ",".join(
|
322
434
|
# if we should sort the output, add the metadata_cols
|
323
435
|
casted_source_cols
|
324
436
|
if sort_output
|
@@ -376,6 +488,7 @@ def _source_pageset_to_parquet(
|
|
376
488
|
page_key=source["page_key"],
|
377
489
|
pageset=pageset,
|
378
490
|
sort_output=sort_output,
|
491
|
+
tablenumber=source["tablenumber"],
|
379
492
|
),
|
380
493
|
where=result_filepath,
|
381
494
|
)
|
@@ -994,8 +1107,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
994
1107
|
sort_output: bool,
|
995
1108
|
page_keys: Dict[str, str],
|
996
1109
|
data_type_cast_map: Optional[Dict[str, str]] = None,
|
1110
|
+
add_tablenumber: Optional[bool] = None,
|
997
1111
|
**kwargs,
|
998
|
-
) -> Union[Dict[str, List[Dict[str, Any]]], str]:
|
1112
|
+
) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
|
999
1113
|
"""
|
1000
1114
|
Export data to parquet.
|
1001
1115
|
|
@@ -1137,6 +1251,12 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
1137
1251
|
for source_group_name, source_group_vals in invalid_files_dropped.items()
|
1138
1252
|
}
|
1139
1253
|
|
1254
|
+
# add tablenumber details, appending None if not add_tablenumber
|
1255
|
+
tablenumber_prepared = _set_tablenumber(
|
1256
|
+
sources=evaluate_futures(column_names_and_types_gathered),
|
1257
|
+
add_tablenumber=add_tablenumber,
|
1258
|
+
).result()
|
1259
|
+
|
1140
1260
|
results = {
|
1141
1261
|
source_group_name: [
|
1142
1262
|
dict(
|
@@ -1165,7 +1285,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
1165
1285
|
for source in source_group_vals
|
1166
1286
|
]
|
1167
1287
|
for source_group_name, source_group_vals in evaluate_futures(
|
1168
|
-
|
1288
|
+
tablenumber_prepared
|
1169
1289
|
).items()
|
1170
1290
|
}
|
1171
1291
|
|
@@ -1244,15 +1364,19 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
1244
1364
|
).result()
|
1245
1365
|
]
|
1246
1366
|
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1367
|
+
if concat:
|
1368
|
+
# concat our join chunks together as one cohesive dataset
|
1369
|
+
# return results in common format which includes metadata
|
1370
|
+
# for lineage and debugging
|
1371
|
+
results = _concat_join_sources(
|
1372
|
+
dest_path=expanded_dest_path,
|
1373
|
+
join_sources=[join.result() for join in join_sources_result],
|
1374
|
+
sources=evaluated_results,
|
1375
|
+
sort_output=sort_output,
|
1376
|
+
)
|
1377
|
+
else:
|
1378
|
+
# else we leave the joined chunks as-is and return them
|
1379
|
+
return evaluate_futures(join_sources_result)
|
1256
1380
|
|
1257
1381
|
# wrap the final result as a future and return
|
1258
1382
|
return evaluate_futures(results)
|
@@ -1273,12 +1397,13 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
|
|
1273
1397
|
infer_common_schema: bool = True,
|
1274
1398
|
drop_null: bool = False,
|
1275
1399
|
data_type_cast_map: Optional[Dict[str, str]] = None,
|
1400
|
+
add_tablenumber: Optional[bool] = None,
|
1276
1401
|
page_keys: Optional[Dict[str, str]] = None,
|
1277
1402
|
sort_output: bool = True,
|
1278
1403
|
preset: Optional[str] = "cellprofiler_csv",
|
1279
1404
|
parsl_config: Optional[parsl.Config] = None,
|
1280
1405
|
**kwargs,
|
1281
|
-
) -> Union[Dict[str, List[Dict[str, Any]]], str]:
|
1406
|
+
) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
|
1282
1407
|
"""
|
1283
1408
|
Convert file-based data from various sources to Pycytominer-compatible standards.
|
1284
1409
|
|
@@ -1322,6 +1447,11 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
|
|
1322
1447
|
A dictionary mapping data type groups to specific types.
|
1323
1448
|
Roughly includes Arrow data types language from:
|
1324
1449
|
https://arrow.apache.org/docs/python/api/datatypes.html
|
1450
|
+
add_tablenumber: Optional[bool]
|
1451
|
+
Whether to add a calculated tablenumber which helps differentiate
|
1452
|
+
various repeated values (such as ObjectNumber) within source data.
|
1453
|
+
Useful for processing multiple SQLite or CSV data sources together
|
1454
|
+
to retain distinction from each dataset.
|
1325
1455
|
page_keys: str:
|
1326
1456
|
The table and column names to be used for key pagination.
|
1327
1457
|
Uses the form: {"table_name":"column_name"}.
|
@@ -1462,6 +1592,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
|
|
1462
1592
|
infer_common_schema=infer_common_schema,
|
1463
1593
|
drop_null=drop_null,
|
1464
1594
|
data_type_cast_map=data_type_cast_map,
|
1595
|
+
add_tablenumber=add_tablenumber,
|
1465
1596
|
sort_output=sort_output,
|
1466
1597
|
page_keys=cast(dict, page_keys),
|
1467
1598
|
**kwargs,
|
cytotable/utils.py
CHANGED
@@ -166,6 +166,12 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
|
|
166
166
|
https://duckdb.org/docs/sql/configuration#configuration-reference
|
167
167
|
*/
|
168
168
|
PRAGMA preserve_insertion_order=FALSE;
|
169
|
+
|
170
|
+
/*
|
171
|
+
Disable progress bar from displaying (defaults to TRUE)
|
172
|
+
See earlier documentation references above for more information.
|
173
|
+
*/
|
174
|
+
SET enable_progress_bar=FALSE;
|
169
175
|
""",
|
170
176
|
)
|
171
177
|
|
@@ -176,6 +182,7 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
176
182
|
page_key: str,
|
177
183
|
pageset: Tuple[Union[int, float], Union[int, float]],
|
178
184
|
sort_output: bool,
|
185
|
+
tablenumber: Optional[int] = None,
|
179
186
|
) -> str:
|
180
187
|
"""
|
181
188
|
Performs SQLite table data extraction where one or many
|
@@ -195,6 +202,9 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
195
202
|
Specifies whether to sort cytotable output or not.
|
196
203
|
add_cytotable_meta: bool, default=False:
|
197
204
|
Whether to add CytoTable metadata fields or not
|
205
|
+
tablenumber: Optional[int], default=None:
|
206
|
+
An optional table number to append to the results.
|
207
|
+
Defaults to None.
|
198
208
|
|
199
209
|
Returns:
|
200
210
|
pyarrow.Table:
|
@@ -250,9 +260,19 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
250
260
|
# return the translated type for use in SQLite
|
251
261
|
return translated_type[0]
|
252
262
|
|
263
|
+
# build tablenumber segment addition (if necessary)
|
264
|
+
tablenumber_sql = (
|
265
|
+
# to become tablenumber in sql select later with integer
|
266
|
+
f"CAST({tablenumber} AS INTEGER) as TableNumber, "
|
267
|
+
if tablenumber is not None
|
268
|
+
# if we don't have a tablenumber value, don't introduce the column
|
269
|
+
else ""
|
270
|
+
)
|
271
|
+
|
253
272
|
# create cases for mixed-type handling in each column discovered above
|
254
|
-
query_parts =
|
255
|
-
|
273
|
+
query_parts = tablenumber_sql + ", ".join(
|
274
|
+
[
|
275
|
+
f"""
|
256
276
|
CASE
|
257
277
|
/* when the storage class type doesn't match the column, return nulltype */
|
258
278
|
WHEN typeof({col['column_name']}) !=
|
@@ -261,13 +281,14 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
261
281
|
ELSE {col['column_name']}
|
262
282
|
END AS {col['column_name']}
|
263
283
|
"""
|
264
|
-
|
265
|
-
|
284
|
+
for col in column_info
|
285
|
+
]
|
286
|
+
)
|
266
287
|
|
267
288
|
# perform the select using the cases built above and using chunksize + offset
|
268
289
|
sql_stmt = f"""
|
269
290
|
SELECT
|
270
|
-
{
|
291
|
+
{query_parts}
|
271
292
|
FROM {table_name}
|
272
293
|
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
|
273
294
|
{"ORDER BY " + page_key if sort_output else ""};
|
@@ -476,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
|
|
476
497
|
)
|
477
498
|
|
478
499
|
|
500
|
+
def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
|
501
|
+
"""
|
502
|
+
Build and return a checksum for use as a unique identifier across datasets
|
503
|
+
referenced from cytominer-database:
|
504
|
+
https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129
|
505
|
+
|
506
|
+
Args:
|
507
|
+
pathname: str:
|
508
|
+
A path to a file with which to generate the checksum on.
|
509
|
+
buffer_size: int:
|
510
|
+
Buffer size to use for reading data.
|
511
|
+
|
512
|
+
Returns:
|
513
|
+
int
|
514
|
+
an integer representing the checksum of the pathname file.
|
515
|
+
"""
|
516
|
+
|
517
|
+
import os
|
518
|
+
import zlib
|
519
|
+
|
520
|
+
# check whether the buffer size is larger than the file_size
|
521
|
+
file_size = os.path.getsize(pathname)
|
522
|
+
if file_size < buffer_size:
|
523
|
+
buffer_size = file_size
|
524
|
+
|
525
|
+
# open file
|
526
|
+
with open(str(pathname), "rb") as stream:
|
527
|
+
# begin result formation
|
528
|
+
result = zlib.crc32(bytes(0))
|
529
|
+
while True:
|
530
|
+
# read data from stream using buffer size
|
531
|
+
buffer = stream.read(buffer_size)
|
532
|
+
if not buffer:
|
533
|
+
# if we have no more data to use, break while loop
|
534
|
+
break
|
535
|
+
# use buffer read data to form checksum
|
536
|
+
result = zlib.crc32(buffer, result)
|
537
|
+
|
538
|
+
return result & 0xFFFFFFFF
|
539
|
+
|
540
|
+
|
479
541
|
def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
|
480
542
|
"""
|
481
543
|
Helper function to unwrap futures from values or return values
|
@@ -531,14 +593,16 @@ def _unwrap_source(
|
|
531
593
|
return _unwrap_value(source)
|
532
594
|
|
533
595
|
|
534
|
-
def evaluate_futures(
|
596
|
+
def evaluate_futures(
|
597
|
+
sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
|
598
|
+
) -> Any:
|
535
599
|
"""
|
536
600
|
Evaluates any Parsl futures for use within other tasks.
|
537
601
|
This enables a pattern of Parsl app usage as "tasks" and delayed
|
538
602
|
future result evaluation for concurrency.
|
539
603
|
|
540
604
|
Args:
|
541
|
-
sources: Union[Dict[str, List[Dict[str, Any]]], str]
|
605
|
+
sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
|
542
606
|
Sources are an internal data structure used by CytoTable for
|
543
607
|
processing and organizing data results. They may include futures
|
544
608
|
which require asynchronous processing through Parsl, so we
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: CytoTable
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.11
|
4
4
|
Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
|
5
5
|
Home-page: https://github.com/cytomining/CytoTable
|
6
6
|
License: BSD-3-Clause License
|
@@ -0,0 +1,11 @@
|
|
1
|
+
cytotable/__init__.py,sha256=KSVr7xOOrpmQ_ybzcsZkblTAzPIYEq7_bm-Cjc874FM,316
|
2
|
+
cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
|
3
|
+
cytotable/convert.py,sha256=5VHnw0eGdfXTbSfeEoPAPVa-dtobM6VHkIJwscLe68M,60651
|
4
|
+
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
+
cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
|
6
|
+
cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
|
7
|
+
cytotable/utils.py,sha256=tywZg1Gr78ebLlOp8R7trkiV7jsQ4iiZt4B6qG6SrxY,22578
|
8
|
+
cytotable-0.0.11.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
+
cytotable-0.0.11.dist-info/METADATA,sha256=sOvdWxld2Ryyjd5bluZt8Z78uElg1CyWG0UIRJn0F8E,3424
|
10
|
+
cytotable-0.0.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
11
|
+
cytotable-0.0.11.dist-info/RECORD,,
|
@@ -1,11 +0,0 @@
|
|
1
|
-
cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
|
2
|
-
cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
|
3
|
-
cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
|
4
|
-
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
-
cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
|
6
|
-
cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
|
7
|
-
cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
|
8
|
-
cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
-
cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
|
10
|
-
cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
11
|
-
cytotable-0.0.10.dist-info/RECORD,,
|
File without changes
|