CytoTable 0.0.13__tar.gz → 0.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cytotable-0.0.13 → cytotable-0.0.15}/PKG-INFO +7 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/__init__.py +1 -1
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/convert.py +110 -52
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/presets.py +31 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/sources.py +29 -14
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/utils.py +178 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/pyproject.toml +2 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/readme.md +5 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/LICENSE +0 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/constants.py +0 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/exceptions.py +0 -0
{cytotable-0.0.13 → cytotable-0.0.15}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: CytoTable
-Version: 0.0.13
+Version: 0.0.15
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 License: BSD-3-Clause License
 Keywords: python,cellprofiler,single-cell-analysis,way-lab
@@ -13,7 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: cloudpathlib[all,s3] (>=0.18
+Requires-Dist: cloudpathlib[all,s3] (>=0.18,<0.22)
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
 Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -29,6 +29,11 @@ Description-Content-Type: text/markdown
 
 # CytoTable
 
+
+[](https://github.com/cytomining/cytotable/actions/workflows/test.yml?query=branch%3Amain)
+[](https://python-poetry.org/)
+[](https://doi.org/10.5281/zenodo.14888111)
+
 
 _Diagram showing data flow relative to this project._
 
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/convert.py

@@ -7,6 +7,7 @@ import logging
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import parsl
+import pyarrow as pa
 from parsl.app.app import python_app
 
 from cytotable.exceptions import CytoTableException
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
 @python_app
 def _get_table_columns_and_types(
     source: Dict[str, Any], sort_output: bool
-) -> List[Dict[str, str]]:
+) -> List[Optional[Dict[str, str]]]:
     """
     Gather column data from table through duckdb.
 
@@ -38,7 +39,7 @@ def _get_table_columns_and_types(
         Specifies whether to sort cytotable output or not.
 
     Returns:
-        List[Dict[str, str]]
+        List[Optional[Dict[str, str]]]
            list of dictionaries which each include column level information
     """
 
@@ -49,6 +50,12 @@ def _get_table_columns_and_types(
     source_path = source["source_path"]
     source_type = str(source_path.suffix).lower()
 
+    # If we have .npz files, return a list with None
+    # because we're querying a non-tabular data source.
+    # These will be handled later by _extract_npz_to_parquet.
+    if source_type == ".npz":
+        return [None]
+
     # prepare the data source in the form of a duckdb query
     select_source = (
         f"read_csv_auto('{source_path}')"
@@ -279,7 +286,9 @@ def _get_table_keyset_pagination_sets(
     page_key: str,
     source: Optional[Dict[str, Any]] = None,
     sql_stmt: Optional[str] = None,
-) -> Union[
+) -> Union[
+    List[Optional[Tuple[Union[int, float], Union[int, float]]]], List[None], None
+]:
     """
     Get table data chunk keys for later use in capturing segments
     of values. This work also provides a chance to catch problematic
@@ -300,7 +309,7 @@ def _get_table_keyset_pagination_sets(
         data source.
 
     Returns:
-
+        Union[List[Optional[Tuple[Union[int, float], Union[int, float]]]], None]
            List of keys to use for reading the data later on.
     """
 
@@ -324,8 +333,15 @@ def _get_table_keyset_pagination_sets(
     with _duckdb_reader() as ddb_reader:
         if source_type == ".csv":
             sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
-
+        elif source_type == ".sqlite":
             sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+        elif source_type == ".npz":
+            # If we have npz files there's no need to paginate
+            # so we return None. None within a list is used as
+            # a special "passthrough" case within the pipeline
+            # so we may specially handle NPZ files later on via
+            # _source_pageset_to_parquet and _extract_npz_to_parquet.
+            return [None]
 
         page_keys = [
             results[0] for results in ddb_reader.execute(sql_query).fetchall()
@@ -360,14 +376,16 @@ def _get_table_keyset_pagination_sets(
         page_keys = ddb_reader.execute(sql_query).fetchall()
         page_keys = [key[0] for key in page_keys]
 
-
+    # The type: mention below is used to ignore a mypy linting error
+    # wherein it considers _generate_pagesets to be invalid.
+    return _generate_pagesets(page_keys, chunk_size)  # type: ignore[return-value]
 
 
 @python_app
 def _source_pageset_to_parquet(
     source_group_name: str,
     source: Dict[str, Any],
-    pageset: Tuple[Union[int, float], Union[int, float]],
+    pageset: Optional[Tuple[Union[int, float], Union[int, float]]],
     dest_path: str,
     sort_output: bool,
 ) -> str:
@@ -380,7 +398,7 @@ def _source_pageset_to_parquet(
         source: Dict[str, Any]
            Contains the source data to be chunked. Represents a single
            file or table of some kind along with collected information about table.
-        pageset: Tuple[int, int]
+        pageset: Optional[Tuple[Union[int, float], Union[int, float]]]
            The pageset for chunking the data from source.
         dest_path: str
            Path to store the output data.
@@ -399,10 +417,13 @@ def _source_pageset_to_parquet(
 
     from cytotable.utils import (
         _duckdb_reader,
+        _extract_npz_to_parquet,
         _sqlite_mixed_type_query_to_parquet,
         _write_parquet_table_with_metadata,
     )
 
+    source_type = str(source["source_path"].suffix).lower()
+
     # attempt to build dest_path
     source_dest_path = (
         f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
@@ -410,6 +431,28 @@ def _source_pageset_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
+    # If we have npz files, we need to extract them in a specialized manner.
+    # See below for CSV and SQLite handling.
+    if source_type == ".npz":
+        return _extract_npz_to_parquet(
+            source_path=str(source["source_path"]),
+            dest_path=f"{source_dest_path}/{str(source['source_path'].stem)}.parquet",
+            tablenumber=source["tablenumber"],
+        )
+
+    elif pageset is None:
+        # if we have a `None` pageset and we're not using
+        # npz, then we have an exception (this shouldn't happen
+        # because we will need a pageset range to work with for
+        # table queries and npz files are handled above with
+        # the none case).
+        raise CytoTableException(
+            (
+                "No pageset range provided for source data"
+                " (required for non-NPZ datasets)."
+            )
+        )
+
     # build tablenumber segment addition (if necessary)
     tablenumber_sql = (
         # to become tablenumber in sql select later with bigint (8-byte integer)
@@ -439,11 +482,11 @@ def _source_pageset_to_parquet(
 
     # build output query and filepath base
     # (chunked output will append offset to keep output paths unique)
-    if
+    if source_type == ".csv":
         base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
 
-    elif
+    elif source_type == ".sqlite":
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
@@ -840,7 +883,7 @@ def _join_source_pageset(
     dest_path: str,
     joins: str,
     page_key: str,
-    pageset: Tuple[int, int],
+    pageset: Union[Tuple[int, int], None],
     sort_output: bool,
     drop_null: bool,
 ) -> str:
@@ -877,7 +920,7 @@ def _join_source_pageset(
         )
         SELECT *
         FROM joined
-        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+        {f"WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}" if pageset is not None else ""}
         /* optional sorting per pagset */
         {"ORDER BY " + page_key if sort_output else ""};
     """
@@ -902,11 +945,13 @@ def _join_source_pageset(
 
     result_file_path = (
         # store the result in the parent of the dest_path
-        f"{str(pathlib.Path(dest_path).parent)}/"
+        f"{str(pathlib.Path(dest_path).parent)}/" +
         # use the dest_path stem in the name
-        f"{str(pathlib.Path(dest_path).stem)}-"
+        f"{str(pathlib.Path(dest_path).stem)}-" +
         # add the pageset indication to the filename
         f"{pageset[0]}-{pageset[1]}.parquet"
+        if pageset is not None
+        else ".parquet"
     )
 
     # write the result
@@ -1001,9 +1046,9 @@ def _concat_join_sources(
 def _infer_source_group_common_schema(
     source_group: List[Dict[str, Any]],
     data_type_cast_map: Optional[Dict[str, str]] = None,
-) -> List[Tuple[str,
+) -> List[Tuple[str, pa.DataType]]:
     """
-    Infers a common schema for group of parquet files which may have
+    Infers a common schema for a group of parquet files which may have
     similar but slightly different schema or data. Intended to assist with
     data concatenation and other operations.
 
@@ -1015,9 +1060,8 @@ def _infer_source_group_common_schema(
            A dictionary mapping data type groups to specific types.
            Roughly includes Arrow data types language from:
            https://arrow.apache.org/docs/python/api/datatypes.html
-
     Returns:
-        List[Tuple[str,
+        List[Tuple[str, pa.DataType]]
            A list of tuples which includes column name and PyArrow datatype.
            This data will later be used as the basis for forming a PyArrow schema.
     """
@@ -1025,32 +1069,31 @@ def _infer_source_group_common_schema(
     import pyarrow as pa
     import pyarrow.parquet as parquet
 
-    from cytotable.
+    from cytotable.utils import map_pyarrow_type
 
-    #
+    # Read the first file to establish the base schema
     common_schema = parquet.read_schema(source_group[0]["table"][0])
 
-    #
+    # Infer the common schema by comparing all schemas in the group
     for schema in [
         parquet.read_schema(table)
         for source in source_group
         for table in source["table"]
     ]:
-        #
+        # Skip if the schema matches the common schema
         if schema.equals(common_schema):
             continue
 
-        #
+        # Gather field names from the schema
         schema_field_names = [item.name for item in schema]
 
-        #
+        # Reverse enumeration to avoid index shifting when removing fields
         for index, field in reversed(list(enumerate(common_schema))):
-            #
-            # note: because this only checks for naming, we defer to initially detected type
+            # Remove fields not present in the current schema
             if field.name not in schema_field_names:
                 common_schema = common_schema.remove(index)
 
-            #
+            # Handle null vs non-null type conflicts
             elif pa.types.is_null(field.type) and not pa.types.is_null(
                 schema.field(field.name).type
             ):
@@ -1058,37 +1101,44 @@ def _infer_source_group_common_schema(
                 index, field.with_type(schema.field(field.name).type)
             )
 
-            #
+            # Handle integer to float type conflicts
             elif pa.types.is_integer(field.type) and pa.types.is_floating(
                 schema.field(field.name).type
             ):
                 common_schema = common_schema.set(
                     index,
                     field.with_type(
-                        # use float64 as a default here if we aren't casting floats
                         pa.float64()
                         if data_type_cast_map is None
-
-
-
+                        else pa.type_for_alias(
+                            data_type_cast_map.get("float", "float64")
+                        )
                     ),
                 )
 
-
-
-
-
-
-
-
+            # Handle nested or complex types dynamically
+            else:
+                common_schema = common_schema.set(
+                    index,
+                    field.with_type(
+                        map_pyarrow_type(
+                            field_type=field.type, data_type_cast_map=data_type_cast_map
+                        )
+                    ),
+                )
 
-    #
-
-
-
-
+    # Validate the schema to ensure all types are valid PyArrow types
+    validated_schema = [
+        (
+            field.name,
+            map_pyarrow_type(
+                field_type=field.type, data_type_cast_map=data_type_cast_map
+            ),
         )
-
+        for field in common_schema
+    ]
+
+    return validated_schema
 
 
 def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
@@ -1185,9 +1235,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     matching_keys = [
         key for key in page_keys.keys() if key.lower() in source_group_name.lower()
     ]
-    if not matching_keys:
+    if not matching_keys and source_datatype != "npz":
         raise CytoTableException(
-            f"No matching key found in page_keys for source_group_name: {source_group_name}."
+            f"No matching key found in page_keys for source_group_name: {source_group_name}. "
             "Please include a pagination key based on a column name from the table."
         )
 
@@ -1198,11 +1248,16 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             source,
             **{
                 "page_key": (
-                    page_key :=
-
-
-
-
+                    page_key := next(
+                        (
+                            value
+                            for key, value in page_keys.items()
+                            if key.lower() in source_group_name.lower()
+                        ),
+                        # Placeholder value if no match is found
+                        # used in cases for .npz source types.
+                        "placeholder",
+                    )
                 ),
                 "pagesets": _get_table_keyset_pagination_sets(
                     source=source,
@@ -1598,4 +1653,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         **kwargs,
     )
 
+    # cleanup Parsl executor and related
+    parsl.dfk().cleanup()
+
     return output
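
The .npz handling above threads through the public conversion entry point. Below is a minimal usage sketch, not taken from this diff: it assumes that `cytotable.convert()` in 0.0.15 accepts these keyword arguments and that the new "deepprofiler" preset (added in presets.py below) is used; paths are hypothetical placeholders.

```python
# Minimal sketch (assumptions noted above): convert DeepProfiler .npz output to parquet.
import cytotable

result = cytotable.convert(
    source_path="./deepprofiler_run",    # directory containing DeepProfiler .npz files (placeholder)
    dest_path="./deepprofiler.parquet",  # output location (placeholder)
    dest_datatype="parquet",
    source_datatype="npz",               # routes sources through the new .npz branches above
    preset="deepprofiler",               # preset introduced in presets.py below
)
print(result)
```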

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/presets.py

@@ -317,6 +317,37 @@ config = {
         # and modified at runtime as needed
         "CONFIG_JOINS": "",
     },
+    "deepprofiler": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "deepprofiler": "v0.3.1",
+            "cellprofiler": "v4.2.x",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        # in the case of NPZ files, these sometimes
+        # include the name of the well or site
+        # but not the compartment, and as a result,
+        # we specify an empty tuple.
+        "CONFIG_NAMES_COMPARTMENTS": tuple(),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": tuple(),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": tuple(),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "join": "Metadata_Site",
+        },
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": "",
+    },
 }
 """
 Configuration presets for CytoTable
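
For reference, the new preset is reachable through the module-level `config` dictionary shown in the hunk header above. A small inspection sketch (assuming `cytotable.presets.config` is importable as in prior releases):

```python
# Small sketch: inspect the new "deepprofiler" preset entry added in this release.
from cytotable.presets import config

deepprofiler = config["deepprofiler"]
print(deepprofiler["CONFIG_PAGE_KEYS"])   # {'join': 'Metadata_Site'}
print(deepprofiler["CONFIG_CHUNK_SIZE"])  # 1000
```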

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/sources.py

@@ -36,7 +36,10 @@ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
 
     # set the client for a CloudPath
     if isinstance(processed_path, CloudPath):
-
+        # Create a new client instance with the provided kwargs
+        client = processed_path.client.__class__(**kwargs)
+        # Recreate the CloudPath object with the new client
+        processed_path = client.CloudPath(processed_path)
 
     return processed_path
 
@@ -75,7 +78,9 @@ def _get_source_filepaths(
             "A source_datatype must be specified when using undefined compartments and metadata names."
         )
 
-
+    source_datatypes = [".csv", ".npz", ".sqlite"]  # Default supported extensions
+
+    # Gather files from the provided path using compartments + metadata as a filter
     sources = [
         # build source_paths for all files
         # note: builds local cache for sqlite files from cloud
@@ -90,16 +95,22 @@ def _get_source_filepaths(
         )
         # ensure the subpaths meet certain specifications
         if (
-            targets
-
-
-
-
-
+            # If targets are specified, only include files matching targets
+            (
+                targets is not None
+                and str(subpath.stem).lower() in [target.lower() for target in targets]
+                or subpath.suffix.lower() == ".sqlite"
+            )
+            # Otherwise, include files matching the source_datatypes
+            or (
+                targets is None
+                or targets == []
+                and subpath.suffix.lower() in source_datatypes
+            )
         )
     ]
 
-    #
+    # Expand sources to include sqlite tables similarly to files (one entry per table)
     expanded_sources = []
     with _duckdb_reader() as ddb_reader:
         for element in sources:
@@ -118,8 +129,8 @@ def _get_source_filepaths(
             """
             /* perform query on sqlite_master table for metadata on tables */
             SELECT name as table_name
-
-
+            FROM sqlite_scan(?, 'sqlite_master')
+            WHERE type='table'
             """,
             parameters=[str(element["source_path"])],
         )
@@ -153,10 +164,14 @@ def _get_source_filepaths(
                 # use lowercase version of the path to infer a commonprefix
                 source["source_path"].stem.lower()
                 for source in sources
-                if source["source_path"].suffix
+                if source["source_path"].suffix in source_datatypes
             ]
         )
-        grouped_sources[
+        grouped_sources[
+            # construct a grouped source name, deferring to use 'all_files'
+            # if no common prefix is found.
+            f"{common_prefix if common_prefix != '' else 'all_files'}.{source_datatype}"
+        ] = sources
 
     # otherwise, use the unique names in the paths to determine source grouping
     else:
@@ -283,7 +298,7 @@ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
 
     # if we don't have a sqlite file
     # (we can't check sqlite files for lines)
-    if path.suffix.lower()
+    if path.suffix.lower() not in [".sqlite", ".npz"]:
         with path.open("r") as f:
             try:
                 # read two lines, if the second is empty return false
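
The default-extension filter introduced above can be illustrated in isolation. The following standalone sketch mirrors that logic (it is not CytoTable's internal code) using a few hypothetical filenames:

```python
# Standalone illustration of the default extension filter added above.
import pathlib

source_datatypes = [".csv", ".npz", ".sqlite"]  # default supported extensions
candidates = [
    pathlib.Path("plate1/Cells.csv"),
    pathlib.Path("plate1/A01_1.npz"),
    pathlib.Path("plate1/notes.txt"),
]
kept = [p for p in candidates if p.suffix.lower() in source_datatypes]
print(kept)  # keeps the .csv and .npz entries, drops notes.txt
```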

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/utils.py

@@ -196,7 +196,7 @@ def _sqlite_mixed_type_query_to_parquet(
             The name of the table being queried.
         page_key: str:
             The column name to be used to identify pagination chunks.
-        pageset: Tuple[int, int]:
+        pageset: Tuple[Union[int, float], Union[int, float]]:
             The range for values used for paginating data from source.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
@@ -336,7 +336,7 @@ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
     if (
         isinstance(path, CloudPath)
         and path.is_file()
-        and path.suffix.lower()
+        and path.suffix.lower() in [".sqlite", ".npz"]
     ):
         try:
             # update the path to be the local filepath for reference in CytoTable ops
@@ -706,3 +706,179 @@ def _natural_sort(list_to_sort):
             for c in re.split("([0-9]+)", str(key))
         ],
     )
+
+
+def _extract_npz_to_parquet(
+    source_path: str,
+    dest_path: str,
+    tablenumber: Optional[int] = None,
+) -> str:
+    """
+    Extract data from an .npz file created by DeepProfiler
+    as a tabular dataset and write to parquet.
+
+    DeepProfiler creates datasets which look somewhat like this:
+    Keys in the .npz file: ['features', 'metadata', 'locations']
+
+    Variable: features
+    Shape: (229, 6400)
+    Data type: float32
+
+    Variable: locations
+    Shape: (229, 2)
+    Data type: float64
+
+    Variable: metadata
+    Shape: ()
+    Data type: object
+    Whole object: {
+        'Metadata_Plate': 'SQ00014812',
+        'Metadata_Well': 'A01',
+        'Metadata_Site': 1,
+        'Plate_Map_Name': 'C-7161-01-LM6-022',
+        'RNA': 'SQ00014812/r01c01f01p01-ch3sk1fk1fl1.png',
+        'ER': 'SQ00014812/r01c01f01p01-ch2sk1fk1fl1.png',
+        'AGP': 'SQ00014812/r01c01f01p01-ch4sk1fk1fl1.png',
+        'Mito': 'SQ00014812/r01c01f01p01-ch5sk1fk1fl1.png',
+        'DNA': 'SQ00014812/r01c01f01p01-ch1sk1fk1fl1.png',
+        'Treatment_ID': 0,
+        'Treatment_Replicate': 1,
+        'Treatment': 'DMSO@NA',
+        'Compound': 'DMSO',
+        'Concentration': '',
+        'Split': 'Training',
+        'Metadata_Model': 'efficientnet'
+    }
+
+    Args:
+        source_path: str
+            Path to the .npz file.
+        dest_path: str
+            Destination path for the parquet file.
+        tablenumber: Optional[int]
+            Optional tablenumber to be added to the data.
+
+    Returns:
+        str
+            Path to the exported parquet file.
+    """
+
+    import pathlib
+
+    import numpy as np
+    import pyarrow as pa
+    import pyarrow.parquet as parquet
+
+    # Load features from the .npz file
+    with open(source_path, "rb") as data:
+        loaded_npz = np.load(file=data, allow_pickle=True)
+        # find the shape of the features, which will help structure
+        # data which doesn't yet conform to the same shape (by row count).
+        rows = loaded_npz["features"].shape[0]
+        # note: we use [()] to load the numpy array as a python dict
+        metadata = loaded_npz["metadata"][()]
+        # fetch the metadata model name, falling back to "DP" if not found
+        feature_prefix = metadata.get("Metadata_Model", "DP")
+        # we transpose the feature data for more efficient
+        # columnar-focused access
+        feature_data = loaded_npz["features"].T
+
+    npz_as_pydict = {
+        # add metadata to the table
+        # note: metadata within npz files corresponds to a dictionary of
+        # various keys and values related to the feature and location data.
+        "Metadata_TableNumber": pa.array([tablenumber] * rows, type=pa.int64()),
+        "Metadata_NPZSource": pa.array(
+            [pathlib.Path(source_path).name] * rows, type=pa.string()
+        ),
+        **{key: [metadata[key]] * rows for key in metadata.keys()},
+        # add locations data to the table
+        "Location_Center_X": [loaded_npz["locations"][i][0] for i in range(rows)],
+        "Location_Center_Y": [loaded_npz["locations"][i][1] for i in range(rows)],
+        # add features data to the table
+        **{
+            f"{feature_prefix}_{feature_idx + 1}": feature_data[feature_idx]
+            for feature_idx in range(feature_data.shape[0])
+        },
+    }
+
+    # convert the numpy arrays to a PyArrow table and write to parquet
+    parquet.write_table(pa.Table.from_pydict(npz_as_pydict), dest_path)
+
+    return dest_path
+
+
+def map_pyarrow_type(
+    field_type: pa.DataType, data_type_cast_map: Optional[Dict[str, str]]
+) -> pa.DataType:
+    """
+    Map PyArrow types dynamically to handle nested types and casting.
+
+    This function takes a PyArrow `field_type` and dynamically maps
+    it to a valid PyArrow type, handling nested types (e.g., lists,
+    structs) and resolving type conflicts (e.g., integer to float).
+    It also supports custom type casting using the
+    `data_type_cast_map` parameter.
+
+    Args:
+        field_type: pa.DataType
+            The PyArrow data type to be mapped.
+            This can include simple types (e.g., int, float, string)
+            or nested types (e.g., list, struct).
+        data_type_cast_map: Optional[Dict[str, str]], default None
+            A dictionary mapping data type groups to specific types.
+            This allows for custom type casting.
+            For example:
+            - {"float": "float32"} maps
+            floating-point types to `float32`.
+            - {"int": "int64"} maps integer
+            types to `int64`.
+            If `data_type_cast_map` is
+            None, default PyArrow types are used.
+
+    Returns:
+        pa.DataType
+            The mapped PyArrow data type.
+            If no mapping is needed, the original
+            `field_type` is returned.
+    """
+
+    if pa.types.is_list(field_type):
+        # Handle list types (e.g., list<element: float>)
+        return pa.list_(
+            map_pyarrow_type(
+                field_type=field_type.value_type, data_type_cast_map=data_type_cast_map
+            )
+        )
+    elif pa.types.is_struct(field_type):
+        # Handle struct types recursively
+        return pa.struct(
+            [
+                (
+                    field.name,
+                    map_pyarrow_type(
+                        field_type=field.type, data_type_cast_map=data_type_cast_map
+                    ),
+                )
+                for field in field_type
+            ]
+        )
+    elif pa.types.is_floating(field_type):
+        # Handle floating-point types
+        if data_type_cast_map and "float" in data_type_cast_map:
+            return pa.type_for_alias(data_type_cast_map["float"])
+        return pa.float64()  # Default to float64 if no mapping is provided
+    elif pa.types.is_integer(field_type):
+        # Handle integer types
+        if data_type_cast_map and "integer" in data_type_cast_map:
+            return pa.type_for_alias(data_type_cast_map["integer"])
+        return pa.int64()  # Default to int64 if no mapping is provided
+    elif pa.types.is_string(field_type):
+        # Handle string types
+        return pa.string()
+    elif pa.types.is_null(field_type):
+        # Handle null types
+        return pa.null()
+    else:
+        # Default to the original type if no mapping is needed
+        return field_type
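
A short sketch exercising the `map_pyarrow_type` helper added above (it is imported from `cytotable.utils` in convert.py, so the import path below follows from this diff):

```python
# Sketch: resolve a nested list type through map_pyarrow_type with a cast map.
import pyarrow as pa
from cytotable.utils import map_pyarrow_type

mapped = map_pyarrow_type(
    field_type=pa.list_(pa.float64()),
    data_type_cast_map={"float": "float32"},
)
print(mapped)  # list<item: float>, i.e. a list of float32
```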

{cytotable-0.0.13 → cytotable-0.0.15}/pyproject.toml

@@ -5,7 +5,7 @@ requires = [ "poetry-core>=1", "poetry-dynamic-versioning>=1,<2" ]
 [tool.poetry]
 name = "CytoTable"
 # note: version data is maintained by poetry-dynamic-versioning (do not edit)
-version = "0.0.13"
+version = "0.0.15"
 description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
 authors = [ "Cytomining Community" ]
 license = "BSD-3-Clause License"
@@ -18,7 +18,7 @@ keywords = [ "python", "cellprofiler", "single-cell-analysis", "way-lab" ]
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 pyarrow = ">=13.0.0"
-cloudpathlib = { extras = [ "all", "s3" ], version = "
+cloudpathlib = { extras = [ "all", "s3" ], version = ">=0.18,<0.22" }
 duckdb = ">=0.8.0,!=0.10.0,>=0.10.1"
 parsl = ">=2023.9.25"
 numpy = [

{cytotable-0.0.13 → cytotable-0.0.15}/readme.md

@@ -2,6 +2,11 @@
 
 # CytoTable
 
+
+[](https://github.com/cytomining/cytotable/actions/workflows/test.yml?query=branch%3Amain)
+[](https://python-poetry.org/)
+[](https://doi.org/10.5281/zenodo.14888111)
+
 
 _Diagram showing data flow relative to this project._
 

{cytotable-0.0.13 → cytotable-0.0.15}/LICENSE: file without changes
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/constants.py: file without changes
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/exceptions.py: file without changes