CytoTable 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/convert.py +11 -18
- cytotable/presets.py +48 -0
- cytotable/sources.py +45 -16
- cytotable/utils.py +12 -7
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/METADATA +2 -2
- cytotable-0.0.9.dist-info/RECORD +11 -0
- cytotable-0.0.8.dist-info/RECORD +0 -11
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/LICENSE +0 -0
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/WHEEL +0 -0
cytotable/__init__.py
CHANGED
cytotable/convert.py
CHANGED
@@ -46,11 +46,12 @@ def _get_table_columns_and_types(
|
|
46
46
|
import pathlib
|
47
47
|
|
48
48
|
import duckdb
|
49
|
+
from cloudpathlib import AnyPath
|
49
50
|
|
50
51
|
from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
|
51
52
|
|
52
53
|
source_path = source["source_path"]
|
53
|
-
source_type = str(
|
54
|
+
source_type = str(source_path.suffix).lower()
|
54
55
|
|
55
56
|
# prepare the data source in the form of a duckdb query
|
56
57
|
select_source = (
|
@@ -209,7 +210,7 @@ def _get_table_chunk_offsets(
|
|
209
210
|
import pathlib
|
210
211
|
|
211
212
|
import duckdb
|
212
|
-
from cloudpathlib import AnyPath
|
213
|
+
from cloudpathlib import AnyPath, CloudPath
|
213
214
|
|
214
215
|
from cytotable.exceptions import NoInputDataException
|
215
216
|
from cytotable.utils import _duckdb_reader
|
@@ -219,18 +220,9 @@ def _get_table_chunk_offsets(
|
|
219
220
|
if source is not None:
|
220
221
|
table_name = source["table_name"] if "table_name" in source.keys() else None
|
221
222
|
source_path = source["source_path"]
|
222
|
-
source_type = str(
|
223
|
+
source_type = str(source_path.suffix).lower()
|
223
224
|
|
224
225
|
try:
|
225
|
-
# for csv's, check that we have more than one row (a header and data values)
|
226
|
-
if (
|
227
|
-
source_type == ".csv"
|
228
|
-
and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
|
229
|
-
):
|
230
|
-
raise NoInputDataException(
|
231
|
-
f"Data file has 0 rows of values. Error in file: {source_path}"
|
232
|
-
)
|
233
|
-
|
234
226
|
# gather the total rowcount from csv or sqlite data input sources
|
235
227
|
with _duckdb_reader() as ddb_reader:
|
236
228
|
rowcount = int(
|
@@ -322,8 +314,8 @@ def _source_chunk_to_parquet(
|
|
322
314
|
|
323
315
|
# attempt to build dest_path
|
324
316
|
source_dest_path = (
|
325
|
-
f"{dest_path}/{str(
|
326
|
-
f"{str(
|
317
|
+
f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
|
318
|
+
f"{str(source['source_path'].parent.name).lower()}"
|
327
319
|
)
|
328
320
|
pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
|
329
321
|
|
@@ -364,11 +356,11 @@ def _source_chunk_to_parquet(
|
|
364
356
|
|
365
357
|
# build output query and filepath base
|
366
358
|
# (chunked output will append offset to keep output paths unique)
|
367
|
-
if str(
|
359
|
+
if str(source["source_path"].suffix).lower() == ".csv":
|
368
360
|
base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
|
369
361
|
result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
|
370
362
|
|
371
|
-
elif str(
|
363
|
+
elif str(source["source_path"].suffix).lower() == ".sqlite":
|
372
364
|
base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
|
373
365
|
result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
|
374
366
|
|
@@ -405,7 +397,7 @@ def _source_chunk_to_parquet(
|
|
405
397
|
# to handle the mixed types
|
406
398
|
if (
|
407
399
|
"Mismatch Type Error" in str(e)
|
408
|
-
and str(
|
400
|
+
and str(source["source_path"].suffix).lower() == ".sqlite"
|
409
401
|
):
|
410
402
|
_write_parquet_table_with_metadata(
|
411
403
|
# here we use sqlite instead of duckdb to extract
|
@@ -817,6 +809,7 @@ def _join_source_chunk(
|
|
817
809
|
exclude_meta_cols = [
|
818
810
|
f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
|
819
811
|
]
|
812
|
+
|
820
813
|
with _duckdb_reader() as ddb_reader:
|
821
814
|
result = ddb_reader.execute(
|
822
815
|
f"""
|
@@ -1114,7 +1107,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
1114
1107
|
else []
|
1115
1108
|
),
|
1116
1109
|
**kwargs,
|
1117
|
-
)
|
1110
|
+
)
|
1118
1111
|
|
1119
1112
|
# expand the destination path
|
1120
1113
|
expanded_dest_path = _expand_path(path=dest_path)
|
cytotable/presets.py
CHANGED
@@ -85,6 +85,54 @@ config = {
|
|
85
85
|
AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
|
86
86
|
""",
|
87
87
|
},
|
88
|
+
"cellprofiler_sqlite_cpg0016_jump": {
|
89
|
+
# version specifications using related references
|
90
|
+
"CONFIG_SOURCE_VERSION": {
|
91
|
+
"cellprofiler": "v4.0.0",
|
92
|
+
},
|
93
|
+
# names of source table compartments (for ex. cells.csv, etc.)
|
94
|
+
"CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
|
95
|
+
# names of source table metadata (for ex. image.csv, etc.)
|
96
|
+
"CONFIG_NAMES_METADATA": ("image",),
|
97
|
+
# column names in any compartment or metadata tables which contain
|
98
|
+
# unique names to avoid renaming
|
99
|
+
"CONFIG_IDENTIFYING_COLUMNS": (
|
100
|
+
"ImageNumber",
|
101
|
+
"ObjectNumber",
|
102
|
+
"Metadata_Well",
|
103
|
+
"Metadata_Plate",
|
104
|
+
"Parent_Cells",
|
105
|
+
"Parent_Nuclei",
|
106
|
+
),
|
107
|
+
# chunk size to use for join operations to help with possible performance issues
|
108
|
+
# note: this number is an estimate and may need changes contingent on data
|
109
|
+
# and system used by this library.
|
110
|
+
"CONFIG_CHUNK_SIZE": 1000,
|
111
|
+
# compartment and metadata joins performed using DuckDB SQL
|
112
|
+
# and modified at runtime as needed
|
113
|
+
"CONFIG_JOINS": """
|
114
|
+
SELECT
|
115
|
+
image.Image_TableNumber,
|
116
|
+
image.Metadata_ImageNumber,
|
117
|
+
image.Metadata_Plate,
|
118
|
+
image.Metadata_Well,
|
119
|
+
image.Image_Metadata_Site,
|
120
|
+
image.Image_Metadata_Row,
|
121
|
+
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
|
122
|
+
cells.* EXCLUDE (Metadata_ImageNumber),
|
123
|
+
nuclei.* EXCLUDE (Metadata_ImageNumber)
|
124
|
+
FROM
|
125
|
+
read_parquet('cytoplasm.parquet') AS cytoplasm
|
126
|
+
LEFT JOIN read_parquet('cells.parquet') AS cells ON
|
127
|
+
cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
128
|
+
AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
|
129
|
+
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
|
130
|
+
nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
131
|
+
AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
|
132
|
+
LEFT JOIN read_parquet('image.parquet') AS image ON
|
133
|
+
image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
134
|
+
""",
|
135
|
+
},
|
88
136
|
"cellprofiler_sqlite_pycytominer": {
|
89
137
|
# version specifications using related references
|
90
138
|
"CONFIG_SOURCE_VERSION": {
|
cytotable/sources.py
CHANGED
@@ -7,13 +7,11 @@ import pathlib
|
|
7
7
|
from typing import Any, Dict, List, Optional, Union
|
8
8
|
|
9
9
|
from cloudpathlib import AnyPath
|
10
|
-
from parsl.app.app import join_app, python_app
|
11
10
|
|
11
|
+
from cytotable.exceptions import NoInputDataException
|
12
12
|
|
13
|
-
|
14
|
-
def _build_path(
|
15
|
-
path: Union[str, pathlib.Path, AnyPath], **kwargs
|
16
|
-
) -> Union[pathlib.Path, AnyPath]:
|
13
|
+
|
14
|
+
def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
|
17
15
|
"""
|
18
16
|
Build a path client or return local path.
|
19
17
|
|
@@ -43,10 +41,9 @@ def _build_path(
|
|
43
41
|
return processed_path
|
44
42
|
|
45
43
|
|
46
|
-
@python_app
|
47
44
|
def _get_source_filepaths(
|
48
45
|
path: Union[pathlib.Path, AnyPath],
|
49
|
-
targets: List[str],
|
46
|
+
targets: Optional[List[str]] = None,
|
50
47
|
source_datatype: Optional[str] = None,
|
51
48
|
) -> Dict[str, List[Dict[str, Any]]]:
|
52
49
|
"""
|
@@ -75,7 +72,7 @@ def _get_source_filepaths(
|
|
75
72
|
|
76
73
|
if (targets is None or targets == []) and source_datatype is None:
|
77
74
|
raise DatatypeException(
|
78
|
-
|
75
|
+
"A source_datatype must be specified when using undefined compartments and metadata names."
|
79
76
|
)
|
80
77
|
|
81
78
|
# gathers files from provided path using compartments + metadata as a filter
|
@@ -87,9 +84,9 @@ def _get_source_filepaths(
|
|
87
84
|
for subpath in (
|
88
85
|
(path,)
|
89
86
|
# used if the source path is a single file
|
90
|
-
if
|
87
|
+
if path.is_file()
|
91
88
|
# iterates through a source directory
|
92
|
-
else (x for x in
|
89
|
+
else (x for x in path.glob("**/*") if x.is_file())
|
93
90
|
)
|
94
91
|
# ensure the subpaths meet certain specifications
|
95
92
|
if (
|
@@ -129,7 +126,8 @@ def _get_source_filepaths(
|
|
129
126
|
.arrow()["table_name"]
|
130
127
|
.to_pylist()
|
131
128
|
# make sure the table names match with compartment + metadata names
|
132
|
-
if
|
129
|
+
if targets is not None
|
130
|
+
and any(target.lower() in table_name.lower() for target in targets)
|
133
131
|
]
|
134
132
|
else:
|
135
133
|
# if we don't have sqlite source, append the existing element
|
@@ -181,7 +179,6 @@ def _get_source_filepaths(
|
|
181
179
|
return grouped_sources
|
182
180
|
|
183
181
|
|
184
|
-
@python_app
|
185
182
|
def _infer_source_datatype(
|
186
183
|
sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
|
187
184
|
) -> str:
|
@@ -230,7 +227,6 @@ def _infer_source_datatype(
|
|
230
227
|
return source_datatype
|
231
228
|
|
232
229
|
|
233
|
-
@python_app
|
234
230
|
def _filter_source_filepaths(
|
235
231
|
sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
|
236
232
|
) -> Dict[str, List[Dict[str, Any]]]:
|
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
|
|
260
256
|
if file["source_path"].stat().st_size > 0
|
261
257
|
# ensure the datatype matches the source datatype
|
262
258
|
and file["source_path"].suffix == f".{source_datatype}"
|
259
|
+
and _file_is_more_than_one_line(path=file["source_path"])
|
263
260
|
]
|
264
261
|
for filegroup, files in sources.items()
|
265
262
|
}
|
266
263
|
|
267
264
|
|
268
|
-
|
265
|
+
def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
|
266
|
+
"""
|
267
|
+
Check if the file has more than one line.
|
268
|
+
|
269
|
+
Args:
|
270
|
+
path (Union[pathlib.Path, AnyPath]):
|
271
|
+
The path to the file.
|
272
|
+
|
273
|
+
Returns:
|
274
|
+
bool:
|
275
|
+
True if the file has more than one line, False otherwise.
|
276
|
+
|
277
|
+
Raises:
|
278
|
+
NoInputDataException: If the file has zero lines.
|
279
|
+
"""
|
280
|
+
|
281
|
+
# if we don't have a sqlite file
|
282
|
+
# (we can't check sqlite files for lines)
|
283
|
+
if path.suffix.lower() != ".sqlite":
|
284
|
+
with path.open("r") as f:
|
285
|
+
try:
|
286
|
+
# read two lines, if the second is empty return false
|
287
|
+
return bool(f.readline() and f.readline())
|
288
|
+
|
289
|
+
except StopIteration:
|
290
|
+
# If we encounter the end of the file, it has only one line
|
291
|
+
raise NoInputDataException(
|
292
|
+
f"Data file has 0 rows of values. Error in file: {path}"
|
293
|
+
)
|
294
|
+
else:
|
295
|
+
return True
|
296
|
+
|
297
|
+
|
269
298
|
def _gather_sources(
|
270
299
|
source_path: str,
|
271
300
|
source_datatype: Optional[str] = None,
|
@@ -295,11 +324,11 @@ def _gather_sources(
|
|
295
324
|
_infer_source_datatype,
|
296
325
|
)
|
297
326
|
|
298
|
-
|
327
|
+
built_path = _build_path(path=source_path, **kwargs)
|
299
328
|
|
300
329
|
# gather filepaths which will be used as the basis for this work
|
301
330
|
sources = _get_source_filepaths(
|
302
|
-
path=
|
331
|
+
path=built_path, targets=targets, source_datatype=source_datatype
|
303
332
|
)
|
304
333
|
|
305
334
|
# infer or validate the source datatype based on source filepaths
|
cytotable/utils.py
CHANGED
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
|
|
149
149
|
INSTALL sqlite_scanner;
|
150
150
|
LOAD sqlite_scanner;
|
151
151
|
|
152
|
+
/* Install httpfs plugin to avoid error
|
153
|
+
https://github.com/duckdb/duckdb/issues/3243 */
|
154
|
+
INSTALL httpfs;
|
155
|
+
|
152
156
|
/*
|
153
157
|
Set threads available to duckdb
|
154
158
|
See the following for more information:
|
@@ -322,7 +326,7 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
322
326
|
return pa.Table.from_pylist(results)
|
323
327
|
|
324
328
|
|
325
|
-
def _cache_cloudpath_to_local(path:
|
329
|
+
def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
|
326
330
|
"""
|
327
331
|
Takes a cloudpath and uses cache to convert to a local copy
|
328
332
|
for use in scenarios where remote work is not possible (sqlite).
|
@@ -337,24 +341,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
|
|
337
341
|
A local pathlib.Path to cached version of cloudpath file.
|
338
342
|
"""
|
339
343
|
|
340
|
-
candidate_path = AnyPath(path)
|
341
|
-
|
342
344
|
# check that the path is a file (caching won't work with a dir)
|
343
345
|
# and check that the file is of sqlite type
|
344
346
|
# (other file types will be handled remotely in cloud)
|
345
|
-
if
|
347
|
+
if (
|
348
|
+
isinstance(path, CloudPath)
|
349
|
+
and path.is_file()
|
350
|
+
and path.suffix.lower() == ".sqlite"
|
351
|
+
):
|
346
352
|
try:
|
347
353
|
# update the path to be the local filepath for reference in CytoTable ops
|
348
354
|
# note: incurs a data read which will trigger caching of the file
|
349
|
-
path =
|
355
|
+
path = pathlib.Path(path.fspath)
|
350
356
|
except InvalidPrefixError:
|
351
357
|
# share information about not finding a cloud path
|
352
358
|
logger.info(
|
353
359
|
"Did not detect a cloud path based on prefix. Defaulting to use local path operations."
|
354
360
|
)
|
355
361
|
|
356
|
-
|
357
|
-
return pathlib.Path(path)
|
362
|
+
return path
|
358
363
|
|
359
364
|
|
360
365
|
def _arrow_type_cast_if_specified(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: CytoTable
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
|
5
5
|
Home-page: https://github.com/cytomining/CytoTable
|
6
6
|
License: BSD-3-Clause License
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.10
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
17
|
-
Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
|
17
|
+
Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
|
18
18
|
Requires-Dist: duckdb (>=0.10.1)
|
19
19
|
Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
|
20
20
|
Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
|
2
|
+
cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
|
3
|
+
cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
|
4
|
+
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
+
cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
|
6
|
+
cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
|
7
|
+
cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
|
8
|
+
cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
+
cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
|
10
|
+
cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
11
|
+
cytotable-0.0.9.dist-info/RECORD,,
|
cytotable-0.0.8.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
|
2
|
-
cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
|
3
|
-
cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
|
4
|
-
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
-
cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
|
6
|
-
cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
|
7
|
-
cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
|
8
|
-
cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
-
cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
|
10
|
-
cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
11
|
-
cytotable-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|