CytoTable 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cytotable-0.0.2 → cytotable-0.0.4}/PKG-INFO +19 -7
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/__init__.py +4 -0
- cytotable-0.0.4/cytotable/constants.py +74 -0
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/convert.py +127 -153
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/presets.py +30 -10
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/sources.py +47 -18
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/utils.py +74 -75
- {cytotable-0.0.2 → cytotable-0.0.4}/pyproject.toml +18 -9
- {cytotable-0.0.2 → cytotable-0.0.4}/readme.md +14 -3
- {cytotable-0.0.2 → cytotable-0.0.4}/LICENSE +0 -0
- {cytotable-0.0.2 → cytotable-0.0.4}/cytotable/exceptions.py +0 -0
{cytotable-0.0.2 → cytotable-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.2
+Version: 0.0.4
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -13,10 +13,11 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
-Requires-Dist: duckdb (>=0.8.0…
-Requires-Dist: parsl (>=2023.9.…
-Requires-Dist: pyarrow (>=13.0.0…
+Requires-Dist: duckdb (>=0.8.0)
+Requires-Dist: parsl (>=2023.9.25)
+Requires-Dist: pyarrow (>=13.0.0)
 Project-URL: Documentation, https://cytomining.github.io/CytoTable/
 Project-URL: Repository, https://github.com/cytomining/CytoTable
 Description-Content-Type: text/markdown
@@ -25,20 +26,31 @@ Description-Content-Type: text/markdown
 
 # CytoTable
 
-![dataflow diagram](…)
+![dataflow diagram](…)
 _Diagram showing data flow relative to this project._
 
 ## Summary
 
-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
 CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
 The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
 
+The name for the project is inspired from:
+
+- __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+- __Table__:
+  - "1. Furniture with a top surface to accommodate a variety of uses."
+  - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
 ## Installation
 
-Install CytoTable
+Install CytoTable from [PyPI](https://pypi.org/) or from source:
 
 ```shell
+# install from pypi
+pip install cytotable
+
+# install directly from source
 pip install git+https://github.com/cytomining/CytoTable.git
 ```
 
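For quick verification of the new metadata after upgrading, a small check using only the standard library (illustrative, not part of the package diff above):

```python
# Confirm which CytoTable release and dependency pins are installed locally.
from importlib.metadata import requires, version

print(version("cytotable"))           # expected to print 0.0.4 for this release
for requirement in requires("cytotable") or []:
    print(requirement)                # e.g. duckdb (>=0.8.0), parsl (>=2023.9.25), ...
```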
cytotable-0.0.4/cytotable/constants.py (new file)

@@ -0,0 +1,74 @@
+"""
+CytoTable: constants - storing various constants to be used throughout cytotable.
+"""
+
+import multiprocessing
+import os
+from typing import cast
+
+from cytotable.utils import _get_cytotable_version
+
+# read max threads from environment if necessary
+# max threads will be used with default Parsl config and Duckdb
+MAX_THREADS = (
+    multiprocessing.cpu_count()
+    if "CYTOTABLE_MAX_THREADS" not in os.environ
+    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
+)
+
+# enables overriding default memory mapping behavior with pyarrow memory mapping
+CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
+    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
+)
+
+DDB_DATA_TYPE_SYNONYMS = {
+    "real": ["float32", "float4", "float"],
+    "double": ["float64", "float8", "numeric", "decimal"],
+    "integer": ["int32", "int4", "int", "signed"],
+    "bigint": ["int64", "int8", "long"],
+}
+
+# A reference dictionary for SQLite affinity and storage class types
+# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
+SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
+    "integer": [
+        "int",
+        "integer",
+        "tinyint",
+        "smallint",
+        "mediumint",
+        "bigint",
+        "unsigned big int",
+        "int2",
+        "int8",
+    ],
+    "text": [
+        "character",
+        "varchar",
+        "varying character",
+        "nchar",
+        "native character",
+        "nvarchar",
+        "text",
+        "clob",
+    ],
+    "blob": ["blob"],
+    "real": [
+        "real",
+        "double",
+        "double precision",
+        "float",
+    ],
+    "numeric": [
+        "numeric",
+        "decimal",
+        "boolean",
+        "date",
+        "datetime",
+    ],
+}
+
+CYTOTABLE_DEFAULT_PARQUET_METADATA = {
+    "data-producer": "https://github.com/cytomining/CytoTable",
+    "data-producer-version": str(_get_cytotable_version()),
+}
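The two environment variables referenced above are read once at import time. A minimal usage sketch with hypothetical values:

```python
import os

# Hypothetical values shown purely for illustration.
os.environ["CYTOTABLE_MAX_THREADS"] = "4"               # cap threads used by DuckDB / the default Parsl config
os.environ["CYTOTABLE_ARROW_USE_MEMORY_MAPPING"] = "0"  # disable pyarrow memory mapping

# The constants are evaluated at import time, so set the variables first.
from cytotable.constants import (
    CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
    MAX_THREADS,
)

print(MAX_THREADS, CYTOTABLE_ARROW_USE_MEMORY_MAPPING)  # -> 4 False
```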
{cytotable-0.0.2 → cytotable-0.0.4}/cytotable/convert.py

@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
             segment_type as column_dtype
         FROM pragma_storage_info('column_details')
         /* avoid duplicate entries in the form of VALIDITY segment_types */
-        WHERE segment_type != 'VALIDITY'
+        WHERE segment_type != 'VALIDITY'
+        /* explicitly order the columns by their id to avoid inconsistent results */
+        ORDER BY column_id ASC;
     """
 
     # attempt to read the data to parquet from duckdb
@@ -175,8 +177,9 @@ def _prep_cast_column_data_types(
 
 @python_app
 def _get_table_chunk_offsets(
-    source: Dict[str, Any],
     chunk_size: int,
+    source: Optional[Dict[str, Any]] = None,
+    sql_stmt: Optional[str] = None,
 ) -> Union[List[int], None]:
     """
     Get table data chunk offsets for later use in capturing segments
@@ -207,39 +210,54 @@ def _get_table_chunk_offsets(
 
     logger = logging.getLogger(__name__)
 
-    … (3 lines not captured in this view)
+    if source is not None:
+        table_name = source["table_name"] if "table_name" in source.keys() else None
+        source_path = source["source_path"]
+        source_type = str(pathlib.Path(source_path).suffix).lower()
 
-    … (8 lines not captured in this view)
+        try:
+            # for csv's, check that we have more than one row (a header and data values)
+            if (
+                source_type == ".csv"
+                and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
+            ):
+                raise NoInputDataException(
+                    f"Data file has 0 rows of values. Error in file: {source_path}"
+                )
+
+            # gather the total rowcount from csv or sqlite data input sources
+            with _duckdb_reader() as ddb_reader:
+                rowcount = int(
+                    ddb_reader.execute(
+                        # nosec
+                        f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
+                        if source_type == ".csv"
+                        else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+                    ).fetchone()[0]
+                )
+
+        # catch input errors which will result in skipped files
+        except (
+            duckdb.InvalidInputException,
+            NoInputDataException,
+        ) as invalid_input_exc:
+            logger.warning(
+                msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
             )
 
+            return None
+
+    # find chunk offsets from sql statement
+    elif sql_stmt is not None:
         # gather the total rowcount from csv or sqlite data input sources
         with _duckdb_reader() as ddb_reader:
             rowcount = int(
                 ddb_reader.execute(
                     # nosec
-                    f"SELECT COUNT(*) …
-                    if source_type == ".csv"
-                    else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+                    f"SELECT COUNT(*) FROM ({sql_stmt})"
                 ).fetchone()[0]
             )
 
-    # catch input errors which will result in skipped files
-    except (duckdb.InvalidInputException, NoInputDataException) as invalid_input_exc:
-        logger.warning(
-            msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
-        )
-
-        return None
-
     return list(
         range(
             0,
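The tail of the function is unchanged beyond the `return list(range(0, ...))` context shown above, so the offsets reduce to simple range arithmetic once a row count is known (whether it came from a CSV, a SQLite table, or a prepared SQL statement). A standalone sketch:

```python
# Sketch of the offset arithmetic the task performs once a row count is known.
def chunk_offsets(rowcount: int, chunk_size: int) -> list:
    """Offsets later used as OFFSET values for chunked reads."""
    return list(range(0, rowcount, chunk_size))

print(chunk_offsets(rowcount=2503, chunk_size=1000))  # [0, 1000, 2000]
```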
@@ -258,7 +276,6 @@ def _source_chunk_to_parquet(
     chunk_size: int,
     offset: int,
     dest_path: str,
-    data_type_cast_map: Optional[Dict[str, str]] = None,
 ) -> str:
     """
     Export source data to chunked parquet file using chunk size and offsets.
@@ -287,7 +304,11 @@ def _source_chunk_to_parquet(
     from cloudpathlib import AnyPath
     from pyarrow import parquet
 
-    from cytotable.utils import …
+    from cytotable.utils import (
+        _duckdb_reader,
+        _sqlite_mixed_type_query_to_parquet,
+        _write_parquet_table_with_metadata,
+    )
 
     # attempt to build dest_path
     source_dest_path = (
@@ -300,7 +321,7 @@ def _source_chunk_to_parquet(
     select_columns = ",".join(
         [
            # here we cast the column to the specified type ensure the colname remains the same
-            f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+            f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
            for column in source["columns"]
         ]
     )
@@ -324,7 +345,7 @@ def _source_chunk_to_parquet(
         # read data with chunk size + offset
         # and export to parquet
         with _duckdb_reader() as ddb_reader:
-            …
+            _write_parquet_table_with_metadata(
                 table=ddb_reader.execute(
                     f"""
                     {base_query}
@@ -343,7 +364,7 @@ def _source_chunk_to_parquet(
             "Mismatch Type Error" in str(e)
             and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
         ):
-            …
+            _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
                 # data for special cases where column and value types
                 # may not align (which is valid functionality in SQLite).
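The quoting change matters for column names containing spaces or mixed case (for example the IN Carta identifying columns added in presets.py below). A small sketch of the resulting SELECT text, using made-up column entries:

```python
# Quoting the column name lets names with spaces (e.g. "OBJECT ID") survive the
# CAST while keeping the original name in the output.
columns = [
    {"column_name": "OBJECT ID", "column_dtype": "BIGINT"},
    {"column_name": "Nuclei_AreaShape_Area", "column_dtype": "DOUBLE"},
]

select_columns = ",".join(
    f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
    for column in columns
)
print(f"SELECT {select_columns} FROM source_table")
```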
@@ -395,14 +416,28 @@ def _prepend_column_name(
         Path to the modified file.
     """
 
+    import logging
     import pathlib
 
     import pyarrow.parquet as parquet
 
-    from cytotable.…
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
+
+    logger = logging.getLogger(__name__)
 
     targets = tuple(metadata) + tuple(compartments)
 
+    # if we have no targets or metadata to work from, return the table unchanged
+    if len(targets) == 0:
+        logger.warning(
+            msg=(
+                "Skipping column name prepend operations"
+                "because no compartments or metadata were provided."
+            )
+        )
+        return table_path
+
     table = parquet.read_table(
         source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     )
@@ -484,7 +519,7 @@ def _prepend_column_name(
         updated_column_names.append(column_name)
 
     # perform table column name updates
-    …
+    _write_parquet_table_with_metadata(
         table=table.rename_columns(updated_column_names), where=table_path
     )
 
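For context, a rough sketch of the prepend idea using pyarrow's `rename_columns`; the real `_prepend_column_name` applies additional rules for metadata versus compartment handling, so this only illustrates the mechanism (and the new early return means an empty target list now leaves the table untouched). Names and values are made up:

```python
import pyarrow as pa

# Compartment tables get their table name prefixed onto non-identifying columns.
table = pa.table({"AreaShape_Area": [1.0, 2.0], "ImageNumber": [1, 1]})
compartment, identifying = "Cells", {"ImageNumber"}

renamed = table.rename_columns(
    [
        name if name in identifying else f"{compartment}_{name}"
        for name in table.column_names
    ]
)
print(renamed.column_names)  # ['Cells_AreaShape_Area', 'ImageNumber']
```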
@@ -549,13 +584,18 @@ def _concat_source_group(
         Updated dictionary containing concatenated sources.
     """
 
+    import errno
     import pathlib
 
     import pyarrow as pa
     import pyarrow.parquet as parquet
 
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import …
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -585,7 +625,9 @@ def _concat_source_group(
         destination_path.parent.mkdir(parents=True, exist_ok=True)
 
         # build the schema for concatenation writer
-        writer_schema = pa.schema(common_schema)
+        writer_schema = pa.schema(common_schema).with_metadata(
+            CYTOTABLE_DEFAULT_PARQUET_METADATA
+        )
 
         # build a parquet file writer which will be used to append files
         # as a single concatted parquet file, referencing the first file's schema
@@ -623,7 +665,7 @@ def _concat_source_group(
                 pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
             except OSError as os_err:
                 # raise only if we don't have a dir not empty errno
-                if os_err.errno != …
+                if os_err.errno != errno.ENOTEMPTY:
                     raise
 
     # return the concatted parquet filename
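A minimal sketch of the concatenation pattern this function uses, with the writer schema now carrying the default CytoTable metadata (filenames and values are made up):

```python
import pyarrow as pa
from pyarrow import parquet

# One ParquetWriter is opened with a shared schema and each chunked file is
# appended into a single output file; the schema metadata tags the producer.
metadata = {"data-producer": "https://github.com/cytomining/CytoTable"}
schema = pa.schema([("value", pa.float64())]).with_metadata(metadata)

chunk_files = ["chunk-0.parquet", "chunk-1.parquet"]
for index, path in enumerate(chunk_files):
    parquet.write_table(pa.table({"value": [float(index)]}, schema=schema), path)

with parquet.ParquetWriter("concatenated.parquet", schema) as writer:
    for path in chunk_files:
        writer.write_table(parquet.read_table(path))

print(parquet.read_metadata("concatenated.parquet").num_rows)  # 2
```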
@@ -632,75 +674,51 @@ def _concat_source_group(
     return concatted
 
 
-@python_app
-def …
+@python_app()
+def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
-    … (2 lines not captured in this view)
-    chunk_size: int,
-) -> List[List[Dict[str, Any]]]:
+    joins: str,
+) -> str:
     """
-    …
+    Prepare join SQL statement with actual locations of data based on the sources.
 
     Args:
-        sources: Dict[List[Dict[str, Any]]]:
+        sources: Dict[str, List[Dict[str, Any]]]:
             Grouped datasets of files which will be used by other functions.
-            … (4 lines not captured in this view)
-        chunk_size: int:
-            Size of join chunks which is used to limit data size during join ops.
+            Includes the metadata concerning location of actual data.
+        joins: str:
+            DuckDB-compatible SQL which will be used to perform the join
+            operations using the join_group keys as a reference.
 
     Returns:
-        … (2 lines not captured in this view)
+        str:
+            String representing the SQL to be used in later join work.
     """
-
     import pathlib
 
-    … (7 lines not captured in this view)
-        first_result = source
-        break
-
-    # gather the workflow result for basis if it's not yet returned
-    basis = first_result
-
-    # read only the table's chunk_columns
-    join_column_rows = parquet.read_table(
-        source=basis[0]["table"],
-        columns=list(chunk_columns),
-        memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-    ).to_pylist()
+    # replace with real location of sources for join sql
+    for key, val in sources.items():
+        if pathlib.Path(key).stem.lower() in joins.lower():
+            joins = joins.replace(
+                f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                str([str(table) for table in val[0]["table"]]),
+            )
 
-
-    return [
-        join_column_rows[i : i + chunk_size]
-        for i in range(0, len(join_column_rows), chunk_size)
-    ]
+    return joins
 
 
 @python_app
 def _join_source_chunk(
-    sources: Dict[str, List[Dict[str, Any]]],
     dest_path: str,
     joins: str,
-    … (line not captured in this view)
+    chunk_size: int,
+    offset: int,
     drop_null: bool,
 ) -> str:
     """
     Join sources based on join group keys (group of specific join column values)
 
     Args:
-        sources: Dict[str, List[Dict[str, Any]]]:
-            Grouped datasets of files which will be used by other functions.
-            Includes the metadata concerning location of actual data.
         dest_path: str:
             Destination path to write file-based content.
         joins: str:
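A standalone sketch of the placeholder substitution `_prepare_join_sql` performs, using made-up paths and a simplified join statement:

```python
import pathlib

# Preset join SQL refers to e.g. 'cytoplasm.parquet'; the task swaps in the list
# of chunked parquet files actually produced for that compartment.
sources = {
    "Cytoplasm.csv": [
        {"table": ["/tmp/dest/cytoplasm/table-1.parquet", "/tmp/dest/cytoplasm/table-2.parquet"]}
    ],
}
joins = "SELECT * FROM read_parquet('cytoplasm.parquet')"

for key, val in sources.items():
    if pathlib.Path(key).stem.lower() in joins.lower():
        joins = joins.replace(
            f"'{pathlib.Path(key).stem.lower()}.parquet'",
            str([str(table) for table in val[0]["table"]]),
        )

print(joins)
# SELECT * FROM read_parquet(['/tmp/dest/cytoplasm/table-1.parquet', '/tmp/dest/cytoplasm/table-2.parquet'])
```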
@@ -722,54 +740,20 @@ def _join_source_chunk(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import _duckdb_reader
-
-    # replace with real location of sources for join sql
-    for key, val in sources.items():
-        if pathlib.Path(key).stem.lower() in joins.lower():
-            joins = joins.replace(
-                f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
-                str([str(table) for table in val[0]["table"]]),
-            )
-
-    # update the join groups to include unique values per table
-    updated_join_group = []
-    for key in sources.keys():
-        updated_join_group.extend(
-            [
-                {
-                    f"{str(pathlib.Path(key).stem)}.{join_key}": val
-                    for join_key, val in chunk.items()
-                }
-                for chunk in join_group
-            ]
-        )
-
-    # form where clause for sql joins to filter the results
-    joins += (
-        "WHERE ("
-        + ") OR (".join(
-            [
-                " AND ".join(
-                    [
-                        # create groups of join column filters where values always
-                        # are expected to equal those within the join_group together
-                        f"{join_column} = {join_column_value}"
-                        if not isinstance(join_column_value, str)
-                        # account for string values
-                        else (f"{join_column} = " f"'{join_column_value}'")
-                        for join_column, join_column_value in chunk.items()
-                    ]
-                )
-                for chunk in updated_join_group
-            ]
-        )
-        + ")"
-    )
+    from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
+    # Attempt to read the data to parquet file
+    # using duckdb for extraction and pyarrow for
+    # writing data to a parquet file.
+    # read data with chunk size + offset
+    # and export to parquet
     with _duckdb_reader() as ddb_reader:
-        … (2 lines not captured in this view)
+        result = ddb_reader.execute(
+            f"""
+            {joins}
+            LIMIT {chunk_size} OFFSET {offset}
+            """
+        ).arrow()
 
     # drop nulls if specified
     if drop_null:
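A minimal sketch of the LIMIT/OFFSET paging pattern that replaces the old WHERE-clause join groups, shown directly against DuckDB with a toy table:

```python
import duckdb
import pyarrow as pa

# The full join SQL is paged through with LIMIT/OFFSET so that each task only
# materializes chunk_size rows at a time.
con = duckdb.connect()
con.execute("CREATE TABLE example AS SELECT * FROM range(10) t(i)")

joins = "SELECT * FROM example"
chunk_size = 4

for offset in range(0, 10, chunk_size):
    result: pa.Table = con.execute(
        f"{joins} LIMIT {chunk_size} OFFSET {offset}"
    ).arrow()
    print(offset, result.num_rows)  # 0 4 / 4 4 / 8 2
```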
@@ -800,7 +784,7 @@ def _join_source_chunk(
     )
 
     # write the result
-    …
+    _write_parquet_table_with_metadata(
         table=result,
         where=result_file_path,
     )
@@ -840,7 +824,11 @@ def _concat_join_sources(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.…
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -854,7 +842,7 @@ def _concat_join_sources(
         shutil.rmtree(path=dest_path)
 
     # write the concatted result as a parquet file
-    …
+    _write_parquet_table_with_metadata(
         table=pa.concat_tables(
             tables=[
                 parquet.read_table(
@@ -869,7 +857,9 @@ def _concat_join_sources(
     # build a parquet file writer which will be used to append files
     # as a single concatted parquet file, referencing the first file's schema
     # (all must be the same schema)
-    writer_schema = parquet.read_schema(join_sources[0])
+    writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
+        CYTOTABLE_DEFAULT_PARQUET_METADATA
+    )
     with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
         for table_path in join_sources:
             writer.write_table(
@@ -1012,7 +1002,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     concat: bool,
     join: bool,
     joins: Optional[str],
-    chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
     chunk_size: Optional[int],
     infer_common_schema: bool,
     drop_null: bool,
@@ -1048,8 +1037,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
            Whether to join the compartment data together into one dataset.
        joins: str:
            DuckDB-compatible SQL which will be used to perform the join operations.
-        chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
-            Column names which appear in all compartments to use when performing join.
        chunk_size: Optional[int],
            Size of join chunks which is used to limit data size during join ops.
        infer_common_schema: bool: (Default value = True)
@@ -1074,7 +1061,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     from cytotable.convert import (
         _concat_join_sources,
         _concat_source_group,
-        _get_join_chunks,
         _get_table_chunk_offsets,
         _infer_source_group_common_schema,
         _join_source_chunk,
@@ -1161,7 +1147,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                     chunk_size=chunk_size,
                     offset=offset,
                     dest_path=expanded_dest_path,
-                    data_type_cast_map=data_type_cast_map,
                 ),
                 source_group_name=source_group_name,
                 identifying_columns=identifying_columns,
@@ -1210,6 +1195,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     # conditional section for merging
     # note: join implies a concat, but concat does not imply a join
     if join:
+        prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+
         # map joined results based on the join groups gathered above
         # note: after mapping we end up with a list of strings (task returns str)
         join_sources_result = [
@@ -1217,21 +1204,18 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 # gather the result of concatted sources prior to
                 # join group merging as each mapped task run will need
                 # full concat results
-                sources=results,
                 dest_path=expanded_dest_path,
-                joins=…
-                … (2 lines not captured in this view)
+                joins=prepared_joins_sql,
+                chunk_size=chunk_size,
+                offset=offset,
                 drop_null=drop_null,
             ).result()
             # create join group for querying the concatenated
             # data in order to perform memory-safe joining
             # per user chunk size specification.
-            for …
-            … (line not captured in this view)
-                chunk_columns=chunk_columns,
+            for offset in _get_table_chunk_offsets(
+                sql_stmt=prepared_joins_sql,
                 chunk_size=chunk_size,
-                metadata=metadata,
             ).result()
         ]
 
@@ -1259,7 +1243,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     concat: bool = True,
     join: bool = True,
     joins: Optional[str] = None,
-    chunk_columns: Optional[Union[List[str], Tuple[str, ...]]] = None,
     chunk_size: Optional[int] = None,
     infer_common_schema: bool = True,
     drop_null: bool = False,
@@ -1303,9 +1286,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
            Whether to join the compartment data together into one dataset
        joins: str: (Default value = None):
            DuckDB-compatible SQL which will be used to perform the join operations.
-        chunk_columns: Optional[Union[List[str], Tuple[str, ...]]]
-            (Default value = None)
-            Column names which appear in all compartments to use when performing join
        chunk_size: Optional[int] (Default value = None)
            Size of join chunks which is used to limit data size during join ops
        infer_common_schema: bool: (Default value = True)
@@ -1402,11 +1382,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         else identifying_columns
     )
     joins = cast(str, config[preset]["CONFIG_JOINS"]) if joins is None else joins
-    chunk_columns = (
-        cast(list, config[preset]["CONFIG_CHUNK_COLUMNS"])
-        if chunk_columns is None
-        else chunk_columns
-    )
     chunk_size = (
         cast(int, config[preset]["CONFIG_CHUNK_SIZE"])
         if chunk_size is None
@@ -1425,7 +1400,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         concat=concat,
         join=join,
         joins=joins,
-        chunk_columns=chunk_columns,
         chunk_size=chunk_size,
         infer_common_schema=infer_common_schema,
         drop_null=drop_null,
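Taken together, these convert.py changes remove `chunk_columns` from the public API; join chunking is handled through `chunk_size` and the prepared join SQL instead. A hedged example call follows; the import path and the parameters other than those visible in this diff (`source_path`, `dest_path`, `dest_datatype`, `preset`) follow the project's existing signature, and the paths are hypothetical:

```python
from cytotable import convert  # as used in the project's documentation

result = convert(
    source_path="./examples/data/cellprofiler/csv_single",  # hypothetical path
    dest_path="./example_dest.parquet",                     # hypothetical path
    dest_datatype="parquet",
    preset="cellprofiler_csv",
    chunk_size=1000,
    # chunk_columns=("Metadata_ImageNumber",),  # accepted in 0.0.2, removed in 0.0.4
)
```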
{cytotable-0.0.2 → cytotable-0.0.4}/cytotable/presets.py

@@ -1,5 +1,5 @@
 """
-Presets for common …
+Presets for common CytoTable configurations.
 """
 
 config = {
@@ -26,8 +26,6 @@ config = {
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
         "CONFIG_CHUNK_SIZE": 1000,
-        # chunking columns to use along with chunk size for join operations
-        "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
@@ -73,8 +71,6 @@ config = {
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
         "CONFIG_CHUNK_SIZE": 1000,
-        # chunking columns to use along with chunk size for join operations
-        "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
@@ -126,8 +122,6 @@ config = {
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
         "CONFIG_CHUNK_SIZE": 1000,
-        # chunking columns to use along with chunk size for join operations
-        "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
@@ -181,8 +175,6 @@ config = {
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
         "CONFIG_CHUNK_SIZE": 1000,
-        # chunking columns to use along with chunk size for join operations
-        "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
@@ -212,7 +204,35 @@ config = {
            AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
         """,
     },
+    "in-carta": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "in-carta": "v1.17.0412545",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": tuple(),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": tuple(),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "OBJECT ID",
+            "Row",
+            "Column",
+            "FOV",
+            "WELL LABEL",
+            "Z",
+            "T",
+        ),
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": "",
+    },
 }
 """
-Configuration presets for …
+Configuration presets for CytoTable
 """
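A hedged sketch of using the new `in-carta` preset: because its compartment and metadata name tuples are empty, a `source_datatype` has to be supplied explicitly (see the `DatatypeException` added in sources.py below). The import path follows the project's documentation and the paths are hypothetical:

```python
from cytotable import convert

result = convert(
    source_path="./examples/data/in-carta/",  # hypothetical path
    dest_path="./in_carta.parquet",           # hypothetical path
    dest_datatype="parquet",
    source_datatype="csv",                    # required: the preset defines no compartment/metadata names
    preset="in-carta",
    join=False,                               # the preset ships an empty CONFIG_JOINS
)
```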
{cytotable-0.0.2 → cytotable-0.0.4}/cytotable/sources.py

@@ -47,6 +47,7 @@ def _build_path(
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
     targets: List[str],
+    source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
     Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
            Either a directory path to seek filepaths within or a path directly to a file.
        targets: List[str]:
            Compartment and metadata names to seek within the provided path.
+        source_datatype: Optional[str]: (Default value = None)
+            The source datatype (extension) to use for reading the tables.
 
     Returns:
        Dict[str, List[Dict[str, Any]]]
            Data structure which groups related files based on the compartments.
     """
 
+    import os
     import pathlib
 
     from cloudpathlib import AnyPath
 
-    from cytotable.exceptions import NoInputDataException
+    from cytotable.exceptions import DatatypeException, NoInputDataException
     from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader
 
+    if (targets is None or targets == []) and source_datatype is None:
+        raise DatatypeException(
+            f"A source_datatype must be specified when using undefined compartments and metadata names."
+        )
+
     # gathers files from provided path using compartments + metadata as a filter
     sources = [
         # build source_paths for all files
@@ -85,6 +94,7 @@ def _get_source_filepaths(
         # ensure the subpaths meet certain specifications
         if (
             targets is None
+            or targets == []
            # checks for name of the file from targets (compartment + metadata names)
            or str(subpath.stem).lower() in [target.lower() for target in targets]
            # checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(
 
     # group files together by similar filename for later data operations
     grouped_sources = {}
-    … (15 lines not captured in this view)
+
+    # if we have no targets, create a single group inferred from a common prefix and suffix
+    # note: this may apply for scenarios where no compartments or metadata are
+    # provided as input to CytoTable operations.
+    if targets is None or targets == []:
+        # gather a common prefix to use for the group
+        common_prefix = os.path.commonprefix(
+            [
+                source["source_path"].stem
+                for source in sources
+                if source["source_path"].suffix == f".{source_datatype}"
+            ]
+        )
+        grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+    # otherwise, use the unique names in the paths to determine source grouping
+    else:
+        for unique_source in set(source["source_path"].name for source in sources):
+            grouped_sources[unique_source.capitalize()] = [
+                # case for files besides sqlite
+                source if source["source_path"].suffix.lower() != ".sqlite"
+                # if we have sqlite entries, update the source_path to the parent
+                # (the parent table database file) as grouped key name will now
+                # encapsulate the table name details.
+                else {
+                    "source_path": source["source_path"].parent,
+                    "table_name": source["table_name"],
+                }
+                for source in sources
+                # focus only on entries which include the unique_source name
+                if source["source_path"].name == unique_source
+            ]
 
     return grouped_sources
 
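A standalone sketch of the new no-target grouping path built on `os.path.commonprefix` (filenames are made up):

```python
import os

# When no compartment or metadata names are supplied, files sharing the requested
# extension are grouped under a key derived from their common filename prefix.
sources = [
    {"source_path_stem": "experiment_plate1_well_A01", "suffix": ".csv"},
    {"source_path_stem": "experiment_plate1_well_A02", "suffix": ".csv"},
]
source_datatype = "csv"

common_prefix = os.path.commonprefix(
    [s["source_path_stem"] for s in sources if s["suffix"] == f".{source_datatype}"]
)
print(f"{common_prefix}.{source_datatype}")  # experiment_plate1_well_A0.csv
```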
@@ -190,7 +217,7 @@ def _infer_source_datatype(
         raise DatatypeException(
             (
                 f"Unable to find source datatype {source_datatype} "
-                "within files. Detected datatypes: {suffixes}"
+                f"within files. Detected datatypes: {suffixes}"
             )
         )
 
@@ -270,7 +297,9 @@ def _gather_sources(
     source_path = _build_path(path=source_path, **kwargs)
 
     # gather filepaths which will be used as the basis for this work
-    sources = _get_source_filepaths(…
+    sources = _get_source_filepaths(
+        path=source_path, targets=targets, source_datatype=source_datatype
+    )
 
     # infer or validate the source datatype based on source filepaths
     source_datatype = _infer_source_datatype(
{cytotable-0.0.2 → cytotable-0.0.4}/cytotable/utils.py

@@ -3,83 +3,22 @@ Utility functions for CytoTable
 """
 
 import logging
-import multiprocessing
 import os
 import pathlib
-from typing import Any, Dict, Union, cast
+from typing import Any, Dict, Optional, Union, cast
 
 import duckdb
 import parsl
+import pyarrow as pa
 from cloudpathlib import AnyPath, CloudPath
 from cloudpathlib.exceptions import InvalidPrefixError
 from parsl.app.app import AppBase
 from parsl.config import Config
-from parsl.errors import …
+from parsl.errors import NoDataFlowKernelError
 from parsl.executors import HighThroughputExecutor
 
 logger = logging.getLogger(__name__)
 
-# read max threads from environment if necessary
-# max threads will be used with default Parsl config and Duckdb
-MAX_THREADS = (
-    multiprocessing.cpu_count()
-    if "CYTOTABLE_MAX_THREADS" not in os.environ
-    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
-)
-
-# enables overriding default memory mapping behavior with pyarrow memory mapping
-CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
-    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
-)
-
-DDB_DATA_TYPE_SYNONYMS = {
-    "real": ["float32", "float4", "float"],
-    "double": ["float64", "float8", "numeric", "decimal"],
-    "integer": ["int32", "int4", "int", "signed"],
-    "bigint": ["int64", "int8", "long"],
-}
-
-# A reference dictionary for SQLite affinity and storage class types
-# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
-SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
-    "integer": [
-        "int",
-        "integer",
-        "tinyint",
-        "smallint",
-        "mediumint",
-        "bigint",
-        "unsigned big int",
-        "int2",
-        "int8",
-    ],
-    "text": [
-        "character",
-        "varchar",
-        "varying character",
-        "nchar",
-        "native character",
-        "nvarchar",
-        "text",
-        "clob",
-    ],
-    "blob": ["blob"],
-    "real": [
-        "real",
-        "double",
-        "double precision",
-        "float",
-    ],
-    "numeric": [
-        "numeric",
-        "decimal",
-        "boolean",
-        "date",
-        "datetime",
-    ],
-}
-
-
 # reference the original init
 original_init = AppBase.__init__
 
@@ -108,15 +47,10 @@ def _parsl_loaded() -> bool:
     try:
         # try to reference Parsl dataflowkernel
         parsl.dfk()
-    except …
-        # if we detect a Parsl …
+    except NoDataFlowKernelError:
+        # if we detect a Parsl NoDataFlowKernelError
         # return false to indicate parsl config has not yet been loaded.
-
-        return False
-
-    # otherwise we raise other ConfigurationError's
-    else:
-        raise
+        return False
 
     # otherwise we indicate parsl config has already been loaded
     return True
@@ -203,6 +137,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
        duckdb.DuckDBPyConnection
     """
 
+    import duckdb
+
+    from cytotable.constants import MAX_THREADS
+
     return duckdb.connect().execute(
         # note: we use an f-string here to
         # dynamically configure threads as appropriate
@@ -257,20 +195,25 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
-    from cytotable.utils import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
 
     # open sqlite3 connection
     with sqlite3.connect(source_path) as conn:
         cursor = conn.cursor()
 
-        # …
+        # Gather table column details including datatype.
+        # Note: uses SQLite pragma for table information.
+        # See the following for more information:
+        # https://sqlite.org/pragma.html#pragma_table_info
         cursor.execute(
             f"""
             SELECT :table_name as table_name,
                    name as column_name,
                    type as column_type
-            FROM pragma_table_info(:table_name)
+            FROM pragma_table_info(:table_name)
+            /* explicit column ordering by 'cid' */
+            ORDER BY cid ASC;
             """,
             {"table_name": table_name},
         )
@@ -389,6 +332,9 @@ def _arrow_type_cast_if_specified(
        Dict[str, str]
            A potentially data type updated dictionary of column information
     """
+
+    from cytotable.constants import DDB_DATA_TYPE_SYNONYMS
+
     # for casting to new float type
     if "float" in data_type_cast_map.keys() and column["column_dtype"] in [
         "REAL",
@@ -458,3 +404,56 @@ def _expand_path(
     modifed_path = modifed_path.expanduser()
 
     return modifed_path.resolve()
+
+
+def _get_cytotable_version() -> str:
+    """
+    Seeks the current version of CytoTable using either pkg_resources
+    or dunamai to determine the current version being used.
+
+    Returns:
+        str
+            A string representing the version of CytoTable currently being used.
+    """
+
+    try:
+        # attempt to gather the development version from dunamai
+        # for scenarios where cytotable from source is used.
+        import dunamai
+
+        return dunamai.Version.from_any_vcs().serialize()
+    except (RuntimeError, ModuleNotFoundError):
+        # else grab a static version from __init__.py
+        # for scenarios where the built/packaged cytotable is used.
+        import cytotable
+
+        return cytotable.__version__
+
+
+def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
+    """
+    Adds metadata to parquet output from CytoTable.
+    Note: this mostly wraps pyarrow.parquet.write_table
+    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
+
+    Args:
+        table: pa.Table:
+            Pyarrow table to be serialized as parquet table.
+        **kwargs: Any:
+            kwargs provided to this function roughly align with
+            pyarrow.parquet.write_table. The following might be
+            examples of what to expect here:
+            - where: str or pyarrow.NativeFile
+    """
+
+    from pyarrow import parquet
+
+    from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
+    from cytotable.utils import _get_cytotable_version
+
+    parquet.write_table(
+        table=table.replace_schema_metadata(
+            metadata=CYTOTABLE_DEFAULT_PARQUET_METADATA
+        ),
+        **kwargs,
+    )
{cytotable-0.0.2 → cytotable-0.0.4}/pyproject.toml

@@ -1,6 +1,7 @@
 [tool.poetry]
 name = "CytoTable"
-version = "0.0.2"
+# note: version data is maintained by poetry-dynamic-versioning (do not edit)
+version = "0.0.4"
 description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
 authors = ["Cytomining Community"]
 license = "BSD-3-Clause License"
@@ -10,14 +11,25 @@ repository = "https://github.com/cytomining/CytoTable"
 documentation = "https://cytomining.github.io/CytoTable/"
 keywords = ["python", "cellprofiler","single-cell-analysis", "way-lab"]
 
+[tool.poetry-dynamic-versioning]
+enable = false
+style = "pep440"
+vcs = "git"
+
+[build-system]
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
+build-backend = "poetry_dynamic_versioning.backend"
+
+[tool.setuptools_scm]
+
 [tool.poetry.dependencies]
 python = ">=3.8,<3.13"
-pyarrow = "…
+pyarrow = ">=13.0.0"
 cloudpathlib = {extras = ["all"], version = "^0.15.0"}
-duckdb = "…
-parsl = ">=2023.9.…
+duckdb = ">=0.8.0"
+parsl = ">=2023.9.25"
 
-[tool.poetry.dev…
+[tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"
 pytest-cov = "^4.1.0"
 Sphinx = "^6.0.0"
@@ -27,10 +39,7 @@ moto = {extras = ["s3", "server"], version = "^4.0.0"}
 cffconvert = "^2.0.0"
 cytominer-database = "^0.3.4"
 pycytominer = { git = "https://github.com/cytomining/pycytominer.git", rev = "09b2c79aa94908e3520f0931a844db4fba7fd3fb" }
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+dunamai = "^1.19.0"
 
 [tool.vulture]
 min_confidence = 80
{cytotable-0.0.2 → cytotable-0.0.4}/readme.md

@@ -2,20 +2,31 @@
 
 # CytoTable
 
-![dataflow diagram](…)
+![dataflow diagram](…)
 _Diagram showing data flow relative to this project._
 
 ## Summary
 
-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
 CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
 The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
 
+The name for the project is inspired from:
+
+- __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+- __Table__:
+  - "1. Furniture with a top surface to accommodate a variety of uses."
+  - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
 ## Installation
 
-Install CytoTable
+Install CytoTable from [PyPI](https://pypi.org/) or from source:
 
 ```shell
+# install from pypi
+pip install cytotable
+
+# install directly from source
 pip install git+https://github.com/cytomining/CytoTable.git
 ```
 
File without changes
|
File without changes
|