CytoTable 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +4 -0
- cytotable/constants.py +74 -0
- cytotable/convert.py +48 -15
- cytotable/presets.py +30 -2
- cytotable/sources.py +47 -18
- cytotable/utils.py +70 -66
- {cytotable-0.0.3.dist-info → cytotable-0.0.4.dist-info}/METADATA +11 -3
- cytotable-0.0.4.dist-info/RECORD +11 -0
- {cytotable-0.0.3.dist-info → cytotable-0.0.4.dist-info}/WHEEL +1 -1
- cytotable-0.0.3.dist-info/RECORD +0 -10
- {cytotable-0.0.3.dist-info → cytotable-0.0.4.dist-info}/LICENSE +0 -0
cytotable/__init__.py
CHANGED
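No hunk is rendered for this file (+4 -0). Judging from the new _get_cytotable_version() fallback in cytotable/utils.py below, which imports cytotable and returns cytotable.__version__, the four added lines most likely define a static __version__ attribute. A hypothetical check, assuming that reading of the change:

    import cytotable

    # static fallback version string assumed to be added in __init__.py;
    # used by _get_cytotable_version() when dunamai is unavailable
    print(cytotable.__version__)  # e.g. "0.0.4"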
cytotable/constants.py
ADDED
@@ -0,0 +1,74 @@
+"""
+CytoTable: constants - storing various constants to be used throughout cytotable.
+"""
+
+import multiprocessing
+import os
+from typing import cast
+
+from cytotable.utils import _get_cytotable_version
+
+# read max threads from environment if necessary
+# max threads will be used with default Parsl config and Duckdb
+MAX_THREADS = (
+    multiprocessing.cpu_count()
+    if "CYTOTABLE_MAX_THREADS" not in os.environ
+    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
+)
+
+# enables overriding default memory mapping behavior with pyarrow memory mapping
+CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
+    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
+)
+
+DDB_DATA_TYPE_SYNONYMS = {
+    "real": ["float32", "float4", "float"],
+    "double": ["float64", "float8", "numeric", "decimal"],
+    "integer": ["int32", "int4", "int", "signed"],
+    "bigint": ["int64", "int8", "long"],
+}
+
+# A reference dictionary for SQLite affinity and storage class types
+# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
+SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
+    "integer": [
+        "int",
+        "integer",
+        "tinyint",
+        "smallint",
+        "mediumint",
+        "bigint",
+        "unsigned big int",
+        "int2",
+        "int8",
+    ],
+    "text": [
+        "character",
+        "varchar",
+        "varying character",
+        "nchar",
+        "native character",
+        "nvarchar",
+        "text",
+        "clob",
+    ],
+    "blob": ["blob"],
+    "real": [
+        "real",
+        "double",
+        "double precision",
+        "float",
+    ],
+    "numeric": [
+        "numeric",
+        "decimal",
+        "boolean",
+        "date",
+        "datetime",
+    ],
+}
+
+CYTOTABLE_DEFAULT_PARQUET_METADATA = {
+    "data-producer": "https://github.com/cytomining/CytoTable",
+    "data-producer-version": str(_get_cytotable_version()),
+}
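Both environment variables above are read once at module import, so they must be set before cytotable is imported. A minimal sketch exercising the two settings:

    import os

    # cap DuckDB/Parsl thread usage and disable Arrow memory mapping;
    # both values are read once in cytotable.constants at import time
    os.environ["CYTOTABLE_MAX_THREADS"] = "4"
    os.environ["CYTOTABLE_ARROW_USE_MEMORY_MAPPING"] = "0"

    from cytotable.constants import (
        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
        MAX_THREADS,
    )

    print(MAX_THREADS)  # 4
    print(CYTOTABLE_ARROW_USE_MEMORY_MAPPING)  # False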
cytotable/convert.py
CHANGED
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
             segment_type as column_dtype
         FROM pragma_storage_info('column_details')
         /* avoid duplicate entries in the form of VALIDITY segment_types */
-        WHERE segment_type != 'VALIDITY'
+        WHERE segment_type != 'VALIDITY'
+        /* explicitly order the columns by their id to avoid inconsistent results */
+        ORDER BY column_id ASC;
     """

     # attempt to read the data to parquet from duckdb
@@ -302,7 +304,11 @@ def _source_chunk_to_parquet(
     from cloudpathlib import AnyPath
     from pyarrow import parquet

-    from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
+    from cytotable.utils import (
+        _duckdb_reader,
+        _sqlite_mixed_type_query_to_parquet,
+        _write_parquet_table_with_metadata,
+    )

     # attempt to build dest_path
     source_dest_path = (
@@ -315,7 +321,7 @@ def _source_chunk_to_parquet(
     select_columns = ",".join(
         [
             # here we cast the column to the specified type ensure the colname remains the same
-            f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+            f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
             for column in source["columns"]
         ]
     )
@@ -339,7 +345,7 @@ def _source_chunk_to_parquet(
         # read data with chunk size + offset
         # and export to parquet
         with _duckdb_reader() as ddb_reader:
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                 table=ddb_reader.execute(
                     f"""
                     {base_query}
@@ -358,7 +364,7 @@ def _source_chunk_to_parquet(
             "Mismatch Type Error" in str(e)
             and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
         ):
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
                 # data for special cases where column and value types
                 # may not align (which is valid functionality in SQLite).
@@ -410,14 +416,28 @@ def _prepend_column_name(
             Path to the modified file.
     """

+    import logging
     import pathlib

     import pyarrow.parquet as parquet

-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
+
+    logger = logging.getLogger(__name__)

     targets = tuple(metadata) + tuple(compartments)

+    # if we have no targets or metadata to work from, return the table unchanged
+    if len(targets) == 0:
+        logger.warning(
+            msg=(
+                "Skipping column name prepend operations"
+                "because no compartments or metadata were provided."
+            )
+        )
+        return table_path
+
     table = parquet.read_table(
         source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     )
@@ -499,7 +519,7 @@ def _prepend_column_name(
             updated_column_names.append(column_name)

     # perform table column name updates
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
         table=table.rename_columns(updated_column_names), where=table_path
     )

@@ -564,13 +584,18 @@ def _concat_source_group(
             Updated dictionary containing concatenated sources.
     """

+    import errno
     import pathlib

     import pyarrow as pa
     import pyarrow.parquet as parquet

+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata

     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -600,7 +625,9 @@ def _concat_source_group(
         destination_path.parent.mkdir(parents=True, exist_ok=True)

         # build the schema for concatenation writer
-        writer_schema = pa.schema(common_schema)
+        writer_schema = pa.schema(common_schema).with_metadata(
+            CYTOTABLE_DEFAULT_PARQUET_METADATA
+        )

         # build a parquet file writer which will be used to append files
         # as a single concatted parquet file, referencing the first file's schema
@@ -638,7 +665,7 @@ def _concat_source_group(
                 pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
             except OSError as os_err:
                 # raise only if we don't have a dir not empty errno
-                if os_err.errno !=
+                if os_err.errno != errno.ENOTEMPTY:
                     raise

     # return the concatted parquet filename
@@ -713,7 +740,7 @@ def _join_source_chunk(

     import pyarrow.parquet as parquet

-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -757,7 +784,7 @@ def _join_source_chunk(
    )

    # write the result
-   parquet.write_table(
+   _write_parquet_table_with_metadata(
        table=result,
        where=result_file_path,
    )
@@ -797,7 +824,11 @@ def _concat_join_sources(

    import pyarrow.parquet as parquet

-   from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+   from cytotable.constants import (
+       CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+       CYTOTABLE_DEFAULT_PARQUET_METADATA,
+   )
+   from cytotable.utils import _write_parquet_table_with_metadata

    # remove the unjoined concatted compartments to prepare final dest_path usage
    # (we now have joined results)
@@ -811,7 +842,7 @@ def _concat_join_sources(
        shutil.rmtree(path=dest_path)

    # write the concatted result as a parquet file
-   parquet.write_table(
+   _write_parquet_table_with_metadata(
        table=pa.concat_tables(
            tables=[
                parquet.read_table(
@@ -826,7 +857,9 @@ def _concat_join_sources(
    # build a parquet file writer which will be used to append files
    # as a single concatted parquet file, referencing the first file's schema
    # (all must be the same schema)
-   writer_schema = parquet.read_schema(join_sources[0])
+   writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
+       CYTOTABLE_DEFAULT_PARQUET_METADATA
+   )
    with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
        for table_path in join_sources:
            writer.write_table(
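The quoting added to the CAST expression above matters once column names contain spaces, as the IN Carta identifiers in presets.py below do (for example "OBJECT ID"). A standalone DuckDB session with a hypothetical table illustrates why:

    import duckdb

    with duckdb.connect() as ddb:
        # a column name containing a space, as found in IN Carta output
        ddb.execute('CREATE TABLE demo ("OBJECT ID" VARCHAR)')
        ddb.execute("INSERT INTO demo VALUES ('42')")

        # unquoted, CAST(OBJECT ID AS BIGINT) is a parse error;
        # double-quoted identifiers keep the full name intact
        rows = ddb.execute(
            'SELECT CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID" FROM demo'
        ).fetchall()

    print(rows)  # [(42,)]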
cytotable/presets.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Presets for common
+Presets for common CytoTable configurations.
 """

 config = {
@@ -204,7 +204,35 @@ config = {
            AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
        """,
    },
+    "in-carta": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "in-carta": "v1.17.0412545",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": tuple(),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": tuple(),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "OBJECT ID",
+            "Row",
+            "Column",
+            "FOV",
+            "WELL LABEL",
+            "Z",
+            "T",
+        ),
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": "",
+    },
 }
 """
-Configuration presets for
+Configuration presets for CytoTable
 """
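With the preset above in place, an IN Carta conversion could look roughly like the following; the keyword arguments are assumptions based on CytoTable's documented convert API, and the paths are hypothetical:

    import cytotable

    result = cytotable.convert(
        source_path="./in_carta_output/",
        dest_path="./in_carta_output.parquet",
        dest_datatype="parquet",
        source_datatype="csv",
        preset="in-carta",
    )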
cytotable/sources.py
CHANGED
@@ -47,6 +47,7 @@ def _build_path(
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
     targets: List[str],
+    source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
     Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
             Either a directory path to seek filepaths within or a path directly to a file.
         targets: List[str]:
             Compartment and metadata names to seek within the provided path.
+        source_datatype: Optional[str]: (Default value = None)
+            The source datatype (extension) to use for reading the tables.

     Returns:
         Dict[str, List[Dict[str, Any]]]
             Data structure which groups related files based on the compartments.
     """

+    import os
     import pathlib

     from cloudpathlib import AnyPath

-    from cytotable.exceptions import NoInputDataException
+    from cytotable.exceptions import DatatypeException, NoInputDataException
     from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader

+    if (targets is None or targets == []) and source_datatype is None:
+        raise DatatypeException(
+            f"A source_datatype must be specified when using undefined compartments and metadata names."
+        )
+
     # gathers files from provided path using compartments + metadata as a filter
     sources = [
         # build source_paths for all files
@@ -85,6 +94,7 @@ def _get_source_filepaths(
             # ensure the subpaths meet certain specifications
             if (
                 targets is None
+                or targets == []
                 # checks for name of the file from targets (compartment + metadata names)
                 or str(subpath.stem).lower() in [target.lower() for target in targets]
                 # checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(

     # group files together by similar filename for later data operations
     grouped_sources = {}
-    for unique_source in set(source["source_path"].name for source in sources):
-        grouped_sources[unique_source.capitalize()] = [
-            # case for files besides sqlite
-            source if source["source_path"].suffix.lower() != ".sqlite"
-            # if we have sqlite entries, update the source_path to the parent
-            # (the parent table database file) as grouped key name will now
-            # encapsulate the table name details.
-            else {
-                "source_path": source["source_path"].parent,
-                "table_name": source["table_name"],
-            }
-            for source in sources
-            # focus only on entries which include the unique_source name
-            if source["source_path"].name == unique_source
-        ]
+
+    # if we have no targets, create a single group inferred from a common prefix and suffix
+    # note: this may apply for scenarios where no compartments or metadata are
+    # provided as input to CytoTable operations.
+    if targets is None or targets == []:
+        # gather a common prefix to use for the group
+        common_prefix = os.path.commonprefix(
+            [
+                source["source_path"].stem
+                for source in sources
+                if source["source_path"].suffix == f".{source_datatype}"
+            ]
+        )
+        grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+    # otherwise, use the unique names in the paths to determine source grouping
+    else:
+        for unique_source in set(source["source_path"].name for source in sources):
+            grouped_sources[unique_source.capitalize()] = [
+                # case for files besides sqlite
+                source if source["source_path"].suffix.lower() != ".sqlite"
+                # if we have sqlite entries, update the source_path to the parent
+                # (the parent table database file) as grouped key name will now
+                # encapsulate the table name details.
+                else {
+                    "source_path": source["source_path"].parent,
+                    "table_name": source["table_name"],
+                }
+                for source in sources
+                # focus only on entries which include the unique_source name
+                if source["source_path"].name == unique_source
+            ]

     return grouped_sources

@@ -190,7 +217,7 @@ def _infer_source_datatype(
         raise DatatypeException(
             (
                 f"Unable to find source datatype {source_datatype} "
-                "within files. Detected datatypes: {suffixes}"
+                f"within files. Detected datatypes: {suffixes}"
             )
         )

@@ -270,7 +297,9 @@ def _gather_sources(
     source_path = _build_path(path=source_path, **kwargs)

     # gather filepaths which will be used as the basis for this work
-    sources = _get_source_filepaths(path=source_path, targets=targets)
+    sources = _get_source_filepaths(
+        path=source_path, targets=targets, source_datatype=source_datatype
+    )

     # infer or validate the source datatype based on source filepaths
     source_datatype = _infer_source_datatype(
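When no compartment or metadata names are provided, the new branch in _get_source_filepaths names its single group from the common stem prefix of the matched files plus the supplied extension. The key construction reduces to os.path.commonprefix (hypothetical filenames):

    import os

    stems = ["experiment_plate1_cells", "experiment_plate1_image"]
    source_datatype = "csv"

    # mirrors the group key built in the no-targets branch
    group_key = f"{os.path.commonprefix(stems)}.{source_datatype}"
    print(group_key)  # experiment_plate1_.csv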
cytotable/utils.py
CHANGED
@@ -3,13 +3,13 @@ Utility functions for CytoTable
 """

 import logging
-import multiprocessing
 import os
 import pathlib
-from typing import Any, Dict, Union, cast
+from typing import Any, Dict, Optional, Union, cast

 import duckdb
 import parsl
+import pyarrow as pa
 from cloudpathlib import AnyPath, CloudPath
 from cloudpathlib.exceptions import InvalidPrefixError
 from parsl.app.app import AppBase
@@ -19,67 +19,6 @@ from parsl.executors import HighThroughputExecutor

 logger = logging.getLogger(__name__)

-# read max threads from environment if necessary
-# max threads will be used with default Parsl config and Duckdb
-MAX_THREADS = (
-    multiprocessing.cpu_count()
-    if "CYTOTABLE_MAX_THREADS" not in os.environ
-    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
-)
-
-# enables overriding default memory mapping behavior with pyarrow memory mapping
-CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
-    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
-)
-
-DDB_DATA_TYPE_SYNONYMS = {
-    "real": ["float32", "float4", "float"],
-    "double": ["float64", "float8", "numeric", "decimal"],
-    "integer": ["int32", "int4", "int", "signed"],
-    "bigint": ["int64", "int8", "long"],
-}
-
-# A reference dictionary for SQLite affinity and storage class types
-# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
-SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
-    "integer": [
-        "int",
-        "integer",
-        "tinyint",
-        "smallint",
-        "mediumint",
-        "bigint",
-        "unsigned big int",
-        "int2",
-        "int8",
-    ],
-    "text": [
-        "character",
-        "varchar",
-        "varying character",
-        "nchar",
-        "native character",
-        "nvarchar",
-        "text",
-        "clob",
-    ],
-    "blob": ["blob"],
-    "real": [
-        "real",
-        "double",
-        "double precision",
-        "float",
-    ],
-    "numeric": [
-        "numeric",
-        "decimal",
-        "boolean",
-        "date",
-        "datetime",
-    ],
-}
-
-
 # reference the original init
 original_init = AppBase.__init__

@@ -198,6 +137,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         duckdb.DuckDBPyConnection
     """

+    import duckdb
+
+    from cytotable.constants import MAX_THREADS
+
     return duckdb.connect().execute(
         # note: we use an f-string here to
         # dynamically configure threads as appropriate
@@ -252,20 +195,25 @@ def _sqlite_mixed_type_query_to_parquet(

     import pyarrow as pa

+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
-    from cytotable.utils import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS

     # open sqlite3 connection
     with sqlite3.connect(source_path) as conn:
         cursor = conn.cursor()

-        #
+        # Gather table column details including datatype.
+        # Note: uses SQLite pragma for table information.
+        # See the following for more information:
+        # https://sqlite.org/pragma.html#pragma_table_info
         cursor.execute(
             f"""
             SELECT :table_name as table_name,
                    name as column_name,
                    type as column_type
-            FROM pragma_table_info(:table_name)
+            FROM pragma_table_info(:table_name)
+            /* explicit column ordering by 'cid' */
+            ORDER BY cid ASC;
             """,
             {"table_name": table_name},
         )
@@ -384,6 +332,9 @@ def _arrow_type_cast_if_specified(
         Dict[str, str]
             A potentially data type updated dictionary of column information
     """
+
+    from cytotable.constants import DDB_DATA_TYPE_SYNONYMS
+
     # for casting to new float type
     if "float" in data_type_cast_map.keys() and column["column_dtype"] in [
         "REAL",
@@ -453,3 +404,56 @@ def _expand_path(
     modifed_path = modifed_path.expanduser()

     return modifed_path.resolve()
+
+
+def _get_cytotable_version() -> str:
+    """
+    Seeks the current version of CytoTable using either pkg_resources
+    or dunamai to determine the current version being used.
+
+    Returns:
+        str
+            A string representing the version of CytoTable currently being used.
+    """
+
+    try:
+        # attempt to gather the development version from dunamai
+        # for scenarios where cytotable from source is used.
+        import dunamai
+
+        return dunamai.Version.from_any_vcs().serialize()
+    except (RuntimeError, ModuleNotFoundError):
+        # else grab a static version from __init__.py
+        # for scenarios where the built/packaged cytotable is used.
+        import cytotable
+
+        return cytotable.__version__
+
+
+def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
+    """
+    Adds metadata to parquet output from CytoTable.
+    Note: this mostly wraps pyarrow.parquet.write_table
+    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
+
+    Args:
+        table: pa.Table:
+            Pyarrow table to be serialized as parquet table.
+        **kwargs: Any:
+            kwargs provided to this function roughly align with
+            pyarrow.parquet.write_table. The following might be
+            examples of what to expect here:
+            - where: str or pyarrow.NativeFile
+    """
+
+    from pyarrow import parquet
+
+    from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
+    from cytotable.utils import _get_cytotable_version
+
+    parquet.write_table(
+        table=table.replace_schema_metadata(
+            metadata=CYTOTABLE_DEFAULT_PARQUET_METADATA
+        ),
+        **kwargs,
+    )
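Every parquet file written through _write_parquet_table_with_metadata now carries the data-producer entries from CYTOTABLE_DEFAULT_PARQUET_METADATA, which can be read back without loading the data itself (the file path below is hypothetical):

    import pyarrow.parquet as parquet

    schema = parquet.read_schema("./example.cytotable.parquet")
    print(schema.metadata)
    # {b'data-producer': b'https://github.com/cytomining/CytoTable',
    #  b'data-producer-version': b'0.0.4'}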
{cytotable-0.0.3.dist-info → cytotable-0.0.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.3
+Version: 0.0.4
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -13,10 +13,11 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
 Requires-Dist: duckdb (>=0.8.0)
 Requires-Dist: parsl (>=2023.9.25)
-Requires-Dist: pyarrow (>=13.0.0
+Requires-Dist: pyarrow (>=13.0.0)
 Project-URL: Documentation, https://cytomining.github.io/CytoTable/
 Project-URL: Repository, https://github.com/cytomining/CytoTable
 Description-Content-Type: text/markdown
@@ -30,10 +31,17 @@ _Diagram showing data flow relative to this project._

 ## Summary

-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
 CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
 The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).

+The name for the project is inspired from:
+
+- __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+- __Table__:
+  - "1. Furniture with a top surface to accommodate a variety of uses."
+  - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
 ## Installation

 Install CytoTable from [PyPI](https://pypi.org/) or from source:
cytotable-0.0.4.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=b0078yKBlAAnc7ms0n5nBRxK94xuKD52S4TFb4eTSiE,315
+cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+cytotable/convert.py,sha256=ORn2MmDmBUBEHDelDHc_j4J3LQgCEflXyzLouvf5h6Y,51971
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=SYZXh0-eK-2VRRd8I30GCQcZ4wDMmhGes8KdDsxpFqg,10771
+cytotable/sources.py,sha256=M03pV0Z9YIiWs9pgoAFci3-S63uGCHq9HxvGLqhNV_0,11199
+cytotable/utils.py,sha256=9zqLf_95-phH6IdsDgpK3g3NkDG4odx0NUWogQDs31k,14344
+cytotable-0.0.4.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.4.dist-info/METADATA,sha256=fUPPn1ufKVe0nIvtHapwEBaNlr9di0hlmnsxh8n_BI0,3181
+cytotable-0.0.4.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+cytotable-0.0.4.dist-info/RECORD,,
cytotable-0.0.3.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-cytotable/__init__.py,sha256=_rBEpjjZTru1zqcGCxbqKD0LS20jM_jEeLnBTQP1Afw,213
-cytotable/convert.py,sha256=DIDBFclu7jN1d2Ri-82LTxIEm5OjkMwZn8V609ZXEJE,50786
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=mP7IAu9LzVf9L5VDhXK2j3QHpxNJqcwudGZxklQpi2s,9694
-cytotable/sources.py,sha256=jCzlm9jvezXABEeucfit6XRJ7HU3cKL5BQci-Oj-yzA,9910
-cytotable/utils.py,sha256=PvhHdBQFewHwQ4a2zPeqMdSA3iwp8L25HwZI4oVfrYY,13771
-cytotable-0.0.3.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.3.dist-info/METADATA,sha256=T8sPn1j9ElIPGuErqtEKChy8M21jgalycQuSauQyI34,2743
-cytotable-0.0.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-cytotable-0.0.3.dist-info/RECORD,,
{cytotable-0.0.3.dist-info → cytotable-0.0.4.dist-info}/LICENSE
File without changes