CytoTable 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/constants.py +7 -0
- cytotable/convert.py +163 -98
- cytotable/presets.py +91 -67
- cytotable/sources.py +45 -16
- cytotable/utils.py +150 -12
- {cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/METADATA +2 -2
- cytotable-0.0.9.dist-info/RECORD +11 -0
- cytotable-0.0.7.dist-info/RECORD +0 -11
- {cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/LICENSE +0 -0
- {cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/WHEEL +0 -0
cytotable/__init__.py
CHANGED
cytotable/constants.py
CHANGED
@@ -68,6 +68,13 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
     ],
 }

+# metadata column names and types for internal use within CytoTable
+CYOTABLE_META_COLUMN_TYPES = {
+    "cytotable_meta_source_path": "VARCHAR",
+    "cytotable_meta_offset": "BIGINT",
+    "cytotable_meta_rownum": "BIGINT",
+}
+
 CYTOTABLE_DEFAULT_PARQUET_METADATA = {
     "data-producer": "https://github.com/cytomining/CytoTable",
     "data-producer-version": str(_get_cytotable_version()),
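For reference, this constant is consumed by the new select-building logic in convert.py and utils.py below. A minimal sketch of how it expands into DuckDB CAST/alias fragments (the path and offset values here are hypothetical):

import duckdb  # noqa: F401 -- types below are DuckDB SQL type names

from cytotable.constants import CYOTABLE_META_COLUMN_TYPES

# hypothetical example values
source_path_str = "plate1/Cells.csv"
offset = 1000

meta_cols = [
    f"CAST('{source_path_str}' AS "
    f"{CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']}) AS cytotable_meta_source_path",
    f"CAST({offset} AS "
    f"{CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS cytotable_meta_offset",
]
# meta_cols[0] == "CAST('plate1/Cells.csv' AS VARCHAR) AS cytotable_meta_source_path"
# meta_cols[1] == "CAST(1000 AS BIGINT) AS cytotable_meta_offset"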
cytotable/convert.py
CHANGED
@@ -8,23 +8,26 @@ import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast

 import parsl
-
-from parsl.app.app import join_app, python_app
+from parsl.app.app import python_app

 from cytotable.exceptions import CytoTableException
 from cytotable.presets import config
+from cytotable.sources import _gather_sources
 from cytotable.utils import (
     _column_sort,
     _default_parsl_config,
     _expand_path,
     _parsl_loaded,
+    evaluate_futures,
 )

 logger = logging.getLogger(__name__)


 @python_app
-def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]:
+def _get_table_columns_and_types(
+    source: Dict[str, Any], sort_output: bool
+) -> List[Dict[str, str]]:
     """
     Gather column data from table through duckdb.

@@ -32,6 +35,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
         source: Dict[str, Any]
             Contains the source data to be chunked. Represents a single
             file or table of some kind.
+        sort_output:
+            Specifies whether to sort cytotable output or not.

     Returns:
         List[Dict[str, str]]
@@ -41,11 +46,12 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
     import pathlib

     import duckdb
+    from cloudpathlib import AnyPath

     from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet

     source_path = source["source_path"]
-    source_type = str(…
+    source_type = str(source_path.suffix).lower()

     # prepare the data source in the form of a duckdb query
     select_source = (
@@ -109,6 +115,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
             # offset is set to 0 start at first row
             # result from table
             offset=0,
+            add_cytotable_meta=False,
+            sort_output=sort_output,
         )
         with _duckdb_reader() as ddb_reader:
             return (
@@ -202,7 +210,7 @@ def _get_table_chunk_offsets(
     import pathlib

     import duckdb
-    from cloudpathlib import AnyPath
+    from cloudpathlib import AnyPath, CloudPath

     from cytotable.exceptions import NoInputDataException
     from cytotable.utils import _duckdb_reader
@@ -212,18 +220,9 @@ def _get_table_chunk_offsets(
     if source is not None:
         table_name = source["table_name"] if "table_name" in source.keys() else None
         source_path = source["source_path"]
-        source_type = str(…
+        source_type = str(source_path.suffix).lower()

         try:
-            # for csv's, check that we have more than one row (a header and data values)
-            if (
-                source_type == ".csv"
-                and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
-            ):
-                raise NoInputDataException(
-                    f"Data file has 0 rows of values. Error in file: {source_path}"
-                )
-
             # gather the total rowcount from csv or sqlite data input sources
             with _duckdb_reader() as ddb_reader:
                 rowcount = int(
@@ -275,6 +274,7 @@ def _source_chunk_to_parquet(
     chunk_size: int,
     offset: int,
     dest_path: str,
+    sort_output: bool,
 ) -> str:
     """
     Export source data to chunked parquet file using chunk size and offsets.
@@ -291,6 +291,8 @@ def _source_chunk_to_parquet(
             The offset for chunking the data from source.
         dest_path: str
             Path to store the output data.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.

     Returns:
         str
@@ -303,6 +305,7 @@ def _source_chunk_to_parquet(
     from cloudpathlib import AnyPath
     from pyarrow import parquet

+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import (
         _duckdb_reader,
         _sqlite_mixed_type_query_to_parquet,
@@ -311,27 +314,53 @@ def _source_chunk_to_parquet(

     # attempt to build dest_path
     source_dest_path = (
-        f"{dest_path}/{str(…
-        f"{str(…
+        f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
+        f"{str(source['source_path'].parent.name).lower()}"
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

+    source_path_str = (
+        source["source_path"]
+        if "table_name" not in source.keys()
+        else f"{source['source_path']}_table_{source['table_name']}"
+    )
     # build the column selection block of query
+
+    # add cytotable metadata columns
+    cytotable_metadata_cols = [
+        (
+            f"CAST( '{source_path_str}' "
+            f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
+            ' AS "cytotable_meta_source_path"'
+        ),
+        f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
+        (
+            f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
+            ' AS "cytotable_meta_rownum"'
+        ),
+    ]
+    # add source table columns
+    casted_source_cols = [
+        # here we cast the column to the specified type ensure the colname remains the same
+        f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
+        for column in source["columns"]
+    ]
+
+    # create selection statement from lists above
     select_columns = ",".join(
-        …
-        ]
+        # if we should sort the output, add the metadata_cols
+        cytotable_metadata_cols + casted_source_cols
+        if sort_output
+        else casted_source_cols
     )

     # build output query and filepath base
     # (chunked output will append offset to keep output paths unique)
-    if str(…
+    if str(source["source_path"].suffix).lower() == ".csv":
         base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"

-    elif str(…
+    elif str(source["source_path"].suffix).lower() == ".sqlite":
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
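Putting this hunk together: with sort_output enabled, the exporter prepends the three metadata casts to the per-column casts before querying. A rough sketch of the resulting query text for a hypothetical one-column CSV source (spacing and quoting are illustrative):

# illustrative only: a hypothetical CSV source with a single FLOAT column
select_columns = ",".join(
    [
        "CAST( 'plate1/Cells.csv' AS VARCHAR) AS \"cytotable_meta_source_path\"",
        'CAST( 0 AS BIGINT) AS "cytotable_meta_offset"',
        'CAST( (row_number() OVER ()) AS BIGINT) AS "cytotable_meta_rownum"',
        'CAST("AreaShape_Area" AS FLOAT) AS "AreaShape_Area"',
    ]
)
base_query = (
    f"SELECT {select_columns} "
    "FROM read_csv_auto('plate1/Cells.csv', header=TRUE, delim=',')"
)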
@@ -352,6 +381,11 @@ def _source_chunk_to_parquet(
                     ORDER BY ALL
                     LIMIT {chunk_size} OFFSET {offset}
                     """
+                    if sort_output
+                    else f"""
+                    {base_query}
+                    LIMIT {chunk_size} OFFSET {offset}
+                    """
                 ).arrow(),
                 where=result_filepath,
             )
@@ -363,7 +397,7 @@ def _source_chunk_to_parquet(
         # to handle the mixed types
         if (
             "Mismatch Type Error" in str(e)
-            and str(…
+            and str(source["source_path"].suffix).lower() == ".sqlite"
         ):
             _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
@@ -374,6 +408,8 @@ def _source_chunk_to_parquet(
                     table_name=str(source["table_name"]),
                     chunk_size=chunk_size,
                     offset=offset,
+                    add_cytotable_meta=True if sort_output else False,
+                    sort_output=sort_output,
                 ),
                 where=result_filepath,
             )
@@ -422,7 +458,10 @@ def _prepend_column_name(

     import pyarrow.parquet as parquet

-    from cytotable.constants import …
+    from cytotable.constants import (
+        CYOTABLE_META_COLUMN_TYPES,
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+    )
     from cytotable.utils import _write_parquet_table_with_metadata

     logger = logging.getLogger(__name__)
@@ -470,8 +509,10 @@ def _prepend_column_name(
         # source_group_name_stem: 'Cells'
         # column_name: 'AreaShape_Area'
         # updated_column_name: 'Cells_AreaShape_Area'
-        if …
-        …
+        if (
+            column_name not in identifying_columns
+            and not column_name.startswith(source_group_name_stem.capitalize())
+            and column_name not in CYOTABLE_META_COLUMN_TYPES
         ):
             updated_column_names.append(f"{source_group_name_stem}_{column_name}")
         # if-condition for prepending 'Metadata_' to column name
@@ -679,6 +720,7 @@ def _concat_source_group(
 def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
     joins: str,
+    sort_output: bool,
 ) -> str:
     """
     Prepare join SQL statement with actual locations of data based on the sources.
@@ -690,6 +732,8 @@ def _prepare_join_sql(
         joins: str:
             DuckDB-compatible SQL which will be used to perform the join
             operations using the join_group keys as a reference.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.

     Returns:
         str:
@@ -697,15 +741,30 @@ def _prepare_join_sql(
     """
     import pathlib

+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
+
     # replace with real location of sources for join sql
+    order_by_tables = []
     for key, val in sources.items():
         if pathlib.Path(key).stem.lower() in joins.lower():
+            table_name = str(pathlib.Path(key).stem.lower())
             joins = joins.replace(
-                f"'{…
+                f"'{table_name}.parquet'",
                 str([str(table) for table in val[0]["table"]]),
             )
+            order_by_tables.append(table_name)

-    …
+    # create order by statement with from all tables using cytotable metadata
+    order_by_sql = "ORDER BY " + ", ".join(
+        [
+            f"{table}.{meta_column}"
+            for table in order_by_tables
+            for meta_column in CYOTABLE_META_COLUMN_TYPES
+        ]
+    )
+
+    # add the order by statements to the join
+    return joins + order_by_sql if sort_output else joins


 @python_app
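A quick sketch of the ORDER BY clause this produces for a join over two hypothetical compartment tables; ordering on the per-table metadata columns is what makes joined output deterministic across runs:

from cytotable.constants import CYOTABLE_META_COLUMN_TYPES

order_by_tables = ["cells", "nuclei"]  # hypothetical join participants
order_by_sql = "ORDER BY " + ", ".join(
    f"{table}.{meta_column}"
    for table in order_by_tables
    for meta_column in CYOTABLE_META_COLUMN_TYPES
)
# ORDER BY cells.cytotable_meta_source_path, cells.cytotable_meta_offset,
#   cells.cytotable_meta_rownum, nuclei.cytotable_meta_source_path, ...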
@@ -739,8 +798,7 @@ def _join_source_chunk(

     import pathlib

-    …
-    …
+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

     # Attempt to read the data to parquet file
@@ -748,12 +806,22 @@ def _join_source_chunk(
     # writing data to a parquet file.
     # read data with chunk size + offset
     # and export to parquet
+    exclude_meta_cols = [
+        f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
+    ]
+
     with _duckdb_reader() as ddb_reader:
         result = ddb_reader.execute(
             f"""
+            WITH joined AS (
                 {joins}
-                {"ORDER BY ALL" if "ORDER BY" not in joins.upper() else ""}
                 LIMIT {chunk_size} OFFSET {offset}
+            )
+            SELECT
+                /* exclude metadata columns from the results
+                   by using a lambda on column names based on exclude_meta_cols. */
+                COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
+            FROM joined;
             """
         ).arrow()
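The COLUMNS(c -> ...) expression here is DuckDB's lambda-based column selector, which filters result columns by name. A self-contained sketch of the same exclusion trick outside CytoTable (requires only the duckdb package; table and column names are hypothetical):

import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE t (a INTEGER, cytotable_meta_offset BIGINT, b INTEGER)")
con.execute("INSERT INTO t VALUES (1, 0, 2)")

# keep every column whose name does not carry the metadata prefix
rows = con.execute(
    "SELECT COLUMNS(c -> (c NOT LIKE 'cytotable_meta%')) FROM t"
).fetchall()
print(rows)  # [(1, 2)] -- the metadata column is excluded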
@@ -960,40 +1028,20 @@ def _infer_source_group_common_schema(
     )


-@python_app
-def _return_future(input: Any) -> Any:
-    """
-    This is a simple wrapper python_app to allow
-    the return of join_app-compliant output (must be a Parsl future)
-
-    Args:
-        input: Any
-            Any input which will be used within the context of a
-            Parsl join_app future return.
-
-    Returns:
-        Any
-            Returns the input as provided wrapped within the context
-            of a python_app for the purpose of a join_app.
-    """
-
-    return input
-
-
-@join_app
 def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     source_path: str,
     dest_path: str,
     source_datatype: Optional[str],
-    metadata: Union[List[str], Tuple[str, ...]],
-    compartments: Union[List[str], Tuple[str, ...]],
-    identifying_columns: Union[List[str], Tuple[str, ...]],
+    metadata: Optional[Union[List[str], Tuple[str, ...]]],
+    compartments: Optional[Union[List[str], Tuple[str, ...]]],
+    identifying_columns: Optional[Union[List[str], Tuple[str, ...]]],
     concat: bool,
     join: bool,
     joins: Optional[str],
     chunk_size: Optional[int],
     infer_common_schema: bool,
     drop_null: bool,
+    sort_output: bool,
     data_type_cast_map: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1032,6 +1080,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             Whether to infer a common schema when concatenating sources.
         drop_null: bool:
             Whether to drop null results.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
         data_type_cast_map: Dict[str, str]
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
@@ -1047,26 +1097,17 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             result.
     """

-    from cytotable.convert import (
-        _concat_join_sources,
-        _concat_source_group,
-        _get_table_chunk_offsets,
-        _infer_source_group_common_schema,
-        _join_source_chunk,
-        _prepend_column_name,
-        _return_future,
-        _source_chunk_to_parquet,
-    )
-    from cytotable.sources import _gather_sources
-    from cytotable.utils import _expand_path
-
     # gather sources to be processed
     sources = _gather_sources(
         source_path=source_path,
         source_datatype=source_datatype,
-        targets=…
+        targets=(
+            list(metadata) + list(compartments)
+            if metadata is not None and compartments is not None
+            else []
+        ),
         **kwargs,
-    )
+    )

     # expand the destination path
     expanded_dest_path = _expand_path(path=dest_path)
@@ -1080,7 +1121,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                     "offsets": _get_table_chunk_offsets(
                         source=source,
                         chunk_size=chunk_size,
-                    )
+                    )
                 },
             )
             for source in source_group_vals
@@ -1097,7 +1138,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
            for source in source_group_vals
            if source["offsets"] is not None
        ]
-        for source_group_name, source_group_vals in …
+        for source_group_name, source_group_vals in evaluate_futures(
+            offsets_prepared
+        ).items()
        # ensure we have source_groups with at least one source table
        if len(source_group_vals) > 0
    }
@@ -1110,10 +1153,10 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                **{
                    "columns": _prep_cast_column_data_types(
                        columns=_get_table_columns_and_types(
-                            source=source,
+                            source=source, sort_output=sort_output
                        ),
                        data_type_cast_map=data_type_cast_map,
-                    )
+                    )
                },
            )
            for source in source_group_vals
@@ -1136,33 +1179,40 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                        chunk_size=chunk_size,
                        offset=offset,
                        dest_path=expanded_dest_path,
+                        sort_output=sort_output,
                    ),
                    source_group_name=source_group_name,
                    identifying_columns=identifying_columns,
                    metadata=metadata,
                    compartments=compartments,
-                )
+                )
                for offset in source["offsets"]
            ]
        },
        )
        for source in source_group_vals
    ]
-        for source_group_name, source_group_vals in …
+        for source_group_name, source_group_vals in evaluate_futures(
+            column_names_and_types_gathered
+        ).items()
    }

    # if we're concatting or joining and need to infer the common schema
    if (concat or join) and infer_common_schema:
        # create a common schema for concatenation work
        common_schema_determined = {
-            source_group_name: …
-            …
+            source_group_name: [
+                {
+                    "sources": source_group_vals,
+                    "common_schema": _infer_source_group_common_schema(
+                        source_group=source_group_vals,
+                        data_type_cast_map=data_type_cast_map,
+                    ),
+                }
+            ]
+            for source_group_name, source_group_vals in evaluate_futures(
+                results
+            ).items()
        }

    # if concat or join, concat the source groups
@@ -1174,17 +1224,24 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
        results = {
            source_group_name: _concat_source_group(
                source_group_name=source_group_name,
-                source_group=source_group_vals["sources"],
+                source_group=source_group_vals[0]["sources"],
                dest_path=expanded_dest_path,
-                common_schema=source_group_vals["common_schema"],
-            )
-            for source_group_name, source_group_vals in …
+                common_schema=source_group_vals[0]["common_schema"],
+            )
+            for source_group_name, source_group_vals in evaluate_futures(
+                common_schema_determined
+            ).items()
        }

    # conditional section for merging
    # note: join implies a concat, but concat does not imply a join
    if join:
-        …
+        # evaluate the results as they're used multiple times below
+        evaluated_results = evaluate_futures(results)
+
+        prepared_joins_sql = _prepare_join_sql(
+            sources=evaluated_results, joins=joins, sort_output=sort_output
+        ).result()

        # map joined results based on the join groups gathered above
        # note: after mapping we end up with a list of strings (task returns str)
@@ -1198,7 +1255,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                chunk_size=chunk_size,
                offset=offset,
                drop_null=drop_null,
-            )
+            )
            # create join group for querying the concatenated
            # data in order to perform memory-safe joining
            # per user chunk size specification.
@@ -1213,12 +1270,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
        # for lineage and debugging
        results = _concat_join_sources(
            dest_path=expanded_dest_path,
-            join_sources=join_sources_result,
-            sources=…
-        )
+            join_sources=[join.result() for join in join_sources_result],
+            sources=evaluated_results,
+        )

    # wrap the final result as a future and return
-    return …
+    return evaluate_futures(results)


 def convert(  # pylint: disable=too-many-arguments,too-many-locals
@@ -1236,6 +1293,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
     **kwargs,
@@ -1277,8 +1335,14 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             DuckDB-compatible SQL which will be used to perform the join operations.
         chunk_size: Optional[int] (Default value = None)
             Size of join chunks which is used to limit data size during join ops
-        infer_common_schema: bool
+        infer_common_schema: bool (Default value = True)
             Whether to infer a common schema when concatenating sources.
+        data_type_cast_map: Dict[str, str], (Default value = None)
+            A dictionary mapping data type groups to specific types.
+            Roughly includes Arrow data types language from:
+            https://arrow.apache.org/docs/python/api/datatypes.html
+        sort_output: bool (Default value = True)
+            Specifies whether to sort cytotable output or not.
         drop_null: bool (Default value = False)
             Whether to drop nan/null values from results
         preset: str (Default value = "cellprofiler_csv")
@@ -1393,7 +1457,8 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         infer_common_schema=infer_common_schema,
         drop_null=drop_null,
         data_type_cast_map=data_type_cast_map,
+        sort_output=sort_output,
         **kwargs,
-    )
+    )

     return output
cytotable/presets.py
CHANGED
@@ -29,25 +29,19 @@ config = {
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Image_Filtered AS (
-                SELECT
-                    /* seeks columns by name, avoiding failure if some do not exist */
-                    COLUMNS('^Metadata_ImageNumber$|^Image_Metadata_Well$|^Image_Metadata_Plate$')
-                FROM
-                    read_parquet('image.parquet')
-            )
             SELECT
-                …
+                image.Metadata_ImageNumber,
+                cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
+                nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
             FROM
                 read_parquet('cytoplasm.parquet') AS cytoplasm
-            LEFT JOIN read_parquet('cells.parquet') AS cells
-                …
-                …
-                …
+            LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
+            WHERE
+                cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-            LEFT JOIN Image_Filtered AS image ON
-                image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
         """,
     },
     "cellprofiler_sqlite": {
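The rewritten presets drop the filtered-image CTEs in favor of USING joins plus EXCLUDE lists. For reference, USING (col) is shorthand for an explicit ON equality that also merges the join key into a single output column; the two forms below are equivalent DuckDB SQL, shown here as Python strings:

# equivalent join spellings, shown as SQL strings
join_using = (
    "LEFT JOIN read_parquet('cells.parquet') AS cells "
    "USING (Metadata_ImageNumber)"
)
join_on = (
    "LEFT JOIN read_parquet('cells.parquet') AS cells "
    "ON cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber"
)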
@@ -74,26 +68,69 @@ config = {
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Per_Image_Filtered AS (
-                SELECT
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('per_image.parquet')
-            )
             SELECT
-                …
+                per_image.Metadata_ImageNumber,
+                per_image.Image_Metadata_Well,
+                per_image.Image_Metadata_Plate,
+                per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                per_cells.* EXCLUDE (Metadata_ImageNumber),
+                per_nuclei.* EXCLUDE (Metadata_ImageNumber)
             FROM
                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells
-                …
-                …
-                …
+            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+            WHERE
+                per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
                 AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
-            …
-            …
+        """,
+    },
+    "cellprofiler_sqlite_cpg0016_jump": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "cellprofiler": "v4.0.0",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": ("image",),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "ImageNumber",
+            "ObjectNumber",
+            "Metadata_Well",
+            "Metadata_Plate",
+            "Parent_Cells",
+            "Parent_Nuclei",
+        ),
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": """
+            SELECT
+                image.Image_TableNumber,
+                image.Metadata_ImageNumber,
+                image.Metadata_Plate,
+                image.Metadata_Well,
+                image.Image_Metadata_Site,
+                image.Image_Metadata_Row,
+                cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_ImageNumber),
+                nuclei.* EXCLUDE (Metadata_ImageNumber)
+            FROM
+                read_parquet('cytoplasm.parquet') AS cytoplasm
+            LEFT JOIN read_parquet('cells.parquet') AS cells ON
+                cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
+            LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
+                nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
+            LEFT JOIN read_parquet('image.parquet') AS image ON
+                image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
         """,
     },
     "cellprofiler_sqlite_pycytominer": {
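The new preset is selected by name like any other; a hypothetical invocation against a JUMP (cpg0016) CellProfiler SQLite export:

import cytotable

result = cytotable.convert(
    source_path="./BR00117035.sqlite",  # hypothetical cpg0016 source file
    dest_path="./BR00117035.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_cpg0016_jump",
)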
@@ -125,26 +162,21 @@ config = {
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Per_Image_Filtered AS (
-                SELECT
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('per_image.parquet')
-            )
             SELECT
-                …
+                per_image.Metadata_ImageNumber,
+                per_image.Image_Metadata_Well,
+                per_image.Image_Metadata_Plate,
+                per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                per_cells.* EXCLUDE (Metadata_ImageNumber),
+                per_nuclei.* EXCLUDE (Metadata_ImageNumber)
             FROM
                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells
-                …
-                …
-                …
+            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+            LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+            WHERE
+                per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-            LEFT JOIN Per_Image_Filtered AS per_image ON
-                per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
         """,
     },
     "cell-health-cellprofiler-to-cytominer-database": {
@@ -178,30 +210,22 @@ config = {
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Image_Filtered AS (
-                SELECT
-                    Metadata_TableNumber,
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('image.parquet')
-            )
             SELECT
-                …
+                image.Metadata_TableNumber,
+                image.Metadata_ImageNumber,
+                image.Image_Metadata_Well,
+                image.Image_Metadata_Plate,
+                cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
             FROM
                 read_parquet('cytoplasm.parquet') AS cytoplasm
-            LEFT JOIN read_parquet('cells.parquet') AS cells
-                …
-                …
-                nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+            LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_TableNumber, Metadata_ImageNumber)
+            LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_TableNumber, Metadata_ImageNumber)
+            LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_TableNumber, Metadata_ImageNumber)
+            WHERE
+                cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-            LEFT JOIN Image_Filtered AS image ON
-                image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
         """,
     },
     "in-carta": {
cytotable/sources.py
CHANGED
@@ -7,13 +7,11 @@ import pathlib
 from typing import Any, Dict, List, Optional, Union

 from cloudpathlib import AnyPath
-from parsl.app.app import join_app, python_app

+from cytotable.exceptions import NoInputDataException

-
-def _build_path(
-    path: Union[str, pathlib.Path, AnyPath], **kwargs
-) -> Union[pathlib.Path, AnyPath]:
+
+def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
     """
     Build a path client or return local path.

@@ -43,10 +41,9 @@ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
     return processed_path


-@python_app
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
-    targets: List[str],
+    targets: Optional[List[str]] = None,
     source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
@@ -75,7 +72,7 @@ def _get_source_filepaths(

     if (targets is None or targets == []) and source_datatype is None:
         raise DatatypeException(
-            …
+            "A source_datatype must be specified when using undefined compartments and metadata names."
         )

     # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@ def _get_source_filepaths(
         for subpath in (
             (path,)
             # used if the source path is a single file
-            if …
+            if path.is_file()
             # iterates through a source directory
-            else (x for x in …
+            else (x for x in path.glob("**/*") if x.is_file())
         )
         # ensure the subpaths meet certain specifications
         if (
@@ -129,7 +126,8 @@ def _get_source_filepaths(
                 .arrow()["table_name"]
                 .to_pylist()
                 # make sure the table names match with compartment + metadata names
-                if …
+                if targets is not None
+                and any(target.lower() in table_name.lower() for target in targets)
             ]
         else:
             # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@ def _get_source_filepaths(
     return grouped_sources


-@python_app
 def _infer_source_datatype(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
 ) -> str:
@@ -230,7 +227,6 @@ def _infer_source_datatype(
     return source_datatype


-@python_app
 def _filter_source_filepaths(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
 ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
             if file["source_path"].stat().st_size > 0
             # ensure the datatype matches the source datatype
             and file["source_path"].suffix == f".{source_datatype}"
+            and _file_is_more_than_one_line(path=file["source_path"])
         ]
         for filegroup, files in sources.items()
     }


-…
+def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
+    """
+    Check if the file has more than one line.
+
+    Args:
+        path (Union[pathlib.Path, AnyPath]):
+            The path to the file.
+
+    Returns:
+        bool:
+            True if the file has more than one line, False otherwise.
+
+    Raises:
+        NoInputDataException: If the file has zero lines.
+    """
+
+    # if we don't have a sqlite file
+    # (we can't check sqlite files for lines)
+    if path.suffix.lower() != ".sqlite":
+        with path.open("r") as f:
+            try:
+                # read two lines, if the second is empty return false
+                return bool(f.readline() and f.readline())
+
+            except StopIteration:
+                # If we encounter the end of the file, it has only one line
+                raise NoInputDataException(
+                    f"Data file has 0 rows of values. Error in file: {path}"
+                )
+    else:
+        return True
+
+
 def _gather_sources(
     source_path: str,
     source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
         _infer_source_datatype,
     )

-    …
+    built_path = _build_path(path=source_path, **kwargs)

     # gather filepaths which will be used as the basis for this work
     sources = _get_source_filepaths(
-        path=…
+        path=built_path, targets=targets, source_datatype=source_datatype
     )

     # infer or validate the source datatype based on source filepaths
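A short sketch of the new guard in isolation (filenames hypothetical): a CSV holding only a header row is now filtered out during source gathering, replacing the row-count check previously performed in convert.py's _get_table_chunk_offsets:

import pathlib

from cytotable.sources import _file_is_more_than_one_line

header_only = pathlib.Path("header_only.csv")
header_only.write_text("ImageNumber,ObjectNumber\n")  # header, no data rows

with_data = pathlib.Path("with_data.csv")
with_data.write_text("ImageNumber,ObjectNumber\n1,1\n")

print(_file_is_more_than_one_line(path=header_only))  # False -> filtered out
print(_file_is_more_than_one_line(path=with_data))    # True  -> kept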
cytotable/utils.py
CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
 import logging
 import os
 import pathlib
-from typing import Any, Dict, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast

 import duckdb
 import parsl
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         INSTALL sqlite_scanner;
         LOAD sqlite_scanner;

+        /* Install httpfs plugin to avoid error
+        https://github.com/duckdb/duckdb/issues/3243 */
+        INSTALL httpfs;
+
         /*
         Set threads available to duckdb
         See the following for more information:
@@ -171,6 +175,8 @@ def _sqlite_mixed_type_query_to_parquet(
     table_name: str,
     chunk_size: int,
     offset: int,
+    sort_output: bool,
+    add_cytotable_meta: bool = False,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -186,6 +192,10 @@ def _sqlite_mixed_type_query_to_parquet(
             Row count to use for chunked output.
         offset: int:
             The offset for chunking the data from source.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
+        add_cytotable_meta: bool, default=False:
+            Whether to add CytoTable metadata fields or not

     Returns:
         pyarrow.Table:
@@ -195,7 +205,10 @@ def _sqlite_mixed_type_query_to_parquet(

     import pyarrow as pa

-    from cytotable.constants import …
+    from cytotable.constants import (
+        CYOTABLE_META_COLUMN_TYPES,
+        SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
+    )
     from cytotable.exceptions import DatatypeException

     # open sqlite3 connection
@@ -207,7 +220,7 @@ def _sqlite_mixed_type_query_to_parquet(
         # See the following for more information:
         # https://sqlite.org/pragma.html#pragma_table_info
         cursor.execute(
-            …
+            """
             SELECT :table_name as table_name,
                    name as column_name,
                    type as column_type
@@ -255,15 +268,45 @@ def _sqlite_mixed_type_query_to_parquet(
             for col in column_info
         ]

+        if add_cytotable_meta:
+            query_parts += [
+                (
+                    f"CAST( '{f'{source_path}_table_{table_name}'}' "
+                    f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
+                    "AS cytotable_meta_source_path"
+                ),
+                (
+                    f"CAST( {offset} "
+                    f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
+                    "AS cytotable_meta_offset"
+                ),
+                (
+                    f"CAST( (ROW_NUMBER() OVER ()) AS "
+                    f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
+                    "AS cytotable_meta_rownum"
+                ),
+            ]
+
         # perform the select using the cases built above and using chunksize + offset
-        …
+        sql_stmt = (
             f"""
-            SELECT …
+            SELECT
+                {', '.join(query_parts)}
             FROM {table_name}
             ORDER BY {', '.join([col['column_name'] for col in column_info])}
             LIMIT {chunk_size} OFFSET {offset};
             """
+            if sort_output
+            else f"""
+            SELECT
+                {', '.join(query_parts)}
+            FROM {table_name}
+            LIMIT {chunk_size} OFFSET {offset};
+            """
         )
+
+        # execute the sql stmt
+        cursor.execute(sql_stmt)
         # collect the results and include the column name with values
         results = [
             dict(zip([desc[0] for desc in cursor.description], row))
@@ -283,7 +326,7 @@ def _sqlite_mixed_type_query_to_parquet(
         return pa.Table.from_pylist(results)


-def _cache_cloudpath_to_local(path: …
+def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
     """
     Takes a cloudpath and uses cache to convert to a local copy
     for use in scenarios where remote work is not possible (sqlite).
@@ -298,24 +341,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
             A local pathlib.Path to cached version of cloudpath file.
     """

-    candidate_path = AnyPath(path)
-
     # check that the path is a file (caching won't work with a dir)
     # and check that the file is of sqlite type
     # (other file types will be handled remotely in cloud)
-    if …
+    if (
+        isinstance(path, CloudPath)
+        and path.is_file()
+        and path.suffix.lower() == ".sqlite"
+    ):
         try:
             # update the path to be the local filepath for reference in CytoTable ops
             # note: incurs a data read which will trigger caching of the file
-            path = …
+            path = pathlib.Path(path.fspath)
         except InvalidPrefixError:
             # share information about not finding a cloud path
             logger.info(
                 "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
             )

-
-    return pathlib.Path(path)
+    return path


 def _arrow_type_cast_if_specified(
@@ -462,3 +506,97 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
         ),
         **kwargs,
     )
+
+
+def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
+    """
+    Helper function to unwrap futures from values or return values
+    where there are no futures.
+
+    Args:
+        val: Union[parsl.dataflow.futures.AppFuture, Any]
+            A value which may or may not be a Parsl future which
+            needs to be evaluated.
+
+    Returns:
+        Any
+            Returns the value as-is if there's no future, the future
+            result if Parsl futures are encountered.
+    """
+
+    # if we have a future value, evaluate the result
+    if isinstance(val, parsl.dataflow.futures.AppFuture):
+        return val.result()
+    elif isinstance(val, list):
+        # if we have a list of futures, return the results
+        if isinstance(val[0], parsl.dataflow.futures.AppFuture):
+            return [elem.result() for elem in val]
+    # otherwise return the value
+    return val
+
+
+def _unwrap_source(
+    source: Union[
+        Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+        Union[parsl.dataflow.futures.AppFuture, Any],
+    ]
+) -> Union[Dict[str, Any], Any]:
+    """
+    Helper function to unwrap futures from sources.
+
+    Args:
+        source: Union[
+            Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+            Union[parsl.dataflow.futures.AppFuture, Any],
+        ]
+            A source is a portion of an internal data structure used by
+            CytoTable for processing and organizing data results.
+    Returns:
+        Union[Dict[str, Any], Any]
+            An evaluated dictionary or other value type.
+    """
+    # if we have a dictionary, unwrap any values which may be futures
+    if isinstance(source, dict):
+        return {key: _unwrap_value(val) for key, val in source.items()}
+    else:
+        # otherwise try to unwrap the source as-is without dictionary nesting
+        return _unwrap_value(source)
+
+
+def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+    """
+    Evaluates any Parsl futures for use within other tasks.
+    This enables a pattern of Parsl app usage as "tasks" and delayed
+    future result evaluation for concurrency.
+
+    Args:
+        sources: Union[Dict[str, List[Dict[str, Any]]], str]
+            Sources are an internal data structure used by CytoTable for
+            processing and organizing data results. They may include futures
+            which require asynchronous processing through Parsl, so we
+            process them through this function.
+
+    Returns:
+        Union[Dict[str, List[Dict[str, Any]]], str]
+            A data structure which includes evaluated futures where they were found.
+    """
+
+    return (
+        {
+            source_group_name: [
+                # unwrap sources into future results
+                _unwrap_source(source)
+                for source in (
+                    source_group_vals.result()
+                    # if we have a future, return the result
+                    if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
+                    # otherwise return the value
+                    else source_group_vals
+                )
+            ]
+            for source_group_name, source_group_vals in sources.items()
+            # if we have a dict, use the above, otherwise unwrap the value in case of future
+        }
+        if isinstance(sources, dict)
+        else _unwrap_value(sources)
+    )
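These helpers replace the removed @join_app/_return_future machinery in convert.py: plain Parsl python_apps return AppFutures, CytoTable's internal structures collect them, and evaluate_futures resolves everything once a downstream step needs concrete values. A compact sketch under an assumed thread-pool Parsl config:

import parsl
from parsl.app.app import python_app
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor

from cytotable.utils import evaluate_futures

# assumed minimal local config for illustration
parsl.load(Config(executors=[ThreadPoolExecutor()]))


@python_app
def add_one(x: int) -> int:
    return x + 1


# a sources-like mapping which holds futures rather than values
pending = {"group_a": [add_one(1), add_one(2)]}
print(evaluate_futures(pending))  # {'group_a': [2, 3]}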
{cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.7
+Version: 0.0.9
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
+Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
 Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
cytotable-0.0.9.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
+cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
+cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
+cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
+cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cytotable-0.0.9.dist-info/RECORD,,
cytotable-0.0.7.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=3xspHDpARY8WLv1EQOR-RWnqpadANuo2uK_MMKnFD8k,315
-cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
-cytotable/convert.py,sha256=EjEZpWvm3oPgDx1dKlfHETgs52blL79dBzfhcPOOK6o,51771
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=HSrINU0XzF4i4zxjNMMw9F0rRxgr6mm3V7Gh_Wb-uFI,10773
-cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
-cytotable/utils.py,sha256=E5r1Vk3eaCB42JFquQHpGQXdAy97kGl-YiapmOkURwA,14476
-cytotable-0.0.7.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.7.dist-info/METADATA,sha256=U1kwsaRSVKB8iwlSw3iP3tLDO2LeKT9xjG1ctiWnHg0,3420
-cytotable-0.0.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.7.dist-info/RECORD,,
{cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/LICENSE
File without changes
{cytotable-0.0.7.dist-info → cytotable-0.0.9.dist-info}/WHEEL
File without changes