CytoTable 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
 """

 # note: version data is maintained by poetry-dynamic-versioning (do not edit)
-__version__ = "0.0.7"
+__version__ = "0.0.9"

 from .convert import convert
 from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,6 +68,13 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
     ],
 }

+# metadata column names and types for internal use within CytoTable
+CYOTABLE_META_COLUMN_TYPES = {
+    "cytotable_meta_source_path": "VARCHAR",
+    "cytotable_meta_offset": "BIGINT",
+    "cytotable_meta_rownum": "BIGINT",
+}
+
 CYTOTABLE_DEFAULT_PARQUET_METADATA = {
     "data-producer": "https://github.com/cytomining/CytoTable",
     "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -8,23 +8,26 @@ import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast

 import parsl
-import pyarrow as pa
-from parsl.app.app import join_app, python_app
+from parsl.app.app import python_app

 from cytotable.exceptions import CytoTableException
 from cytotable.presets import config
+from cytotable.sources import _gather_sources
 from cytotable.utils import (
     _column_sort,
     _default_parsl_config,
     _expand_path,
     _parsl_loaded,
+    evaluate_futures,
 )

 logger = logging.getLogger(__name__)


 @python_app
-def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]:
+def _get_table_columns_and_types(
+    source: Dict[str, Any], sort_output: bool
+) -> List[Dict[str, str]]:
     """
     Gather column data from table through duckdb.

@@ -32,6 +35,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
         source: Dict[str, Any]
             Contains the source data to be chunked. Represents a single
             file or table of some kind.
+        sort_output:
+            Specifies whether to sort cytotable output or not.

     Returns:
         List[Dict[str, str]]
@@ -41,11 +46,12 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
     import pathlib

     import duckdb
+    from cloudpathlib import AnyPath

     from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet

     source_path = source["source_path"]
-    source_type = str(pathlib.Path(source_path).suffix).lower()
+    source_type = str(source_path.suffix).lower()

     # prepare the data source in the form of a duckdb query
     select_source = (
@@ -109,6 +115,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
                 # offset is set to 0 start at first row
                 # result from table
                 offset=0,
+                add_cytotable_meta=False,
+                sort_output=sort_output,
             )
         with _duckdb_reader() as ddb_reader:
             return (
@@ -202,7 +210,7 @@ def _get_table_chunk_offsets(
     import pathlib

     import duckdb
-    from cloudpathlib import AnyPath
+    from cloudpathlib import AnyPath, CloudPath

     from cytotable.exceptions import NoInputDataException
     from cytotable.utils import _duckdb_reader
@@ -212,18 +220,9 @@
     if source is not None:
         table_name = source["table_name"] if "table_name" in source.keys() else None
         source_path = source["source_path"]
-        source_type = str(pathlib.Path(source_path).suffix).lower()
+        source_type = str(source_path.suffix).lower()

         try:
-            # for csv's, check that we have more than one row (a header and data values)
-            if (
-                source_type == ".csv"
-                and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
-            ):
-                raise NoInputDataException(
-                    f"Data file has 0 rows of values. Error in file: {source_path}"
-                )
-
             # gather the total rowcount from csv or sqlite data input sources
             with _duckdb_reader() as ddb_reader:
                 rowcount = int(
@@ -275,6 +274,7 @@ def _source_chunk_to_parquet(
     chunk_size: int,
     offset: int,
     dest_path: str,
+    sort_output: bool,
 ) -> str:
     """
     Export source data to chunked parquet file using chunk size and offsets.
@@ -291,6 +291,8 @@
             The offset for chunking the data from source.
         dest_path: str
             Path to store the output data.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.

     Returns:
         str
@@ -303,6 +305,7 @@
     from cloudpathlib import AnyPath
     from pyarrow import parquet

+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import (
         _duckdb_reader,
         _sqlite_mixed_type_query_to_parquet,
@@ -311,27 +314,53 @@

     # attempt to build dest_path
     source_dest_path = (
-        f"{dest_path}/{str(pathlib.Path(source_group_name).stem).lower()}/"
-        f"{str(pathlib.Path(source['source_path']).parent.name).lower()}"
+        f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
+        f"{str(source['source_path'].parent.name).lower()}"
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

+    source_path_str = (
+        source["source_path"]
+        if "table_name" not in source.keys()
+        else f"{source['source_path']}_table_{source['table_name']}"
+    )
     # build the column selection block of query
+
+    # add cytotable metadata columns
+    cytotable_metadata_cols = [
+        (
+            f"CAST( '{source_path_str}' "
+            f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
+            ' AS "cytotable_meta_source_path"'
+        ),
+        f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
+        (
+            f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
+            ' AS "cytotable_meta_rownum"'
+        ),
+    ]
+    # add source table columns
+    casted_source_cols = [
+        # here we cast the column to the specified type ensure the colname remains the same
+        f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
+        for column in source["columns"]
+    ]
+
+    # create selection statement from lists above
     select_columns = ",".join(
-        [
-            # here we cast the column to the specified type ensure the colname remains the same
-            f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
-            for column in source["columns"]
-        ]
+        # if we should sort the output, add the metadata_cols
+        cytotable_metadata_cols + casted_source_cols
+        if sort_output
+        else casted_source_cols
    )

     # build output query and filepath base
     # (chunked output will append offset to keep output paths unique)
-    if str(AnyPath(source["source_path"]).suffix).lower() == ".csv":
+    if str(source["source_path"].suffix).lower() == ".csv":
         base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"

-    elif str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite":
+    elif str(source["source_path"].suffix).lower() == ".sqlite":
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"

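
For orientation, here is a minimal Python sketch (not part of the diff) of the metadata CAST clauses assembled above when sort_output=True; build_meta_casts is a hypothetical helper name, while the column names and types mirror the new CYOTABLE_META_COLUMN_TYPES constant:

    # Hypothetical helper: mirrors how _source_chunk_to_parquet builds its
    # cytotable_metadata_cols SELECT-list entries.
    CYOTABLE_META_COLUMN_TYPES = {
        "cytotable_meta_source_path": "VARCHAR",
        "cytotable_meta_offset": "BIGINT",
        "cytotable_meta_rownum": "BIGINT",
    }

    def build_meta_casts(source_path_str: str, offset: int) -> list:
        # literal values (or a window expression) cast to the declared types
        values = {
            "cytotable_meta_source_path": f"'{source_path_str}'",
            "cytotable_meta_offset": str(offset),
            "cytotable_meta_rownum": "(row_number() OVER ())",
        }
        return [
            f'CAST( {values[name]} AS {ctype}) AS "{name}"'
            for name, ctype in CYOTABLE_META_COLUMN_TYPES.items()
        ]

    # e.g. CAST( 'a.sqlite_table_cells' AS VARCHAR) AS "cytotable_meta_source_path"
    print(build_meta_casts("a.sqlite_table_cells", 0))
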
@@ -352,6 +381,11 @@
                     ORDER BY ALL
                     LIMIT {chunk_size} OFFSET {offset}
                     """
+                    if sort_output
+                    else f"""
+                    {base_query}
+                    LIMIT {chunk_size} OFFSET {offset}
+                    """
                 ).arrow(),
                 where=result_filepath,
             )
@@ -363,7 +397,7 @@
         # to handle the mixed types
         if (
             "Mismatch Type Error" in str(e)
-            and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
+            and str(source["source_path"].suffix).lower() == ".sqlite"
         ):
             _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
@@ -374,6 +408,8 @@
                     table_name=str(source["table_name"]),
                     chunk_size=chunk_size,
                     offset=offset,
+                    add_cytotable_meta=True if sort_output else False,
+                    sort_output=sort_output,
                 ),
                 where=result_filepath,
             )
@@ -422,7 +458,10 @@ def _prepend_column_name(

     import pyarrow.parquet as parquet

-    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import (
+        CYOTABLE_META_COLUMN_TYPES,
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+    )
     from cytotable.utils import _write_parquet_table_with_metadata

     logger = logging.getLogger(__name__)
@@ -470,8 +509,10 @@
         # source_group_name_stem: 'Cells'
         # column_name: 'AreaShape_Area'
         # updated_column_name: 'Cells_AreaShape_Area'
-        if column_name not in identifying_columns and not column_name.startswith(
-            source_group_name_stem.capitalize()
+        if (
+            column_name not in identifying_columns
+            and not column_name.startswith(source_group_name_stem.capitalize())
+            and column_name not in CYOTABLE_META_COLUMN_TYPES
         ):
             updated_column_names.append(f"{source_group_name_stem}_{column_name}")
         # if-condition for prepending 'Metadata_' to column name
@@ -679,6 +720,7 @@ def _concat_source_group(
 def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
     joins: str,
+    sort_output: bool,
 ) -> str:
     """
     Prepare join SQL statement with actual locations of data based on the sources.
@@ -690,6 +732,8 @@
         joins: str:
             DuckDB-compatible SQL which will be used to perform the join
             operations using the join_group keys as a reference.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.

     Returns:
         str:
@@ -697,15 +741,30 @@
     """
     import pathlib

+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
+
     # replace with real location of sources for join sql
+    order_by_tables = []
     for key, val in sources.items():
         if pathlib.Path(key).stem.lower() in joins.lower():
+            table_name = str(pathlib.Path(key).stem.lower())
             joins = joins.replace(
-                f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                f"'{table_name}.parquet'",
                 str([str(table) for table in val[0]["table"]]),
             )
+            order_by_tables.append(table_name)

-    return joins
+    # create order by statement with from all tables using cytotable metadata
+    order_by_sql = "ORDER BY " + ", ".join(
+        [
+            f"{table}.{meta_column}"
+            for table in order_by_tables
+            for meta_column in CYOTABLE_META_COLUMN_TYPES
+        ]
+    )
+
+    # add the order by statements to the join
+    return joins + order_by_sql if sort_output else joins


 @python_app
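
For reference, a small sketch (not from the package) of the ORDER BY clause _prepare_join_sql appends when sort_output=True; the table names below are assumed for illustration:

    # Assumed tables "cells" and "nuclei"; meta column names come from
    # CYOTABLE_META_COLUMN_TYPES in cytotable.constants.
    meta_columns = [
        "cytotable_meta_source_path",
        "cytotable_meta_offset",
        "cytotable_meta_rownum",
    ]
    order_by_tables = ["cells", "nuclei"]
    order_by_sql = "ORDER BY " + ", ".join(
        f"{table}.{meta_column}"
        for table in order_by_tables
        for meta_column in meta_columns
    )
    # -> ORDER BY cells.cytotable_meta_source_path, cells.cytotable_meta_offset,
    #    cells.cytotable_meta_rownum, nuclei.cytotable_meta_source_path, ...
    print(order_by_sql)
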
@@ -739,8 +798,7 @@

     import pathlib

-    import pyarrow.parquet as parquet
-
+    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

     # Attempt to read the data to parquet file
@@ -748,12 +806,22 @@
     # writing data to a parquet file.
     # read data with chunk size + offset
     # and export to parquet
+    exclude_meta_cols = [
+        f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
+    ]
+
     with _duckdb_reader() as ddb_reader:
         result = ddb_reader.execute(
             f"""
+            WITH joined AS (
                 {joins}
-                {"ORDER BY ALL" if "ORDER BY" not in joins.upper() else ""}
                 LIMIT {chunk_size} OFFSET {offset}
+            )
+            SELECT
+                /* exclude metadata columns from the results
+                   by using a lambda on column names based on exclude_meta_cols. */
+                COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
+            FROM joined;
             """
         ).arrow()

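
A brief sketch (not from the package) of the DuckDB COLUMNS(...) filter assembled above: COLUMNS(c -> <predicate>) keeps only columns whose names satisfy the predicate, which here drops the cytotable_meta_* columns from the joined result:

    meta_columns = [
        "cytotable_meta_source_path",
        "cytotable_meta_offset",
        "cytotable_meta_rownum",
    ]
    exclude_meta_cols = [f"c NOT LIKE '{col}%'" for col in meta_columns]
    predicate = " AND ".join(exclude_meta_cols)
    # the final projection over the CTE named "joined"
    query = f"SELECT COLUMNS (c -> ({predicate})) FROM joined;"
    print(query)
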
@@ -960,40 +1028,20 @@ def _infer_source_group_common_schema(
     )


-@python_app
-def _return_future(input: Any) -> Any:
-    """
-    This is a simple wrapper python_app to allow
-    the return of join_app-compliant output (must be a Parsl future)
-
-    Args:
-        input: Any
-            Any input which will be used within the context of a
-            Parsl join_app future return.
-
-    Returns:
-        Any
-            Returns the input as provided wrapped within the context
-            of a python_app for the purpose of a join_app.
-    """
-
-    return input
-
-
-@join_app
 def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     source_path: str,
     dest_path: str,
     source_datatype: Optional[str],
-    metadata: Union[List[str], Tuple[str, ...]],
-    compartments: Union[List[str], Tuple[str, ...]],
-    identifying_columns: Union[List[str], Tuple[str, ...]],
+    metadata: Optional[Union[List[str], Tuple[str, ...]]],
+    compartments: Optional[Union[List[str], Tuple[str, ...]]],
+    identifying_columns: Optional[Union[List[str], Tuple[str, ...]]],
     concat: bool,
     join: bool,
     joins: Optional[str],
     chunk_size: Optional[int],
     infer_common_schema: bool,
     drop_null: bool,
+    sort_output: bool,
     data_type_cast_map: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1032,6 +1080,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             Whether to infer a common schema when concatenating sources.
         drop_null: bool:
             Whether to drop null results.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
         data_type_cast_map: Dict[str, str]
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
@@ -1047,26 +1097,17 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             result.
     """

-    from cytotable.convert import (
-        _concat_join_sources,
-        _concat_source_group,
-        _get_table_chunk_offsets,
-        _infer_source_group_common_schema,
-        _join_source_chunk,
-        _prepend_column_name,
-        _return_future,
-        _source_chunk_to_parquet,
-    )
-    from cytotable.sources import _gather_sources
-    from cytotable.utils import _expand_path
-
     # gather sources to be processed
     sources = _gather_sources(
         source_path=source_path,
         source_datatype=source_datatype,
-        targets=list(metadata) + list(compartments),
+        targets=(
+            list(metadata) + list(compartments)
+            if metadata is not None and compartments is not None
+            else []
+        ),
         **kwargs,
-    ).result()
+    )

     # expand the destination path
     expanded_dest_path = _expand_path(path=dest_path)
@@ -1080,7 +1121,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 "offsets": _get_table_chunk_offsets(
                     source=source,
                     chunk_size=chunk_size,
-                ).result()
+                )
             },
         )
         for source in source_group_vals
@@ -1097,7 +1138,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             for source in source_group_vals
             if source["offsets"] is not None
         ]
-        for source_group_name, source_group_vals in offsets_prepared.items()
+        for source_group_name, source_group_vals in evaluate_futures(
+            offsets_prepared
+        ).items()
         # ensure we have source_groups with at least one source table
         if len(source_group_vals) > 0
     }
@@ -1110,10 +1153,10 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 **{
                     "columns": _prep_cast_column_data_types(
                         columns=_get_table_columns_and_types(
-                            source=source,
+                            source=source, sort_output=sort_output
                         ),
                         data_type_cast_map=data_type_cast_map,
-                    ).result()
+                    )
                 },
             )
             for source in source_group_vals
@@ -1136,33 +1179,40 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                             chunk_size=chunk_size,
                             offset=offset,
                             dest_path=expanded_dest_path,
+                            sort_output=sort_output,
                         ),
                         source_group_name=source_group_name,
                         identifying_columns=identifying_columns,
                         metadata=metadata,
                         compartments=compartments,
-                    ).result()
+                    )
                     for offset in source["offsets"]
                 ]
             },
         )
         for source in source_group_vals
         ]
-        for source_group_name, source_group_vals in column_names_and_types_gathered.items()
+        for source_group_name, source_group_vals in evaluate_futures(
+            column_names_and_types_gathered
+        ).items()
     }

     # if we're concatting or joining and need to infer the common schema
     if (concat or join) and infer_common_schema:
         # create a common schema for concatenation work
         common_schema_determined = {
-            source_group_name: {
-                "sources": source_group_vals,
-                "common_schema": _infer_source_group_common_schema(
-                    source_group=source_group_vals,
-                    data_type_cast_map=data_type_cast_map,
-                ),
-            }
-            for source_group_name, source_group_vals in results.items()
+            source_group_name: [
+                {
+                    "sources": source_group_vals,
+                    "common_schema": _infer_source_group_common_schema(
+                        source_group=source_group_vals,
+                        data_type_cast_map=data_type_cast_map,
+                    ),
+                }
+            ]
+            for source_group_name, source_group_vals in evaluate_futures(
+                results
+            ).items()
         }

     # if concat or join, concat the source groups
@@ -1174,17 +1224,24 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         results = {
             source_group_name: _concat_source_group(
                 source_group_name=source_group_name,
-                source_group=source_group_vals["sources"],
+                source_group=source_group_vals[0]["sources"],
                 dest_path=expanded_dest_path,
-                common_schema=source_group_vals["common_schema"],
-            ).result()
-            for source_group_name, source_group_vals in common_schema_determined.items()
+                common_schema=source_group_vals[0]["common_schema"],
+            )
+            for source_group_name, source_group_vals in evaluate_futures(
+                common_schema_determined
+            ).items()
         }

     # conditional section for merging
     # note: join implies a concat, but concat does not imply a join
     if join:
-        prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+        # evaluate the results as they're used multiple times below
+        evaluated_results = evaluate_futures(results)
+
+        prepared_joins_sql = _prepare_join_sql(
+            sources=evaluated_results, joins=joins, sort_output=sort_output
+        ).result()

         # map joined results based on the join groups gathered above
         # note: after mapping we end up with a list of strings (task returns str)
@@ -1198,7 +1255,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 chunk_size=chunk_size,
                 offset=offset,
                 drop_null=drop_null,
-            ).result()
+            )
             # create join group for querying the concatenated
             # data in order to perform memory-safe joining
             # per user chunk size specification.
@@ -1213,12 +1270,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         # for lineage and debugging
         results = _concat_join_sources(
             dest_path=expanded_dest_path,
-            join_sources=join_sources_result,
-            sources=results,
-        ).result()
+            join_sources=[join.result() for join in join_sources_result],
+            sources=evaluated_results,
+        )

     # wrap the final result as a future and return
-    return _return_future(results)
+    return evaluate_futures(results)


 def convert(  # pylint: disable=too-many-arguments,too-many-locals
@@ -1236,6 +1293,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
     **kwargs,
@@ -1277,8 +1335,14 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             DuckDB-compatible SQL which will be used to perform the join operations.
         chunk_size: Optional[int] (Default value = None)
             Size of join chunks which is used to limit data size during join ops
-        infer_common_schema: bool: (Default value = True)
+        infer_common_schema: bool (Default value = True)
             Whether to infer a common schema when concatenating sources.
+        data_type_cast_map: Dict[str, str], (Default value = None)
+            A dictionary mapping data type groups to specific types.
+            Roughly includes Arrow data types language from:
+            https://arrow.apache.org/docs/python/api/datatypes.html
+        sort_output: bool (Default value = True)
+            Specifies whether to sort cytotable output or not.
         drop_null: bool (Default value = False)
             Whether to drop nan/null values from results
         preset: str (Default value = "cellprofiler_csv")
@@ -1393,7 +1457,8 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         infer_common_schema=infer_common_schema,
         drop_null=drop_null,
         data_type_cast_map=data_type_cast_map,
+        sort_output=sort_output,
         **kwargs,
-    ).result()
+    )

     return output
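
A hedged usage sketch (not from the package docs) of the new sort_output flag on convert(); the paths here are placeholders:

    import cytotable

    result = cytotable.convert(
        source_path="./data/example.sqlite",   # hypothetical input
        dest_path="./data/example.parquet",    # hypothetical output
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
        # default True: adds cytotable_meta_* columns and ORDER BY work for
        # deterministic output; set False to skip sorting entirely
        sort_output=True,
    )
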
cytotable/presets.py CHANGED
@@ -29,25 +29,19 @@ config = {
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Image_Filtered AS (
-                SELECT
-                    /* seeks columns by name, avoiding failure if some do not exist */
-                    COLUMNS('^Metadata_ImageNumber$|^Image_Metadata_Well$|^Image_Metadata_Plate$')
-                FROM
-                    read_parquet('image.parquet')
-                )
             SELECT
-                *
+                image.Metadata_ImageNumber,
+                cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
+                nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
             FROM
                 read_parquet('cytoplasm.parquet') AS cytoplasm
-                LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                    cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                    AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                    nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
+            WHERE
+                cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                LEFT JOIN Image_Filtered AS image ON
-                    image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
             """,
     },
     "cellprofiler_sqlite": {
@@ -74,26 +68,69 @@
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Per_Image_Filtered AS (
-                SELECT
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('per_image.parquet')
-                )
             SELECT
-                *
+                per_image.Metadata_ImageNumber,
+                per_image.Image_Metadata_Well,
+                per_image.Image_Metadata_Plate,
+                per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                per_cells.* EXCLUDE (Metadata_ImageNumber),
+                per_nuclei.* EXCLUDE (Metadata_ImageNumber)
             FROM
                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-                LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                    per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                    AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
-                LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                    per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+            WHERE
+                per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
                 AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
-                LEFT JOIN Per_Image_Filtered AS per_image ON
-                    per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+            """,
+    },
+    "cellprofiler_sqlite_cpg0016_jump": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "cellprofiler": "v4.0.0",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": ("image",),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "ImageNumber",
+            "ObjectNumber",
+            "Metadata_Well",
+            "Metadata_Plate",
+            "Parent_Cells",
+            "Parent_Nuclei",
+        ),
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": """
+            SELECT
+                image.Image_TableNumber,
+                image.Metadata_ImageNumber,
+                image.Metadata_Plate,
+                image.Metadata_Well,
+                image.Image_Metadata_Site,
+                image.Image_Metadata_Row,
+                cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_ImageNumber),
+                nuclei.* EXCLUDE (Metadata_ImageNumber)
+            FROM
+                read_parquet('cytoplasm.parquet') AS cytoplasm
+                LEFT JOIN read_parquet('cells.parquet') AS cells ON
+                    cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                    AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
+                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
+                    nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                    AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
+                LEFT JOIN read_parquet('image.parquet') AS image ON
+                    image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
             """,
     },
     "cellprofiler_sqlite_pycytominer": {
@@ -125,26 +162,21 @@
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Per_Image_Filtered AS (
-                SELECT
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('per_image.parquet')
-                )
             SELECT
-                *
+                per_image.Metadata_ImageNumber,
+                per_image.Image_Metadata_Well,
+                per_image.Image_Metadata_Plate,
+                per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                per_cells.* EXCLUDE (Metadata_ImageNumber),
+                per_nuclei.* EXCLUDE (Metadata_ImageNumber)
             FROM
                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-                LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                    per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                    AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                    per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+            WHERE
+                per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                LEFT JOIN Per_Image_Filtered AS per_image ON
-                    per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
             """,
     },
     "cell-health-cellprofiler-to-cytominer-database": {
@@ -178,30 +210,22 @@
         # compartment and metadata joins performed using DuckDB SQL
         # and modified at runtime as needed
         "CONFIG_JOINS": """
-            WITH Image_Filtered AS (
-                SELECT
-                    Metadata_TableNumber,
-                    Metadata_ImageNumber,
-                    Image_Metadata_Well,
-                    Image_Metadata_Plate
-                FROM
-                    read_parquet('image.parquet')
-                )
             SELECT
-                *
+                image.Metadata_TableNumber,
+                image.Metadata_ImageNumber,
+                image.Image_Metadata_Well,
+                image.Image_Metadata_Plate,
+                cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
             FROM
                 read_parquet('cytoplasm.parquet') AS cytoplasm
-                LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                    cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                    AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                    AND cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                    nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                    AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_TableNumber, Metadata_ImageNumber)
+                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_TableNumber, Metadata_ImageNumber)
+                LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_TableNumber, Metadata_ImageNumber)
+            WHERE
+                cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                 AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                LEFT JOIN Image_Filtered AS image ON
-                    image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                    AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
             """,
     },
     "in-carta": {
cytotable/sources.py CHANGED
@@ -7,13 +7,11 @@ import pathlib
 from typing import Any, Dict, List, Optional, Union

 from cloudpathlib import AnyPath
-from parsl.app.app import join_app, python_app

+from cytotable.exceptions import NoInputDataException

-@python_app
-def _build_path(
-    path: Union[str, pathlib.Path, AnyPath], **kwargs
-) -> Union[pathlib.Path, AnyPath]:
+
+def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
     """
     Build a path client or return local path.

@@ -43,10 +41,9 @@ def _build_path(
     return processed_path


-@python_app
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
-    targets: List[str],
+    targets: Optional[List[str]] = None,
     source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
@@ -75,7 +72,7 @@

     if (targets is None or targets == []) and source_datatype is None:
         raise DatatypeException(
-            f"A source_datatype must be specified when using undefined compartments and metadata names."
+            "A source_datatype must be specified when using undefined compartments and metadata names."
         )

     # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@
         for subpath in (
             (path,)
             # used if the source path is a single file
-            if AnyPath(path).is_file()
+            if path.is_file()
             # iterates through a source directory
-            else (x for x in AnyPath(path).glob("**/*") if AnyPath(x).is_file())
+            else (x for x in path.glob("**/*") if x.is_file())
         )
         # ensure the subpaths meet certain specifications
         if (
@@ -129,7 +126,8 @@
                 .arrow()["table_name"]
                 .to_pylist()
                 # make sure the table names match with compartment + metadata names
-                if any(target.lower() in table_name.lower() for target in targets)
+                if targets is not None
+                and any(target.lower() in table_name.lower() for target in targets)
             ]
         else:
             # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@
     return grouped_sources


-@python_app
 def _infer_source_datatype(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
 ) -> str:
@@ -230,7 +227,6 @@
     return source_datatype


-@python_app
 def _filter_source_filepaths(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
 ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@
             if file["source_path"].stat().st_size > 0
             # ensure the datatype matches the source datatype
             and file["source_path"].suffix == f".{source_datatype}"
+            and _file_is_more_than_one_line(path=file["source_path"])
         ]
         for filegroup, files in sources.items()
     }


-@join_app
+def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
+    """
+    Check if the file has more than one line.
+
+    Args:
+        path (Union[pathlib.Path, AnyPath]):
+            The path to the file.
+
+    Returns:
+        bool:
+            True if the file has more than one line, False otherwise.
+
+    Raises:
+        NoInputDataException: If the file has zero lines.
+    """
+
+    # if we don't have a sqlite file
+    # (we can't check sqlite files for lines)
+    if path.suffix.lower() != ".sqlite":
+        with path.open("r") as f:
+            try:
+                # read two lines, if the second is empty return false
+                return bool(f.readline() and f.readline())
+
+            except StopIteration:
+                # If we encounter the end of the file, it has only one line
+                raise NoInputDataException(
+                    f"Data file has 0 rows of values. Error in file: {path}"
+                )
+    else:
+        return True
+
+
 def _gather_sources(
     source_path: str,
     source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
         _infer_source_datatype,
     )

-    source_path = _build_path(path=source_path, **kwargs)
+    built_path = _build_path(path=source_path, **kwargs)

     # gather filepaths which will be used as the basis for this work
     sources = _get_source_filepaths(
-        path=source_path, targets=targets, source_datatype=source_datatype
+        path=built_path, targets=targets, source_datatype=source_datatype
     )

     # infer or validate the source datatype based on source filepaths
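
A standalone sketch (not from the package) of the header-plus-data check that moved here from _get_table_chunk_offsets; the helper name and file below are hypothetical:

    import pathlib

    def has_header_and_data(path: pathlib.Path) -> bool:
        # read at most two lines; a usable CSV needs a header row plus one data row
        with path.open("r") as f:
            return bool(f.readline() and f.readline())

    example = pathlib.Path("example.csv")  # hypothetical file
    example.write_text("col_a,col_b\n1,2\n")
    print(has_header_and_data(example))  # True
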
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
 import logging
 import os
 import pathlib
-from typing import Any, Dict, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast

 import duckdb
 import parsl
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         INSTALL sqlite_scanner;
         LOAD sqlite_scanner;

+        /* Install httpfs plugin to avoid error
+        https://github.com/duckdb/duckdb/issues/3243 */
+        INSTALL httpfs;
+
         /*
         Set threads available to duckdb
         See the following for more information:
@@ -171,6 +175,8 @@ def _sqlite_mixed_type_query_to_parquet(
     table_name: str,
     chunk_size: int,
     offset: int,
+    sort_output: bool,
+    add_cytotable_meta: bool = False,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -186,6 +192,10 @@
             Row count to use for chunked output.
         offset: int:
             The offset for chunking the data from source.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
+        add_cytotable_meta: bool, default=False:
+            Whether to add CytoTable metadata fields or not

     Returns:
         pyarrow.Table:
@@ -195,7 +205,10 @@

     import pyarrow as pa

-    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
+    from cytotable.constants import (
+        CYOTABLE_META_COLUMN_TYPES,
+        SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
+    )
     from cytotable.exceptions import DatatypeException

     # open sqlite3 connection
@@ -207,7 +220,7 @@
     # See the following for more information:
     # https://sqlite.org/pragma.html#pragma_table_info
     cursor.execute(
-        f"""
+        """
         SELECT :table_name as table_name,
                name as column_name,
                type as column_type
@@ -255,15 +268,45 @@
         for col in column_info
     ]

+    if add_cytotable_meta:
+        query_parts += [
+            (
+                f"CAST( '{f'{source_path}_table_{table_name}'}' "
+                f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
+                "AS cytotable_meta_source_path"
+            ),
+            (
+                f"CAST( {offset} "
+                f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
+                "AS cytotable_meta_offset"
+            ),
+            (
+                f"CAST( (ROW_NUMBER() OVER ()) AS "
+                f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
+                "AS cytotable_meta_rownum"
+            ),
+        ]
+
     # perform the select using the cases built above and using chunksize + offset
-    cursor.execute(
+    sql_stmt = (
         f"""
-        SELECT {', '.join(query_parts)}
+        SELECT
+            {', '.join(query_parts)}
         FROM {table_name}
         ORDER BY {', '.join([col['column_name'] for col in column_info])}
        LIMIT {chunk_size} OFFSET {offset};
        """
+        if sort_output
+        else f"""
+        SELECT
+            {', '.join(query_parts)}
+        FROM {table_name}
+        LIMIT {chunk_size} OFFSET {offset};
+        """
    )
+
+    # execute the sql stmt
+    cursor.execute(sql_stmt)
     # collect the results and include the column name with values
     results = [
         dict(zip([desc[0] for desc in cursor.description], row))
@@ -283,7 +326,7 @@
     return pa.Table.from_pylist(results)


-def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
+def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
     """
     Takes a cloudpath and uses cache to convert to a local copy
     for use in scenarios where remote work is not possible (sqlite).
@@ -298,24 +341,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
         A local pathlib.Path to cached version of cloudpath file.
     """

-    candidate_path = AnyPath(path)
-
     # check that the path is a file (caching won't work with a dir)
     # and check that the file is of sqlite type
     # (other file types will be handled remotely in cloud)
-    if candidate_path.is_file() and candidate_path.suffix.lower() == ".sqlite":
+    if (
+        isinstance(path, CloudPath)
+        and path.is_file()
+        and path.suffix.lower() == ".sqlite"
+    ):
         try:
             # update the path to be the local filepath for reference in CytoTable ops
             # note: incurs a data read which will trigger caching of the file
-            path = CloudPath(path).fspath
+            path = pathlib.Path(path.fspath)
         except InvalidPrefixError:
             # share information about not finding a cloud path
             logger.info(
                 "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
             )

-    # cast the result as a pathlib.Path
-    return pathlib.Path(path)
+    return path


 def _arrow_type_cast_if_specified(
@@ -462,3 +506,97 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
         ),
         **kwargs,
     )
+
+
+def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
+    """
+    Helper function to unwrap futures from values or return values
+    where there are no futures.
+
+    Args:
+        val: Union[parsl.dataflow.futures.AppFuture, Any]
+            A value which may or may not be a Parsl future which
+            needs to be evaluated.
+
+    Returns:
+        Any
+            Returns the value as-is if there's no future, the future
+            result if Parsl futures are encountered.
+    """
+
+    # if we have a future value, evaluate the result
+    if isinstance(val, parsl.dataflow.futures.AppFuture):
+        return val.result()
+    elif isinstance(val, list):
+        # if we have a list of futures, return the results
+        if isinstance(val[0], parsl.dataflow.futures.AppFuture):
+            return [elem.result() for elem in val]
+    # otherwise return the value
+    return val
+
+
+def _unwrap_source(
+    source: Union[
+        Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+        Union[parsl.dataflow.futures.AppFuture, Any],
+    ]
+) -> Union[Dict[str, Any], Any]:
+    """
+    Helper function to unwrap futures from sources.
+
+    Args:
+        source: Union[
+            Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+            Union[parsl.dataflow.futures.AppFuture, Any],
+        ]
+            A source is a portion of an internal data structure used by
+            CytoTable for processing and organizing data results.
+    Returns:
+        Union[Dict[str, Any], Any]
+            An evaluated dictionary or other value type.
+    """
+    # if we have a dictionary, unwrap any values which may be futures
+    if isinstance(source, dict):
+        return {key: _unwrap_value(val) for key, val in source.items()}
+    else:
+        # otherwise try to unwrap the source as-is without dictionary nesting
+        return _unwrap_value(source)
+
+
+def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+    """
+    Evaluates any Parsl futures for use within other tasks.
+    This enables a pattern of Parsl app usage as "tasks" and delayed
+    future result evaluation for concurrency.
+
+    Args:
+        sources: Union[Dict[str, List[Dict[str, Any]]], str]
+            Sources are an internal data structure used by CytoTable for
+            processing and organizing data results. They may include futures
+            which require asynchronous processing through Parsl, so we
+            process them through this function.
+
+    Returns:
+        Union[Dict[str, List[Dict[str, Any]]], str]
+            A data structure which includes evaluated futures where they were found.
+    """
+
+    return (
+        {
+            source_group_name: [
+                # unwrap sources into future results
+                _unwrap_source(source)
+                for source in (
+                    source_group_vals.result()
+                    # if we have a future, return the result
+                    if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
+                    # otherwise return the value
+                    else source_group_vals
+                )
+            ]
+            for source_group_name, source_group_vals in sources.items()
+            # if we have a dict, use the above, otherwise unwrap the value in case of future
+        }
+        if isinstance(sources, dict)
+        else _unwrap_value(sources)
+    )
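
A short usage sketch (not from the package docs) of the new evaluate_futures helper; the Parsl app and sources shape below are illustrative:

    import parsl
    from parsl.app.app import python_app

    from cytotable.utils import evaluate_futures

    parsl.load()  # default Parsl config; CytoTable arranges this internally

    @python_app
    def make_table_stub() -> str:
        return "cells.parquet"

    # values nested in the sources structure may be AppFutures
    sources = {"cells.csv": [{"table": make_table_stub()}]}
    print(evaluate_futures(sources))
    # {'cells.csv': [{'table': 'cells.parquet'}]}
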
cytotable-0.0.7.dist-info/METADATA → cytotable-0.0.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.7
+Version: 0.0.9
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
+Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
 Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
cytotable-0.0.9.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
+cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
+cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
+cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
+cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cytotable-0.0.9.dist-info/RECORD,,
cytotable-0.0.7.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=3xspHDpARY8WLv1EQOR-RWnqpadANuo2uK_MMKnFD8k,315
-cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
-cytotable/convert.py,sha256=EjEZpWvm3oPgDx1dKlfHETgs52blL79dBzfhcPOOK6o,51771
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=HSrINU0XzF4i4zxjNMMw9F0rRxgr6mm3V7Gh_Wb-uFI,10773
-cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
-cytotable/utils.py,sha256=E5r1Vk3eaCB42JFquQHpGQXdAy97kGl-YiapmOkURwA,14476
-cytotable-0.0.7.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.7.dist-info/METADATA,sha256=U1kwsaRSVKB8iwlSw3iP3tLDO2LeKT9xjG1ctiWnHg0,3420
-cytotable-0.0.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.7.dist-info/RECORD,,