CytoTable 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """
 
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.7"
+ __version__ = "0.0.8"
 
  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,6 +68,13 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
      ],
  }
 
+ # metadata column names and types for internal use within CytoTable
+ CYOTABLE_META_COLUMN_TYPES = {
+     "cytotable_meta_source_path": "VARCHAR",
+     "cytotable_meta_offset": "BIGINT",
+     "cytotable_meta_rownum": "BIGINT",
+ }
+
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
      "data-producer": "https://github.com/cytomining/CytoTable",
      "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -8,23 +8,26 @@ import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
  import parsl
- import pyarrow as pa
- from parsl.app.app import join_app, python_app
+ from parsl.app.app import python_app
 
  from cytotable.exceptions import CytoTableException
  from cytotable.presets import config
+ from cytotable.sources import _gather_sources
  from cytotable.utils import (
      _column_sort,
      _default_parsl_config,
      _expand_path,
      _parsl_loaded,
+     evaluate_futures,
  )
 
  logger = logging.getLogger(__name__)
 
 
  @python_app
- def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]:
+ def _get_table_columns_and_types(
+     source: Dict[str, Any], sort_output: bool
+ ) -> List[Dict[str, str]]:
      """
      Gather column data from table through duckdb.
 
@@ -32,6 +35,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
          source: Dict[str, Any]
              Contains the source data to be chunked. Represents a single
              file or table of some kind.
+         sort_output:
+             Specifies whether to sort cytotable output or not.
 
      Returns:
          List[Dict[str, str]]
@@ -109,6 +114,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
                  # offset is set to 0 start at first row
                  # result from table
                  offset=0,
+                 add_cytotable_meta=False,
+                 sort_output=sort_output,
              )
          with _duckdb_reader() as ddb_reader:
              return (
@@ -275,6 +282,7 @@ def _source_chunk_to_parquet(
      chunk_size: int,
      offset: int,
      dest_path: str,
+     sort_output: bool,
  ) -> str:
      """
      Export source data to chunked parquet file using chunk size and offsets.
@@ -291,6 +299,8 @@ def _source_chunk_to_parquet(
              The offset for chunking the data from source.
          dest_path: str
              Path to store the output data.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
 
      Returns:
          str
@@ -303,6 +313,7 @@ def _source_chunk_to_parquet(
      from cloudpathlib import AnyPath
      from pyarrow import parquet
 
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import (
          _duckdb_reader,
          _sqlite_mixed_type_query_to_parquet,
@@ -316,13 +327,39 @@ def _source_chunk_to_parquet(
      )
      pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
+     source_path_str = (
+         source["source_path"]
+         if "table_name" not in source.keys()
+         else f"{source['source_path']}_table_{source['table_name']}"
+     )
      # build the column selection block of query
+
+     # add cytotable metadata columns
+     cytotable_metadata_cols = [
+         (
+             f"CAST( '{source_path_str}' "
+             f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
+             ' AS "cytotable_meta_source_path"'
+         ),
+         f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
+         (
+             f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
+             ' AS "cytotable_meta_rownum"'
+         ),
+     ]
+     # add source table columns
+     casted_source_cols = [
+         # here we cast the column to the specified type ensure the colname remains the same
+         f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
+         for column in source["columns"]
+     ]
+
+     # create selection statement from lists above
      select_columns = ",".join(
-         [
-             # here we cast the column to the specified type ensure the colname remains the same
-             f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
-             for column in source["columns"]
-         ]
+         # if we should sort the output, add the metadata_cols
+         cytotable_metadata_cols + casted_source_cols
+         if sort_output
+         else casted_source_cols
      )
 
      # build output query and filepath base
@@ -352,6 +389,11 @@ def _source_chunk_to_parquet(
                  ORDER BY ALL
                  LIMIT {chunk_size} OFFSET {offset}
                  """
+                 if sort_output
+                 else f"""
+                 {base_query}
+                 LIMIT {chunk_size} OFFSET {offset}
+                 """
              ).arrow(),
              where=result_filepath,
          )
@@ -374,6 +416,8 @@ def _source_chunk_to_parquet(
                      table_name=str(source["table_name"]),
                      chunk_size=chunk_size,
                      offset=offset,
+                     add_cytotable_meta=True if sort_output else False,
+                     sort_output=sort_output,
                  ),
                  where=result_filepath,
              )
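When sort_output is enabled, the chunk query gains DuckDB's ORDER BY ALL (sort on every column) ahead of LIMIT/OFFSET, making chunk contents stable across runs; when disabled, rows are sliced in whatever order the reader yields them. The two query shapes in miniature (hypothetical base query, not the package's exact f-string):

    import duckdb

    chunk_size, offset, sort_output = 2, 0, True
    base_query = "SELECT * FROM range(5)"  # stand-in for the real source query
    query = (
        # deterministic: sort on all columns before slicing the chunk
        f"{base_query} ORDER BY ALL LIMIT {chunk_size} OFFSET {offset}"
        if sort_output
        # cheaper but order-dependent: slice in reader order
        else f"{base_query} LIMIT {chunk_size} OFFSET {offset}"
    )
    print(duckdb.sql(query).fetchall())  # [(0,), (1,)]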
@@ -422,7 +466,10 @@ def _prepend_column_name(
 
      import pyarrow.parquet as parquet
 
-     from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+     )
      from cytotable.utils import _write_parquet_table_with_metadata
 
      logger = logging.getLogger(__name__)
@@ -470,8 +517,10 @@ def _prepend_column_name(
          # source_group_name_stem: 'Cells'
          # column_name: 'AreaShape_Area'
          # updated_column_name: 'Cells_AreaShape_Area'
-         if column_name not in identifying_columns and not column_name.startswith(
-             source_group_name_stem.capitalize()
+         if (
+             column_name not in identifying_columns
+             and not column_name.startswith(source_group_name_stem.capitalize())
+             and column_name not in CYOTABLE_META_COLUMN_TYPES
          ):
              updated_column_names.append(f"{source_group_name_stem}_{column_name}")
          # if-condition for prepending 'Metadata_' to column name
@@ -679,6 +728,7 @@ def _concat_source_group(
  def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
      joins: str,
+     sort_output: bool,
  ) -> str:
      """
      Prepare join SQL statement with actual locations of data based on the sources.
@@ -690,6 +740,8 @@ def _prepare_join_sql(
          joins: str:
              DuckDB-compatible SQL which will be used to perform the join
              operations using the join_group keys as a reference.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
 
      Returns:
          str:
@@ -697,15 +749,30 @@ def _prepare_join_sql(
      """
      import pathlib
 
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
+
      # replace with real location of sources for join sql
+     order_by_tables = []
      for key, val in sources.items():
          if pathlib.Path(key).stem.lower() in joins.lower():
+             table_name = str(pathlib.Path(key).stem.lower())
              joins = joins.replace(
-                 f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                 f"'{table_name}.parquet'",
                  str([str(table) for table in val[0]["table"]]),
              )
+             order_by_tables.append(table_name)
+
+     # create order by statement with from all tables using cytotable metadata
+     order_by_sql = "ORDER BY " + ", ".join(
+         [
+             f"{table}.{meta_column}"
+             for table in order_by_tables
+             for meta_column in CYOTABLE_META_COLUMN_TYPES
+         ]
+     )
 
-     return joins
+     # add the order by statements to the join
+     return joins + order_by_sql if sort_output else joins
 
 
  @python_app
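The ORDER BY fragment built by _prepare_join_sql is easy to preview in isolation; run standalone (with hypothetical table names), the same comprehension produces:

    CYOTABLE_META_COLUMN_TYPES = {
        "cytotable_meta_source_path": "VARCHAR",
        "cytotable_meta_offset": "BIGINT",
        "cytotable_meta_rownum": "BIGINT",
    }

    order_by_tables = ["cytoplasm", "cells"]  # hypothetical joined tables
    # one ORDER BY key per (table, metadata column) pair, in dict order
    order_by_sql = "ORDER BY " + ", ".join(
        f"{table}.{meta_column}"
        for table in order_by_tables
        for meta_column in CYOTABLE_META_COLUMN_TYPES
    )
    print(order_by_sql)
    # ORDER BY cytoplasm.cytotable_meta_source_path, cytoplasm.cytotable_meta_offset,
    # cytoplasm.cytotable_meta_rownum, cells.cytotable_meta_source_path, ...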
@@ -739,8 +806,7 @@ def _join_source_chunk(
 
      import pathlib
 
-     import pyarrow.parquet as parquet
-
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
      # Attempt to read the data to parquet file
@@ -748,12 +814,21 @@ def _join_source_chunk(
      # writing data to a parquet file.
      # read data with chunk size + offset
      # and export to parquet
+     exclude_meta_cols = [
+         f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
+     ]
      with _duckdb_reader() as ddb_reader:
          result = ddb_reader.execute(
              f"""
+             WITH joined AS (
                  {joins}
-                 {"ORDER BY ALL" if "ORDER BY" not in joins.upper() else ""}
                  LIMIT {chunk_size} OFFSET {offset}
+             )
+             SELECT
+                 /* exclude metadata columns from the results
+                 by using a lambda on column names based on exclude_meta_cols. */
+                 COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
+             FROM joined;
              """
          ).arrow()
 
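The rewritten join query wraps the preset SQL in a CTE and then strips the internal metadata columns using DuckDB's COLUMNS expression with a lambda over column names. A self-contained sketch of that exclusion pattern (one hypothetical data column plus one metadata column):

    import duckdb

    meta_cols = ["cytotable_meta_offset"]  # columns to drop from the result
    exclude = " AND ".join(f"c NOT LIKE '{col}%'" for col in meta_cols)
    query = f"""
        WITH joined AS (
            SELECT 42 AS AreaShape_Area, 0 AS cytotable_meta_offset
        )
        SELECT COLUMNS (c -> ({exclude})) FROM joined;
    """
    print(duckdb.sql(query).fetchall())  # [(42,)] -- metadata column removed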
@@ -960,40 +1035,20 @@ def _infer_source_group_common_schema(
      )
 
 
- @python_app
- def _return_future(input: Any) -> Any:
-     """
-     This is a simple wrapper python_app to allow
-     the return of join_app-compliant output (must be a Parsl future)
-
-     Args:
-         input: Any
-             Any input which will be used within the context of a
-             Parsl join_app future return.
-
-     Returns:
-         Any
-             Returns the input as provided wrapped within the context
-             of a python_app for the purpose of a join_app.
-     """
-
-     return input
-
-
- @join_app
  def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      source_path: str,
      dest_path: str,
      source_datatype: Optional[str],
-     metadata: Union[List[str], Tuple[str, ...]],
-     compartments: Union[List[str], Tuple[str, ...]],
-     identifying_columns: Union[List[str], Tuple[str, ...]],
+     metadata: Optional[Union[List[str], Tuple[str, ...]]],
+     compartments: Optional[Union[List[str], Tuple[str, ...]]],
+     identifying_columns: Optional[Union[List[str], Tuple[str, ...]]],
      concat: bool,
      join: bool,
      joins: Optional[str],
      chunk_size: Optional[int],
      infer_common_schema: bool,
      drop_null: bool,
+     sort_output: bool,
      data_type_cast_map: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1032,6 +1087,8 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              Whether to infer a common schema when concatenating sources.
          drop_null: bool:
              Whether to drop null results.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
          data_type_cast_map: Dict[str, str]
              A dictionary mapping data type groups to specific types.
              Roughly includes Arrow data types language from:
@@ -1047,24 +1104,15 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              result.
      """
 
-     from cytotable.convert import (
-         _concat_join_sources,
-         _concat_source_group,
-         _get_table_chunk_offsets,
-         _infer_source_group_common_schema,
-         _join_source_chunk,
-         _prepend_column_name,
-         _return_future,
-         _source_chunk_to_parquet,
-     )
-     from cytotable.sources import _gather_sources
-     from cytotable.utils import _expand_path
-
      # gather sources to be processed
      sources = _gather_sources(
          source_path=source_path,
          source_datatype=source_datatype,
-         targets=list(metadata) + list(compartments),
+         targets=(
+             list(metadata) + list(compartments)
+             if metadata is not None and compartments is not None
+             else []
+         ),
          **kwargs,
      ).result()
 
@@ -1080,7 +1128,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                      "offsets": _get_table_chunk_offsets(
                          source=source,
                          chunk_size=chunk_size,
-                     ).result()
+                     )
                  },
              )
              for source in source_group_vals
@@ -1097,7 +1145,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              for source in source_group_vals
              if source["offsets"] is not None
          ]
-         for source_group_name, source_group_vals in offsets_prepared.items()
+         for source_group_name, source_group_vals in evaluate_futures(
+             offsets_prepared
+         ).items()
          # ensure we have source_groups with at least one source table
          if len(source_group_vals) > 0
      }
@@ -1110,10 +1160,10 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                  **{
                      "columns": _prep_cast_column_data_types(
                          columns=_get_table_columns_and_types(
-                             source=source,
+                             source=source, sort_output=sort_output
                          ),
                          data_type_cast_map=data_type_cast_map,
-                     ).result()
+                     )
                  },
              )
              for source in source_group_vals
@@ -1136,33 +1186,40 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                              chunk_size=chunk_size,
                              offset=offset,
                              dest_path=expanded_dest_path,
+                             sort_output=sort_output,
                          ),
                          source_group_name=source_group_name,
                          identifying_columns=identifying_columns,
                          metadata=metadata,
                          compartments=compartments,
-                     ).result()
+                     )
                      for offset in source["offsets"]
                  ]
              },
          )
          for source in source_group_vals
      ]
-     for source_group_name, source_group_vals in column_names_and_types_gathered.items()
+     for source_group_name, source_group_vals in evaluate_futures(
+         column_names_and_types_gathered
+     ).items()
  }
 
  # if we're concatting or joining and need to infer the common schema
  if (concat or join) and infer_common_schema:
      # create a common schema for concatenation work
      common_schema_determined = {
-         source_group_name: {
-             "sources": source_group_vals,
-             "common_schema": _infer_source_group_common_schema(
-                 source_group=source_group_vals,
-                 data_type_cast_map=data_type_cast_map,
-             ),
-         }
-         for source_group_name, source_group_vals in results.items()
+         source_group_name: [
+             {
+                 "sources": source_group_vals,
+                 "common_schema": _infer_source_group_common_schema(
+                     source_group=source_group_vals,
+                     data_type_cast_map=data_type_cast_map,
+                 ),
+             }
+         ]
+         for source_group_name, source_group_vals in evaluate_futures(
+             results
+         ).items()
      }
 
  # if concat or join, concat the source groups
@@ -1174,17 +1231,24 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      results = {
          source_group_name: _concat_source_group(
              source_group_name=source_group_name,
-             source_group=source_group_vals["sources"],
+             source_group=source_group_vals[0]["sources"],
              dest_path=expanded_dest_path,
-             common_schema=source_group_vals["common_schema"],
-         ).result()
-         for source_group_name, source_group_vals in common_schema_determined.items()
+             common_schema=source_group_vals[0]["common_schema"],
+         )
+         for source_group_name, source_group_vals in evaluate_futures(
+             common_schema_determined
+         ).items()
      }
 
  # conditional section for merging
  # note: join implies a concat, but concat does not imply a join
  if join:
-     prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+     # evaluate the results as they're used multiple times below
+     evaluated_results = evaluate_futures(results)
+
+     prepared_joins_sql = _prepare_join_sql(
+         sources=evaluated_results, joins=joins, sort_output=sort_output
+     ).result()
 
      # map joined results based on the join groups gathered above
      # note: after mapping we end up with a list of strings (task returns str)
@@ -1198,7 +1262,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              chunk_size=chunk_size,
              offset=offset,
              drop_null=drop_null,
-         ).result()
+         )
          # create join group for querying the concatenated
          # data in order to perform memory-safe joining
          # per user chunk size specification.
@@ -1213,12 +1277,12 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      # for lineage and debugging
      results = _concat_join_sources(
          dest_path=expanded_dest_path,
-         join_sources=join_sources_result,
-         sources=results,
-     ).result()
+         join_sources=[join.result() for join in join_sources_result],
+         sources=evaluated_results,
+     )
 
  # wrap the final result as a future and return
- return _return_future(results)
+ return evaluate_futures(results)
 
 
  def convert(  # pylint: disable=too-many-arguments,too-many-locals
@@ -1236,6 +1300,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
      infer_common_schema: bool = True,
      drop_null: bool = False,
      data_type_cast_map: Optional[Dict[str, str]] = None,
+     sort_output: bool = True,
      preset: Optional[str] = "cellprofiler_csv",
      parsl_config: Optional[parsl.Config] = None,
      **kwargs,
@@ -1277,8 +1342,14 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
              DuckDB-compatible SQL which will be used to perform the join operations.
          chunk_size: Optional[int] (Default value = None)
              Size of join chunks which is used to limit data size during join ops
-         infer_common_schema: bool: (Default value = True)
+         infer_common_schema: bool (Default value = True)
              Whether to infer a common schema when concatenating sources.
+         data_type_cast_map: Dict[str, str], (Default value = None)
+             A dictionary mapping data type groups to specific types.
+             Roughly includes Arrow data types language from:
+             https://arrow.apache.org/docs/python/api/datatypes.html
+         sort_output: bool (Default value = True)
+             Specifies whether to sort cytotable output or not.
          drop_null: bool (Default value = False)
              Whether to drop nan/null values from results
          preset: str (Default value = "cellprofiler_csv")
@@ -1393,7 +1464,8 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
          infer_common_schema=infer_common_schema,
          drop_null=drop_null,
          data_type_cast_map=data_type_cast_map,
+         sort_output=sort_output,
          **kwargs,
-     ).result()
+     )
 
      return output
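For callers the visible change is the new sort_output keyword on convert(), defaulting to True. A hypothetical invocation (paths here are placeholders):

    import cytotable

    result = cytotable.convert(
        source_path="./data/example.sqlite",  # placeholder path
        dest_path="./output.parquet",
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
        sort_output=True,  # set False to skip sorting and trim query cost
    )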
cytotable/presets.py CHANGED
@@ -29,25 +29,19 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     /* seeks columns by name, avoiding failure if some do not exist */
-                     COLUMNS('^Metadata_ImageNumber$|^Image_Metadata_Well$|^Image_Metadata_Plate$')
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_ImageNumber,
+                 cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
+                 nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
              FROM
                  read_parquet('cytoplasm.parquet') AS cytoplasm
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
+             WHERE
+                 cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                 LEFT JOIN Image_Filtered AS image ON
-                     image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
          """,
      },
      "cellprofiler_sqlite": {
@@ -74,26 +68,21 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
                  read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
                  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
-                 LEFT JOIN Per_Image_Filtered AS per_image ON
-                     per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
          """,
      },
      "cellprofiler_sqlite_pycytominer": {
@@ -125,26 +114,21 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
                  read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                 LEFT JOIN Per_Image_Filtered AS per_image ON
-                     per_image.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
          """,
      },
      "cell-health-cellprofiler-to-cytominer-database": {
@@ -178,30 +162,22 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     Metadata_TableNumber,
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_TableNumber,
+                 image.Metadata_ImageNumber,
+                 image.Image_Metadata_Well,
+                 image.Image_Metadata_Plate,
+                 cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
              FROM
                  read_parquet('cytoplasm.parquet') AS cytoplasm
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_TableNumber, Metadata_ImageNumber)
+             WHERE
+                 cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
-                 LEFT JOIN Image_Filtered AS image ON
-                     image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
          """,
      },
      "in-carta": {
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Union, cast
 
  import duckdb
  import parsl
@@ -171,6 +171,8 @@ def _sqlite_mixed_type_query_to_parquet(
      table_name: str,
      chunk_size: int,
      offset: int,
+     sort_output: bool,
+     add_cytotable_meta: bool = False,
  ) -> str:
      """
      Performs SQLite table data extraction where one or many
@@ -186,6 +188,10 @@ def _sqlite_mixed_type_query_to_parquet(
              Row count to use for chunked output.
          offset: int:
              The offset for chunking the data from source.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
+         add_cytotable_meta: bool, default=False:
+             Whether to add CytoTable metadata fields or not
 
      Returns:
          pyarrow.Table:
@@ -195,7 +201,10 @@ def _sqlite_mixed_type_query_to_parquet(
 
      import pyarrow as pa
 
-     from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
+     )
      from cytotable.exceptions import DatatypeException
 
      # open sqlite3 connection
@@ -207,7 +216,7 @@ def _sqlite_mixed_type_query_to_parquet(
      # See the following for more information:
      # https://sqlite.org/pragma.html#pragma_table_info
      cursor.execute(
-         f"""
+         """
          SELECT :table_name as table_name,
                 name as column_name,
                 type as column_type
@@ -255,15 +264,45 @@ def _sqlite_mixed_type_query_to_parquet(
          for col in column_info
      ]
 
+     if add_cytotable_meta:
+         query_parts += [
+             (
+                 f"CAST( '{f'{source_path}_table_{table_name}'}' "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
+                 "AS cytotable_meta_source_path"
+             ),
+             (
+                 f"CAST( {offset} "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
+                 "AS cytotable_meta_offset"
+             ),
+             (
+                 f"CAST( (ROW_NUMBER() OVER ()) AS "
+                 f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
+                 "AS cytotable_meta_rownum"
+             ),
+         ]
+
      # perform the select using the cases built above and using chunksize + offset
-     cursor.execute(
+     sql_stmt = (
          f"""
-         SELECT {', '.join(query_parts)}
+         SELECT
+             {', '.join(query_parts)}
          FROM {table_name}
          ORDER BY {', '.join([col['column_name'] for col in column_info])}
          LIMIT {chunk_size} OFFSET {offset};
          """
+         if sort_output
+         else f"""
+         SELECT
+             {', '.join(query_parts)}
+         FROM {table_name}
+         LIMIT {chunk_size} OFFSET {offset};
+         """
      )
+
+     # execute the sql stmt
+     cursor.execute(sql_stmt)
      # collect the results and include the column name with values
      results = [
          dict(zip([desc[0] for desc in cursor.description], row))
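On the SQLite path the same metadata columns are appended as CAST expressions in the select list; ROW_NUMBER() OVER () needs SQLite 3.25+ (any recent Python build qualifies). A standalone sketch of the generated query shape, with a hypothetical table and data:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE example (a INTEGER)")
    conn.executemany("INSERT INTO example VALUES (?)", [(10,), (20,)])
    rows = conn.execute(
        """
        SELECT
            a,
            CAST('example.sqlite_table_example' AS TEXT) AS cytotable_meta_source_path,
            CAST(0 AS INTEGER) AS cytotable_meta_offset,
            CAST((ROW_NUMBER() OVER ()) AS INTEGER) AS cytotable_meta_rownum
        FROM example
        ORDER BY a
        LIMIT 50 OFFSET 0;
        """
    ).fetchall()
    print(rows)  # [(10, 'example.sqlite_table_example', 0, 1), (20, ..., 0, 2)]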
@@ -462,3 +501,97 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
          ),
          **kwargs,
      )
+
+
+ def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
+     """
+     Helper function to unwrap futures from values or return values
+     where there are no futures.
+
+     Args:
+         val: Union[parsl.dataflow.futures.AppFuture, Any]
+             A value which may or may not be a Parsl future which
+             needs to be evaluated.
+
+     Returns:
+         Any
+             Returns the value as-is if there's no future, the future
+             result if Parsl futures are encountered.
+     """
+
+     # if we have a future value, evaluate the result
+     if isinstance(val, parsl.dataflow.futures.AppFuture):
+         return val.result()
+     elif isinstance(val, list):
+         # if we have a list of futures, return the results
+         if isinstance(val[0], parsl.dataflow.futures.AppFuture):
+             return [elem.result() for elem in val]
+
+     # otherwise return the value
+     return val
+
+
+ def _unwrap_source(
+     source: Union[
+         Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+         Union[parsl.dataflow.futures.AppFuture, Any],
+     ]
+ ) -> Union[Dict[str, Any], Any]:
+     """
+     Helper function to unwrap futures from sources.
+
+     Args:
+         source: Union[
+             Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+             Union[parsl.dataflow.futures.AppFuture, Any],
+         ]
+             A source is a portion of an internal data structure used by
+             CytoTable for processing and organizing data results.
+     Returns:
+         Union[Dict[str, Any], Any]
+             An evaluated dictionary or other value type.
+     """
+     # if we have a dictionary, unwrap any values which may be futures
+     if isinstance(source, dict):
+         return {key: _unwrap_value(val) for key, val in source.items()}
+     else:
+         # otherwise try to unwrap the source as-is without dictionary nesting
+         return _unwrap_value(source)
+
+
+ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+     """
+     Evaluates any Parsl futures for use within other tasks.
+     This enables a pattern of Parsl app usage as "tasks" and delayed
+     future result evaluation for concurrency.
+
+     Args:
+         sources: Union[Dict[str, List[Dict[str, Any]]], str]
+             Sources are an internal data structure used by CytoTable for
+             processing and organizing data results. They may include futures
+             which require asynchronous processing through Parsl, so we
+             process them through this function.
+
+     Returns:
+         Union[Dict[str, List[Dict[str, Any]]], str]
+             A data structure which includes evaluated futures where they were found.
+     """
+
+     return (
+         {
+             source_group_name: [
+                 # unwrap sources into future results
+                 _unwrap_source(source)
+                 for source in (
+                     source_group_vals.result()
+                     # if we have a future, return the result
+                     if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
+                     # otherwise return the value
+                     else source_group_vals
+                 )
+             ]
+             for source_group_name, source_group_vals in sources.items()
+             # if we have a dict, use the above, otherwise unwrap the value in case of future
+         }
+         if isinstance(sources, dict)
+         else _unwrap_value(sources)
+     )
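evaluate_futures is what allows 0.0.8 to drop the eager .result() calls that previously punctuated _to_parquet: apps hand back AppFutures which are only resolved once a downstream step needs concrete values. A minimal sketch of that task-then-evaluate pattern using Parsl's local thread executor (the double app is illustrative only):

    import parsl
    from parsl.app.app import python_app
    from parsl.configs.local_threads import config

    from cytotable.utils import evaluate_futures

    parsl.load(config)

    @python_app
    def double(x: int) -> int:
        return x * 2

    # futures are created eagerly here; nothing blocks yet
    pending = {"group": [{"value": double(i)} for i in range(3)]}

    # evaluate_futures walks the structure and resolves futures at the end
    print(evaluate_futures(pending))
    # {'group': [{'value': 0}, {'value': 2}, {'value': 4}]}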
cytotable-0.0.8.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.7
+ Version: 0.0.8
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
cytotable-0.0.8.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
+ cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
+ cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
+ cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
+ cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
+ cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
+ cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ cytotable-0.0.8.dist-info/RECORD,,
cytotable-0.0.7.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=3xspHDpARY8WLv1EQOR-RWnqpadANuo2uK_MMKnFD8k,315
- cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
- cytotable/convert.py,sha256=EjEZpWvm3oPgDx1dKlfHETgs52blL79dBzfhcPOOK6o,51771
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=HSrINU0XzF4i4zxjNMMw9F0rRxgr6mm3V7Gh_Wb-uFI,10773
- cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
- cytotable/utils.py,sha256=E5r1Vk3eaCB42JFquQHpGQXdAy97kGl-YiapmOkURwA,14476
- cytotable-0.0.7.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.7.dist-info/METADATA,sha256=U1kwsaRSVKB8iwlSw3iP3tLDO2LeKT9xjG1ctiWnHg0,3420
- cytotable-0.0.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.7.dist-info/RECORD,,