CytoTable 0.0.6 (py3-none-any.whl) → 0.0.8 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """
  
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.6"
+ __version__ = "0.0.8"
  
  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,6 +68,13 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
      ],
  }
  
+ # metadata column names and types for internal use within CytoTable
+ CYOTABLE_META_COLUMN_TYPES = {
+     "cytotable_meta_source_path": "VARCHAR",
+     "cytotable_meta_offset": "BIGINT",
+     "cytotable_meta_rownum": "BIGINT",
+ }
+ 
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
      "data-producer": "https://github.com/cytomining/CytoTable",
      "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -8,23 +8,26 @@ import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
  
  import parsl
- import pyarrow as pa
- from parsl.app.app import join_app, python_app
+ from parsl.app.app import python_app
  
  from cytotable.exceptions import CytoTableException
  from cytotable.presets import config
+ from cytotable.sources import _gather_sources
  from cytotable.utils import (
      _column_sort,
      _default_parsl_config,
      _expand_path,
      _parsl_loaded,
+     evaluate_futures,
  )
  
  logger = logging.getLogger(__name__)
  
  
  @python_app
- def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]:
+ def _get_table_columns_and_types(
+     source: Dict[str, Any], sort_output: bool
+ ) -> List[Dict[str, str]]:
      """
      Gather column data from table through duckdb.
  
@@ -32,6 +35,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
          source: Dict[str, Any]
              Contains the source data to be chunked. Represents a single
              file or table of some kind.
+         sort_output:
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          List[Dict[str, str]]
@@ -109,6 +114,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
          # offset is set to 0 start at first row
          # result from table
          offset=0,
+         add_cytotable_meta=False,
+         sort_output=sort_output,
      )
      with _duckdb_reader() as ddb_reader:
          return (
@@ -275,6 +282,7 @@ def _source_chunk_to_parquet(
      chunk_size: int,
      offset: int,
      dest_path: str,
+     sort_output: bool,
  ) -> str:
      """
      Export source data to chunked parquet file using chunk size and offsets.
@@ -291,6 +299,8 @@ def _source_chunk_to_parquet(
              The offset for chunking the data from source.
          dest_path: str
              Path to store the output data.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          str
@@ -303,6 +313,7 @@ def _source_chunk_to_parquet(
      from cloudpathlib import AnyPath
      from pyarrow import parquet
  
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import (
          _duckdb_reader,
          _sqlite_mixed_type_query_to_parquet,
@@ -316,13 +327,39 @@ def _source_chunk_to_parquet(
      )
      pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
  
+     source_path_str = (
+         source["source_path"]
+         if "table_name" not in source.keys()
+         else f"{source['source_path']}_table_{source['table_name']}"
+     )
      # build the column selection block of query
+ 
+     # add cytotable metadata columns
+     cytotable_metadata_cols = [
+         (
+             f"CAST( '{source_path_str}' "
+             f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
+             ' AS "cytotable_meta_source_path"'
+         ),
+         f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
+         (
+             f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
+             ' AS "cytotable_meta_rownum"'
+         ),
+     ]
+     # add source table columns
+     casted_source_cols = [
+         # here we cast the column to the specified type ensure the colname remains the same
+         f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
+         for column in source["columns"]
+     ]
+ 
+     # create selection statement from lists above
      select_columns = ",".join(
-         [
-             # here we cast the column to the specified type ensure the colname remains the same
-             f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
-             for column in source["columns"]
-         ]
+         # if we should sort the output, add the metadata_cols
+         cytotable_metadata_cols + casted_source_cols
+         if sort_output
+         else casted_source_cols
      )
  
      # build output query and filepath base
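To make the conditional selection above concrete, here is a self-contained rendering with hypothetical column metadata (the values are examples only): when `sort_output` is enabled, the generated SELECT list leads with the cytotable_meta_* casts; otherwise only the casted source columns remain.

```python
# Example data standing in for source["columns"]; not from the package.
cytotable_metadata_cols = ['CAST( 0 AS BIGINT) AS "cytotable_meta_offset"']
casted_source_cols = [
    f"CAST(\"{col['column_name']}\" AS {col['column_dtype']}) AS \"{col['column_name']}\""
    for col in [{"column_name": "AreaShape_Area", "column_dtype": "FLOAT"}]
]

for sort_output in (True, False):
    select_columns = ",".join(
        cytotable_metadata_cols + casted_source_cols
        if sort_output
        else casted_source_cols
    )
    print(f"SELECT {select_columns} FROM source_table")
```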
@@ -348,6 +385,13 @@ def _source_chunk_to_parquet(
              table=ddb_reader.execute(
                  f"""
                  {base_query}
+                 /* order by all columns for deterministic output */
+                 ORDER BY ALL
+                 LIMIT {chunk_size} OFFSET {offset}
+                 """
+                 if sort_output
+                 else f"""
+                 {base_query}
                  LIMIT {chunk_size} OFFSET {offset}
                  """
              ).arrow(),
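`ORDER BY ALL` is DuckDB shorthand for ordering by every selected column left to right; combined with LIMIT/OFFSET it makes each chunk reproducible, since SQL guarantees no row order otherwise and chunks could overlap or skip rows between runs. A small demonstration, assuming the `duckdb` package is installed:

```python
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE t AS SELECT * FROM range(10) AS r(i)")

chunk_size = 4
chunks = [
    con.execute(
        f"SELECT * FROM t ORDER BY ALL LIMIT {chunk_size} OFFSET {offset}"
    ).fetchall()
    for offset in (0, 4, 8)
]
print(chunks)  # stable, non-overlapping chunks on every run
```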
@@ -372,6 +416,8 @@ def _source_chunk_to_parquet(
                  table_name=str(source["table_name"]),
                  chunk_size=chunk_size,
                  offset=offset,
+                 add_cytotable_meta=True if sort_output else False,
+                 sort_output=sort_output,
              ),
              where=result_filepath,
          )
@@ -420,7 +466,10 @@ def _prepend_column_name(
  
      import pyarrow.parquet as parquet
  
-     from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+     )
      from cytotable.utils import _write_parquet_table_with_metadata
  
      logger = logging.getLogger(__name__)
@@ -468,8 +517,10 @@ def _prepend_column_name(
          # source_group_name_stem: 'Cells'
          # column_name: 'AreaShape_Area'
          # updated_column_name: 'Cells_AreaShape_Area'
-         if column_name not in identifying_columns and not column_name.startswith(
-             source_group_name_stem.capitalize()
+         if (
+             column_name not in identifying_columns
+             and not column_name.startswith(source_group_name_stem.capitalize())
+             and column_name not in CYOTABLE_META_COLUMN_TYPES
          ):
              updated_column_names.append(f"{source_group_name_stem}_{column_name}")
          # if-condition for prepending 'Metadata_' to column name
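Distilled, the renaming rule above: a compartment column gains the source stem prefix unless it is an identifying column, already starts with the stem, or is one of the new internal metadata columns. A compact restatement (the helper below is illustrative, not package code):

```python
def prepend_stem(column_name: str, stem: str, identifying: set, meta: set) -> str:
    """Illustrative restatement of the if-condition in _prepend_column_name."""
    if (
        column_name not in identifying
        and not column_name.startswith(stem.capitalize())
        and column_name not in meta
    ):
        return f"{stem}_{column_name}"
    return column_name


meta_cols = {
    "cytotable_meta_source_path",
    "cytotable_meta_offset",
    "cytotable_meta_rownum",
}
print(prepend_stem("AreaShape_Area", "Cells", {"ImageNumber"}, meta_cols))
# Cells_AreaShape_Area
print(prepend_stem("cytotable_meta_rownum", "Cells", {"ImageNumber"}, meta_cols))
# cytotable_meta_rownum (left untouched)
```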
@@ -677,6 +728,7 @@ def _concat_source_group(
  def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
      joins: str,
+     sort_output: bool,
  ) -> str:
      """
      Prepare join SQL statement with actual locations of data based on the sources.
@@ -688,6 +740,8 @@ def _prepare_join_sql(
          joins: str:
              DuckDB-compatible SQL which will be used to perform the join
              operations using the join_group keys as a reference.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          str:
@@ -695,15 +749,30 @@ def _prepare_join_sql(
      """
      import pathlib
  
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
+ 
      # replace with real location of sources for join sql
+     order_by_tables = []
      for key, val in sources.items():
          if pathlib.Path(key).stem.lower() in joins.lower():
+             table_name = str(pathlib.Path(key).stem.lower())
              joins = joins.replace(
-                 f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                 f"'{table_name}.parquet'",
                  str([str(table) for table in val[0]["table"]]),
              )
+             order_by_tables.append(table_name)
+ 
+     # create order by statement with from all tables using cytotable metadata
+     order_by_sql = "ORDER BY " + ", ".join(
+         [
+             f"{table}.{meta_column}"
+             for table in order_by_tables
+             for meta_column in CYOTABLE_META_COLUMN_TYPES
+         ]
+     )
  
-     return joins
+     # add the order by statements to the join
+     return joins + order_by_sql if sort_output else joins
  
  
  @python_app
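The ORDER BY built above is the cartesian product of the matched table names and the three metadata columns, appended to the join SQL only when `sort_output` is set. Rendered with hypothetical table stems:

```python
CYOTABLE_META_COLUMN_TYPES = {
    "cytotable_meta_source_path": "VARCHAR",
    "cytotable_meta_offset": "BIGINT",
    "cytotable_meta_rownum": "BIGINT",
}
order_by_tables = ["cytoplasm", "cells", "nuclei"]  # example stems

order_by_sql = "ORDER BY " + ", ".join(
    [
        f"{table}.{meta_column}"
        for table in order_by_tables
        for meta_column in CYOTABLE_META_COLUMN_TYPES
    ]
)
print(order_by_sql)
# ORDER BY cytoplasm.cytotable_meta_source_path, cytoplasm.cytotable_meta_offset, ...
```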
@@ -737,8 +806,7 @@ def _join_source_chunk(
  
      import pathlib
  
-     import pyarrow.parquet as parquet
- 
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
  
      # Attempt to read the data to parquet file
@@ -746,11 +814,21 @@ def _join_source_chunk(
      # writing data to a parquet file.
      # read data with chunk size + offset
      # and export to parquet
+     exclude_meta_cols = [
+         f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
+     ]
      with _duckdb_reader() as ddb_reader:
          result = ddb_reader.execute(
              f"""
+             WITH joined AS (
                  {joins}
                  LIMIT {chunk_size} OFFSET {offset}
+             )
+             SELECT
+                 /* exclude metadata columns from the results
+                 by using a lambda on column names based on exclude_meta_cols. */
+                 COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
+             FROM joined;
              """
          ).arrow()
  
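The rewritten query wraps the join in a CTE and then uses DuckDB's lambda form of the `COLUMNS` expression to drop the bookkeeping columns by name prefix, so they steer ordering but never reach the output. A self-contained demonstration, assuming `duckdb` and `pyarrow` are installed:

```python
import duckdb

con = duckdb.connect()
con.execute(
    """
    CREATE TABLE joined AS
    SELECT
        1 AS feature_a,
        2 AS feature_b,
        'x.sqlite' AS cytotable_meta_source_path,
        0 AS cytotable_meta_offset
    """
)
exclude_meta_cols = [
    f"c NOT LIKE '{col}%'"
    for col in ("cytotable_meta_source_path", "cytotable_meta_offset")
]
table = con.execute(
    f"SELECT COLUMNS (c -> ({' AND '.join(exclude_meta_cols)})) FROM joined"
).arrow()
print(table.column_names)  # ['feature_a', 'feature_b']
```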
@@ -957,40 +1035,20 @@ def _infer_source_group_common_schema(
      )
  
  
- @python_app
- def _return_future(input: Any) -> Any:
-     """
-     This is a simple wrapper python_app to allow
-     the return of join_app-compliant output (must be a Parsl future)
- 
-     Args:
-         input: Any
-             Any input which will be used within the context of a
-             Parsl join_app future return.
- 
-     Returns:
-         Any
-             Returns the input as provided wrapped within the context
-             of a python_app for the purpose of a join_app.
-     """
- 
-     return input
- 
- 
- @join_app
  def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      source_path: str,
      dest_path: str,
      source_datatype: Optional[str],
-     metadata: Union[List[str], Tuple[str, ...]],
-     compartments: Union[List[str], Tuple[str, ...]],
-     identifying_columns: Union[List[str], Tuple[str, ...]],
+     metadata: Optional[Union[List[str], Tuple[str, ...]]],
+     compartments: Optional[Union[List[str], Tuple[str, ...]]],
+     identifying_columns: Optional[Union[List[str], Tuple[str, ...]]],
      concat: bool,
      join: bool,
      joins: Optional[str],
      chunk_size: Optional[int],
      infer_common_schema: bool,
      drop_null: bool,
+     sort_output: bool,
      data_type_cast_map: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1029,6 +1087,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              Whether to infer a common schema when concatenating sources.
          drop_null: bool:
              Whether to drop null results.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
          data_type_cast_map: Dict[str, str]
              A dictionary mapping data type groups to specific types.
              Roughly includes Arrow data types language from:
@@ -1044,24 +1104,15 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              result.
      """
  
-     from cytotable.convert import (
-         _concat_join_sources,
-         _concat_source_group,
-         _get_table_chunk_offsets,
-         _infer_source_group_common_schema,
-         _join_source_chunk,
-         _prepend_column_name,
-         _return_future,
-         _source_chunk_to_parquet,
-     )
-     from cytotable.sources import _gather_sources
-     from cytotable.utils import _expand_path
- 
      # gather sources to be processed
      sources = _gather_sources(
          source_path=source_path,
          source_datatype=source_datatype,
-         targets=list(metadata) + list(compartments),
+         targets=(
+             list(metadata) + list(compartments)
+             if metadata is not None and compartments is not None
+             else []
+         ),
          **kwargs,
      ).result()
  
@@ -1077,7 +1128,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  "offsets": _get_table_chunk_offsets(
                      source=source,
                      chunk_size=chunk_size,
-                 ).result()
+                 )
              },
          )
          for source in source_group_vals
@@ -1094,7 +1145,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              for source in source_group_vals
              if source["offsets"] is not None
          ]
-         for source_group_name, source_group_vals in offsets_prepared.items()
+         for source_group_name, source_group_vals in evaluate_futures(
+             offsets_prepared
+         ).items()
          # ensure we have source_groups with at least one source table
          if len(source_group_vals) > 0
      }
@@ -1107,10 +1160,10 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  **{
                      "columns": _prep_cast_column_data_types(
                          columns=_get_table_columns_and_types(
-                             source=source,
+                             source=source, sort_output=sort_output
                          ),
                          data_type_cast_map=data_type_cast_map,
-                     ).result()
+                     )
                  },
              )
              for source in source_group_vals
@@ -1133,33 +1186,40 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                              chunk_size=chunk_size,
                              offset=offset,
                              dest_path=expanded_dest_path,
+                             sort_output=sort_output,
                          ),
                          source_group_name=source_group_name,
                          identifying_columns=identifying_columns,
                          metadata=metadata,
                          compartments=compartments,
-                     ).result()
+                     )
                      for offset in source["offsets"]
                  ]
              },
          )
          for source in source_group_vals
      ]
-         for source_group_name, source_group_vals in column_names_and_types_gathered.items()
+         for source_group_name, source_group_vals in evaluate_futures(
+             column_names_and_types_gathered
+         ).items()
      }
  
      # if we're concatting or joining and need to infer the common schema
      if (concat or join) and infer_common_schema:
          # create a common schema for concatenation work
          common_schema_determined = {
-             source_group_name: {
-                 "sources": source_group_vals,
-                 "common_schema": _infer_source_group_common_schema(
-                     source_group=source_group_vals,
-                     data_type_cast_map=data_type_cast_map,
-                 ),
-             }
-             for source_group_name, source_group_vals in results.items()
+             source_group_name: [
+                 {
+                     "sources": source_group_vals,
+                     "common_schema": _infer_source_group_common_schema(
+                         source_group=source_group_vals,
+                         data_type_cast_map=data_type_cast_map,
+                     ),
+                 }
+             ]
+             for source_group_name, source_group_vals in evaluate_futures(
+                 results
+             ).items()
          }
  
      # if concat or join, concat the source groups
@@ -1171,17 +1231,24 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
          results = {
              source_group_name: _concat_source_group(
                  source_group_name=source_group_name,
-                 source_group=source_group_vals["sources"],
+                 source_group=source_group_vals[0]["sources"],
                  dest_path=expanded_dest_path,
-                 common_schema=source_group_vals["common_schema"],
-             ).result()
-             for source_group_name, source_group_vals in common_schema_determined.items()
+                 common_schema=source_group_vals[0]["common_schema"],
+             )
+             for source_group_name, source_group_vals in evaluate_futures(
+                 common_schema_determined
+             ).items()
          }
  
      # conditional section for merging
      # note: join implies a concat, but concat does not imply a join
      if join:
-         prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+         # evaluate the results as they're used multiple times below
+         evaluated_results = evaluate_futures(results)
+ 
+         prepared_joins_sql = _prepare_join_sql(
+             sources=evaluated_results, joins=joins, sort_output=sort_output
+         ).result()
  
          # map joined results based on the join groups gathered above
          # note: after mapping we end up with a list of strings (task returns str)
@@ -1195,7 +1262,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  chunk_size=chunk_size,
                  offset=offset,
                  drop_null=drop_null,
-             ).result()
+             )
              # create join group for querying the concatenated
              # data in order to perform memory-safe joining
              # per user chunk size specification.
@@ -1210,12 +1277,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
          # for lineage and debugging
          results = _concat_join_sources(
              dest_path=expanded_dest_path,
-             join_sources=join_sources_result,
-             sources=results,
-         ).result()
+             join_sources=[join.result() for join in join_sources_result],
+             sources=evaluated_results,
+         )
  
      # wrap the final result as a future and return
-     return _return_future(results)
+     return evaluate_futures(results)
  
  
  def convert(  # pylint: disable=too-many-arguments,too-many-locals
@@ -1233,6 +1300,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
      infer_common_schema: bool = True,
      drop_null: bool = False,
      data_type_cast_map: Optional[Dict[str, str]] = None,
+     sort_output: bool = True,
      preset: Optional[str] = "cellprofiler_csv",
      parsl_config: Optional[parsl.Config] = None,
      **kwargs,
@@ -1274,8 +1342,14 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
              DuckDB-compatible SQL which will be used to perform the join operations.
          chunk_size: Optional[int] (Default value = None)
              Size of join chunks which is used to limit data size during join ops
-         infer_common_schema: bool: (Default value = True)
+         infer_common_schema: bool (Default value = True)
              Whether to infer a common schema when concatenating sources.
+         data_type_cast_map: Dict[str, str], (Default value = None)
+             A dictionary mapping data type groups to specific types.
+             Roughly includes Arrow data types language from:
+             https://arrow.apache.org/docs/python/api/datatypes.html
+         sort_output: bool (Default value = True)
+             Specifies whether to sort cytotable output or not.
          drop_null: bool (Default value = False)
              Whether to drop nan/null values from results
          preset: str (Default value = "cellprofiler_csv")
@@ -1390,7 +1464,8 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
          infer_common_schema=infer_common_schema,
          drop_null=drop_null,
          data_type_cast_map=data_type_cast_map,
+         sort_output=sort_output,
          **kwargs,
-     ).result()
+     )
  
      return output
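From the caller's perspective the visible change is the new `sort_output` keyword on `convert`, defaulting to True. A hypothetical invocation (paths and preset chosen for illustration; `dest_datatype` per the package's documented API):

```python
from cytotable import convert

result = convert(
    source_path="./all_cellprofiler.sqlite",  # placeholder input path
    dest_path="./all_cellprofiler.parquet",   # placeholder output path
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    sort_output=True,  # default; emits deterministically ordered chunks
)
```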
cytotable/presets.py CHANGED
@@ -29,24 +29,18 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     /* seeks columns by name, avoiding failure if some do not exist */
-                     COLUMNS('^Metadata_ImageNumber$|^Image_Metadata_Well$|^Image_Metadata_Plate$')
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_ImageNumber,
+                 cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
+                 nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
              FROM
-                 Image_Filtered AS image
-                 LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
-                     cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 read_parquet('cytoplasm.parquet') AS cytoplasm
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
+             WHERE
+                 cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
@@ -74,25 +68,20 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
-                 Per_Image_Filtered AS per_image
-                 LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
-                     per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
                  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
          """,
      },
@@ -125,25 +114,20 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
-                 Per_Image_Filtered AS per_image
-                 LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
-                     per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
@@ -178,29 +162,21 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     Metadata_TableNumber,
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_TableNumber,
+                 image.Metadata_ImageNumber,
+                 image.Image_Metadata_Well,
+                 image.Image_Metadata_Plate,
+                 cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
              FROM
-                 Image_Filtered AS image
-                 LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
-                     cytoplasm.Metadata_TableNumber = image.Metadata_TableNumber
-                     AND cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 read_parquet('cytoplasm.parquet') AS cytoplasm
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_TableNumber, Metadata_ImageNumber)
+             WHERE
+                 cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
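Across all four presets the pattern is the same: the image-filtering CTE disappears, the join now starts from the cytoplasm compartment, shared keys collapse into `USING (...)`, and the parent-object conditions move from the `ON` clauses into a trailing `WHERE`. One semantic consequence worth noting: filtering a LEFT JOIN's right-hand columns in `WHERE` discards unmatched rows, so the result behaves like an inner join on those relationships. A toy reproduction, assuming `duckdb`:

```python
import duckdb

con = duckdb.connect()
con.execute(
    "CREATE TABLE cytoplasm AS SELECT 1 AS Metadata_ImageNumber, "
    "10 AS Metadata_Cytoplasm_Parent_Cells"
)
con.execute(
    "CREATE TABLE cells AS SELECT 1 AS Metadata_ImageNumber, "
    "10 AS Metadata_ObjectNumber, 0.5 AS Cells_AreaShape_Area"
)
print(
    con.execute(
        """
        SELECT
            cytoplasm.*,
            cells.* EXCLUDE (Metadata_ImageNumber)
        FROM
            cytoplasm
            LEFT JOIN cells USING (Metadata_ImageNumber)
        WHERE
            cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
        """
    ).fetchall()
)
```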
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Union, cast
  
  import duckdb
  import parsl
@@ -171,6 +171,8 @@ def _sqlite_mixed_type_query_to_parquet(
      table_name: str,
      chunk_size: int,
      offset: int,
+     sort_output: bool,
+     add_cytotable_meta: bool = False,
  ) -> str:
      """
      Performs SQLite table data extraction where one or many
@@ -186,6 +188,10 @@ def _sqlite_mixed_type_query_to_parquet(
              Row count to use for chunked output.
          offset: int:
              The offset for chunking the data from source.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
+         add_cytotable_meta: bool, default=False:
+             Whether to add CytoTable metadata fields or not
  
      Returns:
          pyarrow.Table:
@@ -195,7 +201,10 @@ def _sqlite_mixed_type_query_to_parquet(
  
      import pyarrow as pa
  
-     from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
+     )
      from cytotable.exceptions import DatatypeException
  
      # open sqlite3 connection
@@ -207,7 +216,7 @@ def _sqlite_mixed_type_query_to_parquet(
      # See the following for more information:
      # https://sqlite.org/pragma.html#pragma_table_info
      cursor.execute(
-         f"""
+         """
          SELECT :table_name as table_name,
                 name as column_name,
                 type as column_type
@@ -255,10 +264,45 @@ def _sqlite_mixed_type_query_to_parquet(
          for col in column_info
      ]
  
+     if add_cytotable_meta:
+         query_parts += [
+             (
+                 f"CAST( '{f'{source_path}_table_{table_name}'}' "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
+                 "AS cytotable_meta_source_path"
+             ),
+             (
+                 f"CAST( {offset} "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
+                 "AS cytotable_meta_offset"
+             ),
+             (
+                 f"CAST( (ROW_NUMBER() OVER ()) AS "
+                 f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
+                 "AS cytotable_meta_rownum"
+             ),
+         ]
+ 
      # perform the select using the cases built above and using chunksize + offset
-     cursor.execute(
-         f'SELECT {", ".join(query_parts)} FROM {table_name} LIMIT {chunk_size} OFFSET {offset};'
+     sql_stmt = (
+         f"""
+         SELECT
+             {', '.join(query_parts)}
+         FROM {table_name}
+         ORDER BY {', '.join([col['column_name'] for col in column_info])}
+         LIMIT {chunk_size} OFFSET {offset};
+         """
+         if sort_output
+         else f"""
+         SELECT
+             {', '.join(query_parts)}
+         FROM {table_name}
+         LIMIT {chunk_size} OFFSET {offset};
+         """
      )
+ 
+     # execute the sql stmt
+     cursor.execute(sql_stmt)
      # collect the results and include the column name with values
      results = [
          dict(zip([desc[0] for desc in cursor.description], row))
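The same determinism fix lands on the SQLite mixed-type path: when `sort_output` is set, the query orders by every column before applying LIMIT/OFFSET, since SQLite, like DuckDB, guarantees no row order without ORDER BY. The pattern in isolation, using only the standard library:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE Cells (ImageNumber INTEGER, ObjectNumber INTEGER)")
con.executemany("INSERT INTO Cells VALUES (?, ?)", [(i % 3, i) for i in range(10)])

column_names = ["ImageNumber", "ObjectNumber"]
chunk_size, offset = 4, 4
rows = con.execute(
    f"""
    SELECT * FROM Cells
    ORDER BY {', '.join(column_names)}
    LIMIT {chunk_size} OFFSET {offset};
    """
).fetchall()
print(rows)  # the same middle chunk on every run
```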
@@ -457,3 +501,97 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
          ),
          **kwargs,
      )
+ 
+ 
+ def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
+     """
+     Helper function to unwrap futures from values or return values
+     where there are no futures.
+ 
+     Args:
+         val: Union[parsl.dataflow.futures.AppFuture, Any]
+             A value which may or may not be a Parsl future which
+             needs to be evaluated.
+ 
+     Returns:
+         Any
+             Returns the value as-is if there's no future, the future
+             result if Parsl futures are encountered.
+     """
+ 
+     # if we have a future value, evaluate the result
+     if isinstance(val, parsl.dataflow.futures.AppFuture):
+         return val.result()
+     elif isinstance(val, list):
+         # if we have a list of futures, return the results
+         if isinstance(val[0], parsl.dataflow.futures.AppFuture):
+             return [elem.result() for elem in val]
+ 
+     # otherwise return the value
+     return val
+ 
+ 
+ def _unwrap_source(
+     source: Union[
+         Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+         Union[parsl.dataflow.futures.AppFuture, Any],
+     ]
+ ) -> Union[Dict[str, Any], Any]:
+     """
+     Helper function to unwrap futures from sources.
+ 
+     Args:
+         source: Union[
+             Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+             Union[parsl.dataflow.futures.AppFuture, Any],
+         ]
+             A source is a portion of an internal data structure used by
+             CytoTable for processing and organizing data results.
+     Returns:
+         Union[Dict[str, Any], Any]
+             An evaluated dictionary or other value type.
+     """
+     # if we have a dictionary, unwrap any values which may be futures
+     if isinstance(source, dict):
+         return {key: _unwrap_value(val) for key, val in source.items()}
+     else:
+         # otherwise try to unwrap the source as-is without dictionary nesting
+         return _unwrap_value(source)
+ 
+ 
+ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+     """
+     Evaluates any Parsl futures for use within other tasks.
+     This enables a pattern of Parsl app usage as "tasks" and delayed
+     future result evaluation for concurrency.
+ 
+     Args:
+         sources: Union[Dict[str, List[Dict[str, Any]]], str]
+             Sources are an internal data structure used by CytoTable for
+             processing and organizing data results. They may include futures
+             which require asynchronous processing through Parsl, so we
+             process them through this function.
+ 
+     Returns:
+         Union[Dict[str, List[Dict[str, Any]]], str]
+             A data structure which includes evaluated futures where they were found.
+     """
+ 
+     return (
+         {
+             source_group_name: [
+                 # unwrap sources into future results
+                 _unwrap_source(source)
+                 for source in (
+                     source_group_vals.result()
+                     # if we have a future, return the result
+                     if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
+                     # otherwise return the value
+                     else source_group_vals
+                 )
+             ]
+             for source_group_name, source_group_vals in sources.items()
+             # if we have a dict, use the above, otherwise unwrap the value in case of future
+         }
+         if isinstance(sources, dict)
+         else _unwrap_value(sources)
+     )
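These helpers replace the removed `_return_future`/`join_app` machinery: tasks now hand futures around freely, and `evaluate_futures` resolves whole nested structures only when results are needed. The shape of the pattern, sketched with `concurrent.futures` stand-ins for Parsl AppFutures:

```python
from concurrent.futures import Future


def _unwrap_value(val):
    """Resolve a future or a list of futures; pass other values through."""
    if isinstance(val, Future):
        return val.result()
    if isinstance(val, list) and val and isinstance(val[0], Future):
        return [elem.result() for elem in val]
    return val


done = Future()
done.set_result("chunk-0.parquet")
sources = {"cells.sqlite": [{"table": [done], "offsets": [0, 50]}]}

evaluated = {
    name: [{key: _unwrap_value(val) for key, val in src.items()} for src in vals]
    for name, vals in sources.items()
}
print(evaluated)
# {'cells.sqlite': [{'table': ['chunk-0.parquet'], 'offsets': [0, 50]}]}
```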
cytotable-0.0.8.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.6
+ Version: 0.0.8
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
@@ -14,10 +14,14 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
- Requires-Dist: duckdb (>=0.8.0,<0.10.0)
+ Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
+ Requires-Dist: duckdb (>=0.10.1)
+ Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
+ Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
  Requires-Dist: parsl (>=2023.9.25)
  Requires-Dist: pyarrow (>=13.0.0)
+ Requires-Dist: scipy (<1.12.0) ; python_version < "3.9"
+ Requires-Dist: scipy (>=1.12.0,<2.0.0) ; python_version >= "3.9"
  Project-URL: Documentation, https://cytomining.github.io/CytoTable/
  Project-URL: Repository, https://github.com/cytomining/CytoTable
  Description-Content-Type: text/markdown
cytotable-0.0.8.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
+ cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
+ cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
+ cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
+ cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
+ cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
+ cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ cytotable-0.0.8.dist-info/RECORD,,
cytotable-0.0.6.dist-info/RECORD REMOVED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=BRJhTCcugpwKD1ONkiYUFjZMyCeO4t8f9161lrboXKY,315
- cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
- cytotable/convert.py,sha256=dXvzQPBel4Yp1zs_LZWQR1ZTV19G9WXCkrlTSXV6eWQ,51590
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=SYZXh0-eK-2VRRd8I30GCQcZ4wDMmhGes8KdDsxpFqg,10771
- cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
- cytotable/utils.py,sha256=9zqLf_95-phH6IdsDgpK3g3NkDG4odx0NUWogQDs31k,14344
- cytotable-0.0.6.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.6.dist-info/METADATA,sha256=j-BSYzl7cjaxsSR74luw-zvpPofTCYXVEBO1JIetvY0,3189
- cytotable-0.0.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.6.dist-info/RECORD,,