CytoTable: cytotable-0.0.13-py3-none-any.whl → cytotable-0.0.15-py3-none-any.whl

This diff shows the changes between these publicly released package versions as they appear in their public registry.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """
 
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.13"
+ __version__ = "0.0.15"
 
  from .convert import convert
  from .exceptions import (
cytotable/convert.py CHANGED
@@ -7,6 +7,7 @@ import logging
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
  import parsl
+ import pyarrow as pa
  from parsl.app.app import python_app
 
  from cytotable.exceptions import CytoTableException
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
  @python_app
  def _get_table_columns_and_types(
  source: Dict[str, Any], sort_output: bool
- ) -> List[Dict[str, str]]:
+ ) -> List[Optional[Dict[str, str]]]:
  """
  Gather column data from table through duckdb.
 
@@ -38,7 +39,7 @@ def _get_table_columns_and_types(
  Specifies whether to sort cytotable output or not.
 
  Returns:
- List[Dict[str, str]]
+ List[Optional[Dict[str, str]]]
  list of dictionaries which each include column level information
  """
 
@@ -49,6 +50,12 @@
  source_path = source["source_path"]
  source_type = str(source_path.suffix).lower()
 
+ # If we have .npz files, return a list with None
+ # because we're querying a non-tabular data source.
+ # These will be handled later by _extract_npz_to_parquet.
+ if source_type == ".npz":
+ return [None]
+
  # prepare the data source in the form of a duckdb query
  select_source = (
  f"read_csv_auto('{source_path}')"
@@ -279,7 +286,9 @@ def _get_table_keyset_pagination_sets(
  page_key: str,
  source: Optional[Dict[str, Any]] = None,
  sql_stmt: Optional[str] = None,
- ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
+ ) -> Union[
+ List[Optional[Tuple[Union[int, float], Union[int, float]]]], List[None], None
+ ]:
  """
  Get table data chunk keys for later use in capturing segments
  of values. This work also provides a chance to catch problematic
@@ -300,7 +309,7 @@ def _get_table_keyset_pagination_sets(
  data source.
 
  Returns:
- List[Any]
+ Union[List[Optional[Tuple[Union[int, float], Union[int, float]]]], None]
  List of keys to use for reading the data later on.
  """
 
@@ -324,8 +333,15 @@ def _get_table_keyset_pagination_sets(
  with _duckdb_reader() as ddb_reader:
  if source_type == ".csv":
  sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
- else:
+ elif source_type == ".sqlite":
  sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+ elif source_type == ".npz":
+ # If we have npz files there's no need to paginate
+ # so we return None. None within a list is used as
+ # a special "passthrough" case within the pipeline
+ # so we may specially handle NPZ files later on via
+ # _source_pageset_to_parquet and _extract_npz_to_parquet.
+ return [None]
 
  page_keys = [
  results[0] for results in ddb_reader.execute(sql_query).fetchall()
@@ -360,14 +376,16 @@ def _get_table_keyset_pagination_sets(
  page_keys = ddb_reader.execute(sql_query).fetchall()
  page_keys = [key[0] for key in page_keys]
 
- return _generate_pagesets(page_keys, chunk_size)
+ # The type: mention below is used to ignore a mypy linting error
+ # wherein it considers _generate_pagesets to be invalid.
+ return _generate_pagesets(page_keys, chunk_size) # type: ignore[return-value]
 
 
  @python_app
  def _source_pageset_to_parquet(
  source_group_name: str,
  source: Dict[str, Any],
- pageset: Tuple[Union[int, float], Union[int, float]],
+ pageset: Optional[Tuple[Union[int, float], Union[int, float]]],
  dest_path: str,
  sort_output: bool,
  ) -> str:
@@ -380,7 +398,7 @@ def _source_pageset_to_parquet(
  source: Dict[str, Any]
  Contains the source data to be chunked. Represents a single
  file or table of some kind along with collected information about table.
- pageset: Tuple[int, int]
+ pageset: Optional[Tuple[Union[int, float], Union[int, float]]]
  The pageset for chunking the data from source.
  dest_path: str
  Path to store the output data.
@@ -399,10 +417,13 @@ def _source_pageset_to_parquet(
 
  from cytotable.utils import (
  _duckdb_reader,
+ _extract_npz_to_parquet,
  _sqlite_mixed_type_query_to_parquet,
  _write_parquet_table_with_metadata,
  )
 
+ source_type = str(source["source_path"].suffix).lower()
+
  # attempt to build dest_path
  source_dest_path = (
  f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
@@ -410,6 +431,28 @@ def _source_pageset_to_parquet(
  )
  pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
+ # If we have npz files, we need to extract them in a specialized manner.
+ # See below for CSV and SQLite handling.
+ if source_type == ".npz":
+ return _extract_npz_to_parquet(
+ source_path=str(source["source_path"]),
+ dest_path=f"{source_dest_path}/{str(source['source_path'].stem)}.parquet",
+ tablenumber=source["tablenumber"],
+ )
+
+ elif pageset is None:
+ # if we have a `None` pageset and we're not using
+ # npz, then we have an exception (this shouldn't happen
+ # because we will need a pageset range to work with for
+ # table queries and npz files are handled above with
+ # the none case).
+ raise CytoTableException(
+ (
+ "No pageset range provided for source data"
+ " (required for non-NPZ datasets)."
+ )
+ )
+
  # build tablenumber segment addition (if necessary)
  tablenumber_sql = (
  # to become tablenumber in sql select later with bigint (8-byte integer)
@@ -439,11 +482,11 @@ def _source_pageset_to_parquet(
 
  # build output query and filepath base
  # (chunked output will append offset to keep output paths unique)
- if str(source["source_path"].suffix).lower() == ".csv":
+ if source_type == ".csv":
  base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
 
- elif str(source["source_path"].suffix).lower() == ".sqlite":
+ elif source_type == ".sqlite":
  base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
@@ -840,7 +883,7 @@ def _join_source_pageset(
  dest_path: str,
  joins: str,
  page_key: str,
- pageset: Tuple[int, int],
+ pageset: Union[Tuple[int, int], None],
  sort_output: bool,
  drop_null: bool,
  ) -> str:
@@ -877,7 +920,7 @@ def _join_source_pageset(
  )
  SELECT *
  FROM joined
- WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+ {f"WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}" if pageset is not None else ""}
  /* optional sorting per pagset */
  {"ORDER BY " + page_key if sort_output else ""};
  """
@@ -902,11 +945,13 @@ def _join_source_pageset(
 
  result_file_path = (
  # store the result in the parent of the dest_path
- f"{str(pathlib.Path(dest_path).parent)}/"
+ f"{str(pathlib.Path(dest_path).parent)}/" +
  # use the dest_path stem in the name
- f"{str(pathlib.Path(dest_path).stem)}-"
+ f"{str(pathlib.Path(dest_path).stem)}-" +
  # add the pageset indication to the filename
  f"{pageset[0]}-{pageset[1]}.parquet"
+ if pageset is not None
+ else ".parquet"
  )
 
  # write the result
@@ -1001,9 +1046,9 @@ def _concat_join_sources(
  def _infer_source_group_common_schema(
  source_group: List[Dict[str, Any]],
  data_type_cast_map: Optional[Dict[str, str]] = None,
- ) -> List[Tuple[str, str]]:
+ ) -> List[Tuple[str, pa.DataType]]:
  """
- Infers a common schema for group of parquet files which may have
+ Infers a common schema for a group of parquet files which may have
  similar but slightly different schema or data. Intended to assist with
  data concatenation and other operations.
 
@@ -1015,9 +1060,8 @@ def _infer_source_group_common_schema(
  A dictionary mapping data type groups to specific types.
  Roughly includes Arrow data types language from:
  https://arrow.apache.org/docs/python/api/datatypes.html
-
  Returns:
- List[Tuple[str, str]]
+ List[Tuple[str, pa.DataType]]
  A list of tuples which includes column name and PyArrow datatype.
  This data will later be used as the basis for forming a PyArrow schema.
  """
@@ -1025,32 +1069,31 @@ def _infer_source_group_common_schema(
  import pyarrow as pa
  import pyarrow.parquet as parquet
 
- from cytotable.exceptions import SchemaException
+ from cytotable.utils import map_pyarrow_type
 
- # read first file for basis of schema and column order for all others
+ # Read the first file to establish the base schema
  common_schema = parquet.read_schema(source_group[0]["table"][0])
 
- # infer common basis of schema and column order for all others
+ # Infer the common schema by comparing all schemas in the group
  for schema in [
  parquet.read_schema(table)
  for source in source_group
  for table in source["table"]
  ]:
- # account for completely equal schema
+ # Skip if the schema matches the common schema
  if schema.equals(common_schema):
  continue
 
- # gather field names from schema
+ # Gather field names from the schema
  schema_field_names = [item.name for item in schema]
 
- # reversed enumeration because removing indexes ascendingly changes schema field order
+ # Reverse enumeration to avoid index shifting when removing fields
  for index, field in reversed(list(enumerate(common_schema))):
- # check whether field name is contained within writer basis, remove if not
- # note: because this only checks for naming, we defer to initially detected type
+ # Remove fields not present in the current schema
  if field.name not in schema_field_names:
  common_schema = common_schema.remove(index)
 
- # check if we have a nulltype and non-nulltype conflict, deferring to non-nulltype
+ # Handle null vs non-null type conflicts
  elif pa.types.is_null(field.type) and not pa.types.is_null(
  schema.field(field.name).type
  ):
@@ -1058,37 +1101,44 @@ def _infer_source_group_common_schema(
  index, field.with_type(schema.field(field.name).type)
  )
 
- # check if we have an integer to float challenge and enable later casting
+ # Handle integer to float type conflicts
  elif pa.types.is_integer(field.type) and pa.types.is_floating(
  schema.field(field.name).type
  ):
  common_schema = common_schema.set(
  index,
  field.with_type(
- # use float64 as a default here if we aren't casting floats
  pa.float64()
  if data_type_cast_map is None
- or "float" not in data_type_cast_map.keys()
- # otherwise use the float data type cast type
- else pa.type_for_alias(data_type_cast_map["float"])
+ else pa.type_for_alias(
+ data_type_cast_map.get("float", "float64")
+ )
  ),
  )
 
- if len(list(common_schema.names)) == 0:
- raise SchemaException(
- (
- "No common schema basis to perform concatenation for source group."
- " All columns mismatch one another within the group."
- )
- )
+ # Handle nested or complex types dynamically
+ else:
+ common_schema = common_schema.set(
+ index,
+ field.with_type(
+ map_pyarrow_type(
+ field_type=field.type, data_type_cast_map=data_type_cast_map
+ )
+ ),
+ )
 
- # return a python-native list of tuples with column names and str types
- return list(
- zip(
- common_schema.names,
- [str(schema_type) for schema_type in common_schema.types],
+ # Validate the schema to ensure all types are valid PyArrow types
+ validated_schema = [
+ (
+ field.name,
+ map_pyarrow_type(
+ field_type=field.type, data_type_cast_map=data_type_cast_map
+ ),
+ )
  )
- )
+ for field in common_schema
+ ]
+
+ return validated_schema
 
 
  def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
@@ -1185,9 +1235,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  matching_keys = [
  key for key in page_keys.keys() if key.lower() in source_group_name.lower()
  ]
- if not matching_keys:
+ if not matching_keys and source_datatype != "npz":
  raise CytoTableException(
- f"No matching key found in page_keys for source_group_name: {source_group_name}."
+ f"No matching key found in page_keys for source_group_name: {source_group_name}. "
  "Please include a pagination key based on a column name from the table."
  )
 
@@ -1198,11 +1248,16 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  source,
  **{
  "page_key": (
- page_key := [
- value
- for key, value in page_keys.items()
- if key.lower() in source_group_name.lower()
- ][0]
+ page_key := next(
+ (
+ value
+ for key, value in page_keys.items()
+ if key.lower() in source_group_name.lower()
+ ),
+ # Placeholder value if no match is found
+ # used in cases for .npz source types.
+ "placeholder",
+ )
  ),
  "pagesets": _get_table_keyset_pagination_sets(
  source=source,
@@ -1598,4 +1653,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  **kwargs,
  )
 
+ # cleanup Parsl executor and related
+ parsl.dfk().cleanup()
+
  return output
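
Taken together, the convert.py changes thread an NPZ sentinel through the pipeline: `_get_table_keyset_pagination_sets` returns `[None]` for `.npz` sources, and `_source_pageset_to_parquet` treats a `None` pageset as the cue to hand the file to `_extract_npz_to_parquet` instead of running a paginated query. A minimal sketch of that dispatch order, outside of Parsl and with a hypothetical `dispatch_pageset` helper and simplified return values:

    import pathlib
    from typing import Any, Dict, Optional, Tuple

    def dispatch_pageset(
        source: Dict[str, Any],
        pageset: Optional[Tuple[float, float]],
    ) -> str:
        # simplified restatement of the new branch order in _source_pageset_to_parquet
        suffix = str(source["source_path"].suffix).lower()
        if suffix == ".npz":
            # .npz sources skip pagination; the [None] sentinel from
            # _get_table_keyset_pagination_sets arrives here as pageset=None
            return "extract with _extract_npz_to_parquet"
        if pageset is None:
            # tabular sources (.csv, .sqlite) always need a pageset range
            raise ValueError("No pageset range provided for source data.")
        return f"query rows where page_key is between {pageset[0]} and {pageset[1]}"

    # an .npz source passes straight through; a .csv source needs a range
    print(dispatch_pageset({"source_path": pathlib.Path("site_1.npz")}, None))
    print(dispatch_pageset({"source_path": pathlib.Path("cells.csv")}, (0, 999)))
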
cytotable/presets.py CHANGED
@@ -317,6 +317,37 @@ config = {
  # and modified at runtime as needed
  "CONFIG_JOINS": "",
  },
+ "deepprofiler": {
+ # version specifications using related references
+ "CONFIG_SOURCE_VERSION": {
+ "deepprofiler": "v0.3.1",
+ "cellprofiler": "v4.2.x",
+ },
+ # names of source table compartments (for ex. cells.csv, etc.)
+ # in the case of NPZ files, these sometimes
+ # include the name of the well or site
+ # but not the compartment, and as a result,
+ # we specify an empty tuple.
+ "CONFIG_NAMES_COMPARTMENTS": tuple(),
+ # names of source table metadata (for ex. image.csv, etc.)
+ "CONFIG_NAMES_METADATA": tuple(),
+ # column names in any compartment or metadata tables which contain
+ # unique names to avoid renaming
+ "CONFIG_IDENTIFYING_COLUMNS": tuple(),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "join": "Metadata_Site",
+ },
+ # chunk size to use for join operations to help with possible performance issues
+ # note: this number is an estimate and is may need changes contingent on data
+ # and system used by this library.
+ "CONFIG_CHUNK_SIZE": 1000,
+ # compartment and metadata joins performed using DuckDB SQL
+ # and modified at runtime as needed
+ "CONFIG_JOINS": "",
+ },
  }
  """
  Configuration presets for CytoTable
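
The new `deepprofiler` preset pairs with the NPZ support added in convert.py. A minimal usage sketch, assuming the usual `convert()` keyword arguments (`source_path`, `dest_path`, `dest_datatype`, `source_datatype`, `preset`); the paths below are hypothetical and all other arguments are left at their defaults:

    from cytotable import convert

    # convert a directory of DeepProfiler .npz files to parquet output
    result = convert(
        source_path="./deepprofiler_run",   # hypothetical directory of .npz files
        dest_path="./deepprofiler_parquet",
        dest_datatype="parquet",
        source_datatype="npz",
        preset="deepprofiler",
    )
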
cytotable/sources.py CHANGED
@@ -36,7 +36,10 @@ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
 
  # set the client for a CloudPath
  if isinstance(processed_path, CloudPath):
- processed_path.client = processed_path.client.__class__(**kwargs)
+ # Create a new client instance with the provided kwargs
+ client = processed_path.client.__class__(**kwargs)
+ # Recreate the CloudPath object with the new client
+ processed_path = client.CloudPath(processed_path)
 
  return processed_path
 
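The replaced line assigned a new client onto an already-constructed CloudPath, which does not take effect; the new code builds the client from the caller-supplied kwargs and re-creates the path through it. A standalone sketch of the same pattern with cloudpathlib, assuming anonymous S3 access and a hypothetical bucket:

    from cloudpathlib import S3Client

    # build a client from caller-provided kwargs (here: unsigned requests),
    # then create the path *through* that client so it is actually used
    client = S3Client(no_sign_request=True)
    path = client.CloudPath("s3://example-bucket/plate.sqlite")
    assert path.client is client
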
@@ -75,7 +78,9 @@ def _get_source_filepaths(
  "A source_datatype must be specified when using undefined compartments and metadata names."
  )
 
- # gathers files from provided path using compartments + metadata as a filter
+ source_datatypes = [".csv", ".npz", ".sqlite"] # Default supported extensions
+
+ # Gather files from the provided path using compartments + metadata as a filter
  sources = [
  # build source_paths for all files
  # note: builds local cache for sqlite files from cloud
@@ -90,16 +95,22 @@ def _get_source_filepaths(
  )
  # ensure the subpaths meet certain specifications
  if (
- targets is None
- or targets == []
- # checks for name of the file from targets (compartment + metadata names)
- or str(subpath.stem).lower() in [target.lower() for target in targets]
- # checks for sqlite extension (which may include compartment + metadata names)
- or subpath.suffix.lower() == ".sqlite"
+ # If targets are specified, only include files matching targets
+ (
+ targets is not None
+ and str(subpath.stem).lower() in [target.lower() for target in targets]
+ or subpath.suffix.lower() == ".sqlite"
+ )
+ # Otherwise, include files matching the source_datatypes
+ or (
+ targets is None
+ or targets == []
+ and subpath.suffix.lower() in source_datatypes
+ )
  )
  ]
 
- # expand sources to include sqlite tables similarly to files (one entry per table)
+ # Expand sources to include sqlite tables similarly to files (one entry per table)
  expanded_sources = []
  with _duckdb_reader() as ddb_reader:
  for element in sources:
@@ -118,8 +129,8 @@ def _get_source_filepaths(
  """
  /* perform query on sqlite_master table for metadata on tables */
  SELECT name as table_name
- from sqlite_scan(?, 'sqlite_master')
- where type='table'
+ FROM sqlite_scan(?, 'sqlite_master')
+ WHERE type='table'
  """,
  parameters=[str(element["source_path"])],
  )
@@ -153,10 +164,14 @@ def _get_source_filepaths(
  # use lowercase version of the path to infer a commonprefix
  source["source_path"].stem.lower()
  for source in sources
- if source["source_path"].suffix == f".{source_datatype}"
+ if source["source_path"].suffix in source_datatypes
  ]
  )
- grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+ grouped_sources[
+ # construct a grouped source name, deferring to use 'all_files'
+ # if no common prefix is found.
+ f"{common_prefix if common_prefix != '' else 'all_files'}.{source_datatype}"
+ ] = sources
 
  # otherwise, use the unique names in the paths to determine source grouping
  else:
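
The `all_files` fallback above can be illustrated in isolation. A small sketch assuming the common prefix is computed with `os.path.commonprefix` over lowercased stems (the exact helper CytoTable uses may differ, and the stems below are hypothetical):

    import os

    # with no shared prefix between the stems, commonprefix() returns ""
    # and the group name falls back to "all_files"
    stems = ["sq00014812_a01_1", "plate2_b03_5"]
    common_prefix = os.path.commonprefix(stems)
    group_name = f"{common_prefix if common_prefix != '' else 'all_files'}.npz"
    print(group_name)  # all_files.npz
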
@@ -283,7 +298,7 @@ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
 
  # if we don't have a sqlite file
  # (we can't check sqlite files for lines)
- if path.suffix.lower() != ".sqlite":
+ if path.suffix.lower() not in [".sqlite", ".npz"]:
  with path.open("r") as f:
  try:
  # read two lines, if the second is empty return false
cytotable/utils.py CHANGED
@@ -196,7 +196,7 @@ def _sqlite_mixed_type_query_to_parquet(
  The name of the table being queried.
  page_key: str:
  The column name to be used to identify pagination chunks.
- pageset: Tuple[int, int]:
+ pageset: Tuple[Union[int, float], Union[int, float]]:
  The range for values used for paginating data from source.
  sort_output: bool
  Specifies whether to sort cytotable output or not.
@@ -336,7 +336,7 @@ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
  if (
  isinstance(path, CloudPath)
  and path.is_file()
- and path.suffix.lower() == ".sqlite"
+ and path.suffix.lower() in [".sqlite", ".npz"]
  ):
  try:
  # update the path to be the local filepath for reference in CytoTable ops
@@ -706,3 +706,179 @@ def _natural_sort(list_to_sort):
  for c in re.split("([0-9]+)", str(key))
  ],
  )
+
+
+ def _extract_npz_to_parquet(
+ source_path: str,
+ dest_path: str,
+ tablenumber: Optional[int] = None,
+ ) -> str:
+ """
+ Extract data from an .npz file created by DeepProfiler
+ as a tabular dataset and write to parquet.
+
+ DeepProfiler creates datasets which look somewhat like this:
+ Keys in the .npz file: ['features', 'metadata', 'locations']
+
+ Variable: features
+ Shape: (229, 6400)
+ Data type: float32
+
+ Variable: locations
+ Shape: (229, 2)
+ Data type: float64
+
+ Variable: metadata
+ Shape: ()
+ Data type: object
+ Whole object: {
+ 'Metadata_Plate': 'SQ00014812',
+ 'Metadata_Well': 'A01',
+ 'Metadata_Site': 1,
+ 'Plate_Map_Name': 'C-7161-01-LM6-022',
+ 'RNA': 'SQ00014812/r01c01f01p01-ch3sk1fk1fl1.png',
+ 'ER': 'SQ00014812/r01c01f01p01-ch2sk1fk1fl1.png',
+ 'AGP': 'SQ00014812/r01c01f01p01-ch4sk1fk1fl1.png',
+ 'Mito': 'SQ00014812/r01c01f01p01-ch5sk1fk1fl1.png',
+ 'DNA': 'SQ00014812/r01c01f01p01-ch1sk1fk1fl1.png',
+ 'Treatment_ID': 0,
+ 'Treatment_Replicate': 1,
+ 'Treatment': 'DMSO@NA',
+ 'Compound': 'DMSO',
+ 'Concentration': '',
+ 'Split': 'Training',
+ 'Metadata_Model': 'efficientnet'
+ }
+
+ Args:
+ source_path: str
+ Path to the .npz file.
+ dest_path: str
+ Destination path for the parquet file.
+ tablenumber: Optional[int]
+ Optional tablenumber to be added to the data.
+
+ Returns:
+ str
+ Path to the exported parquet file.
+ """
+
+ import pathlib
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.parquet as parquet
+
+ # Load features from the .npz file
+ with open(source_path, "rb") as data:
+ loaded_npz = np.load(file=data, allow_pickle=True)
+ # find the shape of the features, which will help structure
+ # data which doesn't yet conform to the same shape (by row count).
+ rows = loaded_npz["features"].shape[0]
+ # note: we use [()] to load the numpy array as a python dict
+ metadata = loaded_npz["metadata"][()]
+ # fetch the metadata model name, falling back to "DP" if not found
+ feature_prefix = metadata.get("Metadata_Model", "DP")
+ # we transpose the feature data for more efficient
+ # columnar-focused access
+ feature_data = loaded_npz["features"].T
+
+ npz_as_pydict = {
+ # add metadata to the table
+ # note: metadata within npz files corresponds to a dictionary of
+ # various keys and values related to the feature and location data.
+ "Metadata_TableNumber": pa.array([tablenumber] * rows, type=pa.int64()),
+ "Metadata_NPZSource": pa.array(
+ [pathlib.Path(source_path).name] * rows, type=pa.string()
+ ),
+ **{key: [metadata[key]] * rows for key in metadata.keys()},
+ # add locations data to the table
+ "Location_Center_X": [loaded_npz["locations"][i][0] for i in range(rows)],
+ "Location_Center_Y": [loaded_npz["locations"][i][1] for i in range(rows)],
+ # add features data to the table
+ **{
+ f"{feature_prefix}_{feature_idx + 1}": feature_data[feature_idx]
+ for feature_idx in range(feature_data.shape[0])
+ },
+ }
+
+ # convert the numpy arrays to a PyArrow table and write to parquet
+ parquet.write_table(pa.Table.from_pydict(npz_as_pydict), dest_path)
+
+ return dest_path
+
+
+ def map_pyarrow_type(
+ field_type: pa.DataType, data_type_cast_map: Optional[Dict[str, str]]
+ ) -> pa.DataType:
+ """
+ Map PyArrow types dynamically to handle nested types and casting.
+
+ This function takes a PyArrow `field_type` and dynamically maps
+ it to a valid PyArrow type, handling nested types (e.g., lists,
+ structs) and resolving type conflicts (e.g., integer to float).
+ It also supports custom type casting using the
+ `data_type_cast_map` parameter.
+
+ Args:
+ field_type: pa.DataType
+ The PyArrow data type to be mapped.
+ This can include simple types (e.g., int, float, string)
+ or nested types (e.g., list, struct).
+ data_type_cast_map: Optional[Dict[str, str]], default None
+ A dictionary mapping data type groups to specific types.
+ This allows for custom type casting.
+ For example:
+ - {"float": "float32"} maps
+ floating-point types to `float32`.
+ - {"int": "int64"} maps integer
+ types to `int64`.
+ If `data_type_cast_map` is
+ None, default PyArrow types are used.
+
+ Returns:
+ pa.DataType
+ The mapped PyArrow data type.
+ If no mapping is needed, the original
+ `field_type` is returned.
+ """
+
+ if pa.types.is_list(field_type):
+ # Handle list types (e.g., list<element: float>)
+ return pa.list_(
+ map_pyarrow_type(
+ field_type=field_type.value_type, data_type_cast_map=data_type_cast_map
+ )
+ )
+ elif pa.types.is_struct(field_type):
+ # Handle struct types recursively
+ return pa.struct(
+ [
+ (
+ field.name,
+ map_pyarrow_type(
+ field_type=field.type, data_type_cast_map=data_type_cast_map
+ ),
+ )
+ for field in field_type
+ ]
+ )
+ elif pa.types.is_floating(field_type):
+ # Handle floating-point types
+ if data_type_cast_map and "float" in data_type_cast_map:
+ return pa.type_for_alias(data_type_cast_map["float"])
+ return pa.float64() # Default to float64 if no mapping is provided
+ elif pa.types.is_integer(field_type):
+ # Handle integer types
+ if data_type_cast_map and "integer" in data_type_cast_map:
+ return pa.type_for_alias(data_type_cast_map["integer"])
+ return pa.int64() # Default to int64 if no mapping is provided
+ elif pa.types.is_string(field_type):
+ # Handle string types
+ return pa.string()
+ elif pa.types.is_null(field_type):
+ # Handle null types
+ return pa.null()
+ else:
+ # Default to the original type if no mapping is needed
+ return field_type
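
Based on the branches shown above, the new `map_pyarrow_type` helper can be exercised directly once 0.0.15 is installed; a short sketch:

    import pyarrow as pa
    from cytotable.utils import map_pyarrow_type

    # a nested list of floats follows the "float" entry of data_type_cast_map
    mapped = map_pyarrow_type(
        field_type=pa.list_(pa.float64()),
        data_type_cast_map={"float": "float32"},
    )
    print(mapped)  # list<item: float>, i.e. a list of float32 values

    # with no cast map, floating-point types default to float64
    print(map_pyarrow_type(field_type=pa.float32(), data_type_cast_map=None))  # double
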
cytotable-0.0.15.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: CytoTable
- Version: 0.0.13
+ Version: 0.0.15
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  License: BSD-3-Clause License
  Keywords: python,cellprofiler,single-cell-analysis,way-lab
@@ -13,7 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
+ Requires-Dist: cloudpathlib[all,s3] (>=0.18,<0.22)
  Requires-Dist: duckdb (>=0.10.1)
  Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
  Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -29,6 +29,11 @@ Description-Content-Type: text/markdown
 
  # CytoTable
 
+ ![PyPI - Version](https://img.shields.io/pypi/v/cytotable)
+ [![Build Status](https://github.com/cytomining/cytotable/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/cytomining/cytotable/actions/workflows/test.yml?query=branch%3Amain)
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+ [![Software DOI badge](https://zenodo.org/badge/DOI/10.5281/zenodo.14888111.svg)](https://doi.org/10.5281/zenodo.14888111)
+
  ![dataflow](https://raw.githubusercontent.com/cytomining/cytotable/main/docs/source/_static/dataflow.svg?raw=true)
  _Diagram showing data flow relative to this project._
 
cytotable-0.0.15.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=-OBuWpdlzgV0Ioz3hUEMEvpRsWpH6AtrmP61mTIds94,316
+ cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+ cytotable/convert.py,sha256=uvdUM5-_cQCuTYA2RtWoA1_ZuGFmMQZUayChTRUWd5A,62916
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=pWYyZsEP-C1zbMUjTMzcJxCeRHcVpAAI-gMV2Nx-6Zc,16459
+ cytotable/sources.py,sha256=lScB3GPTIqDjl2Iea5zivjCEll9zYxJt1gIfj4WbCpQ,12959
+ cytotable/utils.py,sha256=nH5CBY8thWS6eJtl_SZ059GmcE4rUExZMWelVgjZvho,28937
+ cytotable-0.0.15.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.15.dist-info/METADATA,sha256=PkgYmCUmCj7BY6Uo6SgT-3PhcGmbMbqw01MT9yYXRBo,3866
+ cytotable-0.0.15.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+ cytotable-0.0.15.dist-info/RECORD,,
cytotable-0.0.15.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.1
+ Generator: poetry-core 2.1.2
  Root-Is-Purelib: true
  Tag: py3-none-any
cytotable-0.0.13.dist-info/RECORD REMOVED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=oohhN_5xl7FYudOQeaflYlKCwL0fBBM_ANxuuPKSFYM,316
- cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
- cytotable/convert.py,sha256=5VHnw0eGdfXTbSfeEoPAPVa-dtobM6VHkIJwscLe68M,60651
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=rmDOJ-r2yXj0Kr9GlnINKQR8QA3BVLM5z0UAiV0kK0k,15036
- cytotable/sources.py,sha256=LBkLxOhKo-g3_RYAEQnipV6dxbn7IQETDKfd81jjWSo,12371
- cytotable/utils.py,sha256=9zOEFWtGauzoZEJD-2PWFhzHeLlbzKHYfYCcZB4mag8,22581
- cytotable-0.0.13.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.13.dist-info/METADATA,sha256=N7p0JoPHS_mhT1EfytBddcxB2fL3qECsJM7an9IaAaw,3374
- cytotable-0.0.13.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
- cytotable-0.0.13.dist-info/RECORD,,