CytoTable 0.0.13__tar.gz → 0.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cytotable-0.0.13 → cytotable-0.0.15}/PKG-INFO +7 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/__init__.py +1 -1
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/convert.py +110 -52
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/presets.py +31 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/sources.py +29 -14
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/utils.py +178 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/pyproject.toml +2 -2
- {cytotable-0.0.13 → cytotable-0.0.15}/readme.md +5 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/LICENSE +0 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/constants.py +0 -0
- {cytotable-0.0.13 → cytotable-0.0.15}/cytotable/exceptions.py +0 -0
{cytotable-0.0.13 → cytotable-0.0.15}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: CytoTable
-Version: 0.0.13
+Version: 0.0.15
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 License: BSD-3-Clause License
 Keywords: python,cellprofiler,single-cell-analysis,way-lab
@@ -13,7 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: cloudpathlib[all,s3] (>=0.18
+Requires-Dist: cloudpathlib[all,s3] (>=0.18,<0.22)
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
 Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -29,6 +29,11 @@ Description-Content-Type: text/markdown
 
 # CytoTable
 
+
+[](https://github.com/cytomining/cytotable/actions/workflows/test.yml?query=branch%3Amain)
+[](https://python-poetry.org/)
+[](https://doi.org/10.5281/zenodo.14888111)
+
 
 _Diagram showing data flow relative to this project._
 
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/convert.py

@@ -7,6 +7,7 @@ import logging
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import parsl
+import pyarrow as pa
 from parsl.app.app import python_app
 
 from cytotable.exceptions import CytoTableException
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
 @python_app
 def _get_table_columns_and_types(
     source: Dict[str, Any], sort_output: bool
-) -> List[Dict[str, str]]:
+) -> List[Optional[Dict[str, str]]]:
     """
     Gather column data from table through duckdb.
 
@@ -38,7 +39,7 @@ def _get_table_columns_and_types(
         Specifies whether to sort cytotable output or not.
 
     Returns:
-        List[Dict[str, str]]
+        List[Optional[Dict[str, str]]]
            list of dictionaries which each include column level information
     """
 
@@ -49,6 +50,12 @@ def _get_table_columns_and_types(
     source_path = source["source_path"]
     source_type = str(source_path.suffix).lower()
 
+    # If we have .npz files, return a list with None
+    # because we're querying a non-tabular data source.
+    # These will be handled later by _extract_npz_to_parquet.
+    if source_type == ".npz":
+        return [None]
+
     # prepare the data source in the form of a duckdb query
     select_source = (
         f"read_csv_auto('{source_path}')"
@@ -279,7 +286,9 @@ def _get_table_keyset_pagination_sets(
     page_key: str,
     source: Optional[Dict[str, Any]] = None,
     sql_stmt: Optional[str] = None,
-) -> Union[
+) -> Union[
+    List[Optional[Tuple[Union[int, float], Union[int, float]]]], List[None], None
+]:
     """
     Get table data chunk keys for later use in capturing segments
     of values. This work also provides a chance to catch problematic
@@ -300,7 +309,7 @@ def _get_table_keyset_pagination_sets(
         data source.
 
     Returns:
-
+        Union[List[Optional[Tuple[Union[int, float], Union[int, float]]]], None]
            List of keys to use for reading the data later on.
     """
 
@@ -324,8 +333,15 @@ def _get_table_keyset_pagination_sets(
     with _duckdb_reader() as ddb_reader:
         if source_type == ".csv":
             sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
-
+        elif source_type == ".sqlite":
             sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+        elif source_type == ".npz":
+            # If we have npz files there's no need to paginate
+            # so we return None. None within a list is used as
+            # a special "passthrough" case within the pipeline
+            # so we may specially handle NPZ files later on via
+            # _source_pageset_to_parquet and _extract_npz_to_parquet.
+            return [None]
 
         page_keys = [
             results[0] for results in ddb_reader.execute(sql_query).fetchall()
@@ -360,14 +376,16 @@ def _get_table_keyset_pagination_sets(
         page_keys = ddb_reader.execute(sql_query).fetchall()
         page_keys = [key[0] for key in page_keys]
 
-
+    # The type: mention below is used to ignore a mypy linting error
+    # wherein it considers _generate_pagesets to be invalid.
+    return _generate_pagesets(page_keys, chunk_size)  # type: ignore[return-value]
 
 
 @python_app
 def _source_pageset_to_parquet(
     source_group_name: str,
     source: Dict[str, Any],
-    pageset: Tuple[Union[int, float], Union[int, float]],
+    pageset: Optional[Tuple[Union[int, float], Union[int, float]]],
     dest_path: str,
     sort_output: bool,
 ) -> str:
@@ -380,7 +398,7 @@ def _source_pageset_to_parquet(
         source: Dict[str, Any]
            Contains the source data to be chunked. Represents a single
            file or table of some kind along with collected information about table.
-        pageset: Tuple[int, int]
+        pageset: Optional[Tuple[Union[int, float], Union[int, float]]]
            The pageset for chunking the data from source.
         dest_path: str
            Path to store the output data.
@@ -399,10 +417,13 @@ def _source_pageset_to_parquet(
 
     from cytotable.utils import (
         _duckdb_reader,
+        _extract_npz_to_parquet,
         _sqlite_mixed_type_query_to_parquet,
         _write_parquet_table_with_metadata,
     )
 
+    source_type = str(source["source_path"].suffix).lower()
+
     # attempt to build dest_path
     source_dest_path = (
         f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
@@ -410,6 +431,28 @@ def _source_pageset_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
+    # If we have npz files, we need to extract them in a specialized manner.
+    # See below for CSV and SQLite handling.
+    if source_type == ".npz":
+        return _extract_npz_to_parquet(
+            source_path=str(source["source_path"]),
+            dest_path=f"{source_dest_path}/{str(source['source_path'].stem)}.parquet",
+            tablenumber=source["tablenumber"],
+        )
+
+    elif pageset is None:
+        # if we have a `None` pageset and we're not using
+        # npz, then we have an exception (this shouldn't happen
+        # because we will need a pageset range to work with for
+        # table queries and npz files are handled above with
+        # the none case).
+        raise CytoTableException(
+            (
+                "No pageset range provided for source data"
+                " (required for non-NPZ datasets)."
+            )
+        )
+
     # build tablenumber segment addition (if necessary)
     tablenumber_sql = (
         # to become tablenumber in sql select later with bigint (8-byte integer)
@@ -439,11 +482,11 @@ def _source_pageset_to_parquet(
 
     # build output query and filepath base
     # (chunked output will append offset to keep output paths unique)
-    if
+    if source_type == ".csv":
         base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
 
-    elif
+    elif source_type == ".sqlite":
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
@@ -840,7 +883,7 @@ def _join_source_pageset(
     dest_path: str,
     joins: str,
     page_key: str,
-    pageset: Tuple[int, int],
+    pageset: Union[Tuple[int, int], None],
     sort_output: bool,
     drop_null: bool,
 ) -> str:
@@ -877,7 +920,7 @@ def _join_source_pageset(
         )
         SELECT *
         FROM joined
-        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+        {f"WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}" if pageset is not None else ""}
         /* optional sorting per pagset */
         {"ORDER BY " + page_key if sort_output else ""};
     """
@@ -902,11 +945,13 @@ def _join_source_pageset(
 
     result_file_path = (
         # store the result in the parent of the dest_path
-        f"{str(pathlib.Path(dest_path).parent)}/"
+        f"{str(pathlib.Path(dest_path).parent)}/" +
         # use the dest_path stem in the name
-        f"{str(pathlib.Path(dest_path).stem)}-"
+        f"{str(pathlib.Path(dest_path).stem)}-" +
         # add the pageset indication to the filename
         f"{pageset[0]}-{pageset[1]}.parquet"
+        if pageset is not None
+        else ".parquet"
     )
 
     # write the result
@@ -1001,9 +1046,9 @@ def _concat_join_sources(
 def _infer_source_group_common_schema(
     source_group: List[Dict[str, Any]],
     data_type_cast_map: Optional[Dict[str, str]] = None,
-) -> List[Tuple[str,
+) -> List[Tuple[str, pa.DataType]]:
     """
-    Infers a common schema for group of parquet files which may have
+    Infers a common schema for a group of parquet files which may have
     similar but slightly different schema or data. Intended to assist with
     data concatenation and other operations.
 
@@ -1015,9 +1060,8 @@ def _infer_source_group_common_schema(
            A dictionary mapping data type groups to specific types.
            Roughly includes Arrow data types language from:
            https://arrow.apache.org/docs/python/api/datatypes.html
-
     Returns:
-        List[Tuple[str,
+        List[Tuple[str, pa.DataType]]
            A list of tuples which includes column name and PyArrow datatype.
            This data will later be used as the basis for forming a PyArrow schema.
     """
@@ -1025,32 +1069,31 @@ def _infer_source_group_common_schema(
     import pyarrow as pa
     import pyarrow.parquet as parquet
 
-    from cytotable.
+    from cytotable.utils import map_pyarrow_type
 
-    #
+    # Read the first file to establish the base schema
     common_schema = parquet.read_schema(source_group[0]["table"][0])
 
-    #
+    # Infer the common schema by comparing all schemas in the group
     for schema in [
         parquet.read_schema(table)
         for source in source_group
         for table in source["table"]
     ]:
-        #
+        # Skip if the schema matches the common schema
         if schema.equals(common_schema):
             continue
 
-        #
+        # Gather field names from the schema
         schema_field_names = [item.name for item in schema]
 
-        #
+        # Reverse enumeration to avoid index shifting when removing fields
         for index, field in reversed(list(enumerate(common_schema))):
-            #
-            # note: because this only checks for naming, we defer to initially detected type
+            # Remove fields not present in the current schema
             if field.name not in schema_field_names:
                 common_schema = common_schema.remove(index)
 
-            #
+            # Handle null vs non-null type conflicts
             elif pa.types.is_null(field.type) and not pa.types.is_null(
                 schema.field(field.name).type
             ):
@@ -1058,37 +1101,44 @@ def _infer_source_group_common_schema(
                 index, field.with_type(schema.field(field.name).type)
             )
 
-            #
+            # Handle integer to float type conflicts
             elif pa.types.is_integer(field.type) and pa.types.is_floating(
                 schema.field(field.name).type
             ):
                 common_schema = common_schema.set(
                     index,
                     field.with_type(
-                        # use float64 as a default here if we aren't casting floats
                         pa.float64()
                         if data_type_cast_map is None
-
-
-
+                        else pa.type_for_alias(
+                            data_type_cast_map.get("float", "float64")
+                        )
                     ),
                 )
 
-
-
-
-
-
-
-
+            # Handle nested or complex types dynamically
+            else:
+                common_schema = common_schema.set(
+                    index,
+                    field.with_type(
+                        map_pyarrow_type(
+                            field_type=field.type, data_type_cast_map=data_type_cast_map
+                        )
+                    ),
+                )
 
-    #
-
-
-
-
+    # Validate the schema to ensure all types are valid PyArrow types
+    validated_schema = [
+        (
+            field.name,
+            map_pyarrow_type(
+                field_type=field.type, data_type_cast_map=data_type_cast_map
+            ),
         )
-
+        for field in common_schema
+    ]
+
+    return validated_schema
 
 
 def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
@@ -1185,9 +1235,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     matching_keys = [
         key for key in page_keys.keys() if key.lower() in source_group_name.lower()
     ]
-    if not matching_keys:
+    if not matching_keys and source_datatype != "npz":
         raise CytoTableException(
-            f"No matching key found in page_keys for source_group_name: {source_group_name}."
+            f"No matching key found in page_keys for source_group_name: {source_group_name}. "
             "Please include a pagination key based on a column name from the table."
         )
 
@@ -1198,11 +1248,16 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             source,
             **{
                 "page_key": (
-                    page_key :=
-
-
-
-
+                    page_key := next(
+                        (
+                            value
+                            for key, value in page_keys.items()
+                            if key.lower() in source_group_name.lower()
+                        ),
+                        # Placeholder value if no match is found
+                        # used in cases for .npz source types.
+                        "placeholder",
+                    )
                 ),
                 "pagesets": _get_table_keyset_pagination_sets(
                     source=source,
@@ -1598,4 +1653,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         **kwargs,
     )
 
+    # cleanup Parsl executor and related
+    parsl.dfk().cleanup()
+
     return output
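
The .npz handling above threads through the public conversion entry point. Below is a minimal usage sketch, not taken from this diff: it assumes that `cytotable.convert()` in 0.0.15 accepts these keyword arguments and that the new "deepprofiler" preset (added in presets.py below) is used; paths are hypothetical placeholders.

```python
# Minimal sketch (assumptions noted above): convert DeepProfiler .npz output to parquet.
import cytotable

result = cytotable.convert(
    source_path="./deepprofiler_run",    # directory containing DeepProfiler .npz files (placeholder)
    dest_path="./deepprofiler.parquet",  # output location (placeholder)
    dest_datatype="parquet",
    source_datatype="npz",               # routes sources through the new .npz branches above
    preset="deepprofiler",               # preset introduced in presets.py below
)
print(result)
```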

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/presets.py

@@ -317,6 +317,37 @@ config = {
         # and modified at runtime as needed
         "CONFIG_JOINS": "",
     },
+    "deepprofiler": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "deepprofiler": "v0.3.1",
+            "cellprofiler": "v4.2.x",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        # in the case of NPZ files, these sometimes
+        # include the name of the well or site
+        # but not the compartment, and as a result,
+        # we specify an empty tuple.
+        "CONFIG_NAMES_COMPARTMENTS": tuple(),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": tuple(),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": tuple(),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "join": "Metadata_Site",
+        },
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": "",
+    },
 }
 """
 Configuration presets for CytoTable
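
For reference, the new preset is reachable through the module-level `config` dictionary shown in the hunk header above. A small inspection sketch (assuming `cytotable.presets.config` is importable as in prior releases):

```python
# Small sketch: inspect the new "deepprofiler" preset entry added in this release.
from cytotable.presets import config

deepprofiler = config["deepprofiler"]
print(deepprofiler["CONFIG_PAGE_KEYS"])   # {'join': 'Metadata_Site'}
print(deepprofiler["CONFIG_CHUNK_SIZE"])  # 1000
```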

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/sources.py

@@ -36,7 +36,10 @@ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
 
     # set the client for a CloudPath
     if isinstance(processed_path, CloudPath):
-
+        # Create a new client instance with the provided kwargs
+        client = processed_path.client.__class__(**kwargs)
+        # Recreate the CloudPath object with the new client
+        processed_path = client.CloudPath(processed_path)
 
     return processed_path
 
@@ -75,7 +78,9 @@ def _get_source_filepaths(
             "A source_datatype must be specified when using undefined compartments and metadata names."
         )
 
-
+    source_datatypes = [".csv", ".npz", ".sqlite"]  # Default supported extensions
+
+    # Gather files from the provided path using compartments + metadata as a filter
     sources = [
         # build source_paths for all files
         # note: builds local cache for sqlite files from cloud
@@ -90,16 +95,22 @@ def _get_source_filepaths(
         )
         # ensure the subpaths meet certain specifications
         if (
-            targets
-
-
-
-
-
+            # If targets are specified, only include files matching targets
+            (
+                targets is not None
+                and str(subpath.stem).lower() in [target.lower() for target in targets]
+                or subpath.suffix.lower() == ".sqlite"
+            )
+            # Otherwise, include files matching the source_datatypes
+            or (
+                targets is None
+                or targets == []
+                and subpath.suffix.lower() in source_datatypes
+            )
         )
     ]
 
-    #
+    # Expand sources to include sqlite tables similarly to files (one entry per table)
     expanded_sources = []
     with _duckdb_reader() as ddb_reader:
         for element in sources:
@@ -118,8 +129,8 @@ def _get_source_filepaths(
             """
             /* perform query on sqlite_master table for metadata on tables */
             SELECT name as table_name
-
-
+            FROM sqlite_scan(?, 'sqlite_master')
+            WHERE type='table'
             """,
             parameters=[str(element["source_path"])],
         )
@@ -153,10 +164,14 @@ def _get_source_filepaths(
                 # use lowercase version of the path to infer a commonprefix
                 source["source_path"].stem.lower()
                 for source in sources
-                if source["source_path"].suffix
+                if source["source_path"].suffix in source_datatypes
             ]
         )
-        grouped_sources[
+        grouped_sources[
+            # construct a grouped source name, deferring to use 'all_files'
+            # if no common prefix is found.
+            f"{common_prefix if common_prefix != '' else 'all_files'}.{source_datatype}"
+        ] = sources
 
     # otherwise, use the unique names in the paths to determine source grouping
     else:
@@ -283,7 +298,7 @@ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
 
     # if we don't have a sqlite file
     # (we can't check sqlite files for lines)
-    if path.suffix.lower()
+    if path.suffix.lower() not in [".sqlite", ".npz"]:
         with path.open("r") as f:
             try:
                 # read two lines, if the second is empty return false
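
The default-extension filter introduced above can be illustrated in isolation. The following standalone sketch mirrors that logic (it is not CytoTable's internal code) using a few hypothetical filenames:

```python
# Standalone illustration of the default extension filter added above.
import pathlib

source_datatypes = [".csv", ".npz", ".sqlite"]  # default supported extensions
candidates = [
    pathlib.Path("plate1/Cells.csv"),
    pathlib.Path("plate1/A01_1.npz"),
    pathlib.Path("plate1/notes.txt"),
]
kept = [p for p in candidates if p.suffix.lower() in source_datatypes]
print(kept)  # keeps the .csv and .npz entries, drops notes.txt
```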

{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/utils.py

@@ -196,7 +196,7 @@ def _sqlite_mixed_type_query_to_parquet(
             The name of the table being queried.
         page_key: str:
             The column name to be used to identify pagination chunks.
-        pageset: Tuple[int, int]:
+        pageset: Tuple[Union[int, float], Union[int, float]]:
             The range for values used for paginating data from source.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
@@ -336,7 +336,7 @@ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
     if (
         isinstance(path, CloudPath)
         and path.is_file()
-        and path.suffix.lower()
+        and path.suffix.lower() in [".sqlite", ".npz"]
     ):
         try:
             # update the path to be the local filepath for reference in CytoTable ops
@@ -706,3 +706,179 @@ def _natural_sort(list_to_sort):
             for c in re.split("([0-9]+)", str(key))
         ],
     )
+
+
+def _extract_npz_to_parquet(
+    source_path: str,
+    dest_path: str,
+    tablenumber: Optional[int] = None,
+) -> str:
+    """
+    Extract data from an .npz file created by DeepProfiler
+    as a tabular dataset and write to parquet.
+
+    DeepProfiler creates datasets which look somewhat like this:
+    Keys in the .npz file: ['features', 'metadata', 'locations']
+
+    Variable: features
+    Shape: (229, 6400)
+    Data type: float32
+
+    Variable: locations
+    Shape: (229, 2)
+    Data type: float64
+
+    Variable: metadata
+    Shape: ()
+    Data type: object
+    Whole object: {
+        'Metadata_Plate': 'SQ00014812',
+        'Metadata_Well': 'A01',
+        'Metadata_Site': 1,
+        'Plate_Map_Name': 'C-7161-01-LM6-022',
+        'RNA': 'SQ00014812/r01c01f01p01-ch3sk1fk1fl1.png',
+        'ER': 'SQ00014812/r01c01f01p01-ch2sk1fk1fl1.png',
+        'AGP': 'SQ00014812/r01c01f01p01-ch4sk1fk1fl1.png',
+        'Mito': 'SQ00014812/r01c01f01p01-ch5sk1fk1fl1.png',
+        'DNA': 'SQ00014812/r01c01f01p01-ch1sk1fk1fl1.png',
+        'Treatment_ID': 0,
+        'Treatment_Replicate': 1,
+        'Treatment': 'DMSO@NA',
+        'Compound': 'DMSO',
+        'Concentration': '',
+        'Split': 'Training',
+        'Metadata_Model': 'efficientnet'
+    }
+
+    Args:
+        source_path: str
+            Path to the .npz file.
+        dest_path: str
+            Destination path for the parquet file.
+        tablenumber: Optional[int]
+            Optional tablenumber to be added to the data.
+
+    Returns:
+        str
+            Path to the exported parquet file.
+    """
+
+    import pathlib
+
+    import numpy as np
+    import pyarrow as pa
+    import pyarrow.parquet as parquet
+
+    # Load features from the .npz file
+    with open(source_path, "rb") as data:
+        loaded_npz = np.load(file=data, allow_pickle=True)
+        # find the shape of the features, which will help structure
+        # data which doesn't yet conform to the same shape (by row count).
+        rows = loaded_npz["features"].shape[0]
+        # note: we use [()] to load the numpy array as a python dict
+        metadata = loaded_npz["metadata"][()]
+        # fetch the metadata model name, falling back to "DP" if not found
+        feature_prefix = metadata.get("Metadata_Model", "DP")
+        # we transpose the feature data for more efficient
+        # columnar-focused access
+        feature_data = loaded_npz["features"].T
+
+    npz_as_pydict = {
+        # add metadata to the table
+        # note: metadata within npz files corresponds to a dictionary of
+        # various keys and values related to the feature and location data.
+        "Metadata_TableNumber": pa.array([tablenumber] * rows, type=pa.int64()),
+        "Metadata_NPZSource": pa.array(
+            [pathlib.Path(source_path).name] * rows, type=pa.string()
+        ),
+        **{key: [metadata[key]] * rows for key in metadata.keys()},
+        # add locations data to the table
+        "Location_Center_X": [loaded_npz["locations"][i][0] for i in range(rows)],
+        "Location_Center_Y": [loaded_npz["locations"][i][1] for i in range(rows)],
+        # add features data to the table
+        **{
+            f"{feature_prefix}_{feature_idx + 1}": feature_data[feature_idx]
+            for feature_idx in range(feature_data.shape[0])
+        },
+    }
+
+    # convert the numpy arrays to a PyArrow table and write to parquet
+    parquet.write_table(pa.Table.from_pydict(npz_as_pydict), dest_path)
+
+    return dest_path
+
+
+def map_pyarrow_type(
+    field_type: pa.DataType, data_type_cast_map: Optional[Dict[str, str]]
+) -> pa.DataType:
+    """
+    Map PyArrow types dynamically to handle nested types and casting.
+
+    This function takes a PyArrow `field_type` and dynamically maps
+    it to a valid PyArrow type, handling nested types (e.g., lists,
+    structs) and resolving type conflicts (e.g., integer to float).
+    It also supports custom type casting using the
+    `data_type_cast_map` parameter.
+
+    Args:
+        field_type: pa.DataType
+            The PyArrow data type to be mapped.
+            This can include simple types (e.g., int, float, string)
+            or nested types (e.g., list, struct).
+        data_type_cast_map: Optional[Dict[str, str]], default None
+            A dictionary mapping data type groups to specific types.
+            This allows for custom type casting.
+            For example:
+            - {"float": "float32"} maps
+            floating-point types to `float32`.
+            - {"int": "int64"} maps integer
+            types to `int64`.
+            If `data_type_cast_map` is
+            None, default PyArrow types are used.
+
+    Returns:
+        pa.DataType
+            The mapped PyArrow data type.
+            If no mapping is needed, the original
+            `field_type` is returned.
+    """
+
+    if pa.types.is_list(field_type):
+        # Handle list types (e.g., list<element: float>)
+        return pa.list_(
+            map_pyarrow_type(
+                field_type=field_type.value_type, data_type_cast_map=data_type_cast_map
+            )
+        )
+    elif pa.types.is_struct(field_type):
+        # Handle struct types recursively
+        return pa.struct(
+            [
+                (
+                    field.name,
+                    map_pyarrow_type(
+                        field_type=field.type, data_type_cast_map=data_type_cast_map
+                    ),
+                )
+                for field in field_type
+            ]
+        )
+    elif pa.types.is_floating(field_type):
+        # Handle floating-point types
+        if data_type_cast_map and "float" in data_type_cast_map:
+            return pa.type_for_alias(data_type_cast_map["float"])
+        return pa.float64()  # Default to float64 if no mapping is provided
+    elif pa.types.is_integer(field_type):
+        # Handle integer types
+        if data_type_cast_map and "integer" in data_type_cast_map:
+            return pa.type_for_alias(data_type_cast_map["integer"])
+        return pa.int64()  # Default to int64 if no mapping is provided
+    elif pa.types.is_string(field_type):
+        # Handle string types
+        return pa.string()
+    elif pa.types.is_null(field_type):
+        # Handle null types
+        return pa.null()
+    else:
+        # Default to the original type if no mapping is needed
+        return field_type
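
A short sketch exercising the `map_pyarrow_type` helper added above (it is imported from `cytotable.utils` in convert.py, so the import path below follows from this diff):

```python
# Sketch: resolve a nested list type through map_pyarrow_type with a cast map.
import pyarrow as pa
from cytotable.utils import map_pyarrow_type

mapped = map_pyarrow_type(
    field_type=pa.list_(pa.float64()),
    data_type_cast_map={"float": "float32"},
)
print(mapped)  # list<item: float>, i.e. a list of float32
```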

{cytotable-0.0.13 → cytotable-0.0.15}/pyproject.toml

@@ -5,7 +5,7 @@ requires = [ "poetry-core>=1", "poetry-dynamic-versioning>=1,<2" ]
 [tool.poetry]
 name = "CytoTable"
 # note: version data is maintained by poetry-dynamic-versioning (do not edit)
-version = "0.0.13"
+version = "0.0.15"
 description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
 authors = [ "Cytomining Community" ]
 license = "BSD-3-Clause License"
@@ -18,7 +18,7 @@ keywords = [ "python", "cellprofiler", "single-cell-analysis", "way-lab" ]
 [tool.poetry.dependencies]
 python = ">=3.9,<3.14"
 pyarrow = ">=13.0.0"
-cloudpathlib = { extras = [ "all", "s3" ], version = "
+cloudpathlib = { extras = [ "all", "s3" ], version = ">=0.18,<0.22" }
 duckdb = ">=0.8.0,!=0.10.0,>=0.10.1"
 parsl = ">=2023.9.25"
 numpy = [

{cytotable-0.0.13 → cytotable-0.0.15}/readme.md

@@ -2,6 +2,11 @@
 
 # CytoTable
 
+
+[](https://github.com/cytomining/cytotable/actions/workflows/test.yml?query=branch%3Amain)
+[](https://python-poetry.org/)
+[](https://doi.org/10.5281/zenodo.14888111)
+
 
 _Diagram showing data flow relative to this project._
 

{cytotable-0.0.13 → cytotable-0.0.15}/LICENSE: file without changes
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/constants.py: file without changes
{cytotable-0.0.13 → cytotable-0.0.15}/cytotable/exceptions.py: file without changes