CytoTable 0.0.2 (py3-none-any.whl) → 0.0.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -1,6 +1,10 @@
  """
  __init__.py for cytotable
  """
+
+ # note: version data is maintained by poetry-dynamic-versioning (do not edit)
+ __version__ = "0.0.4"
+
  from .convert import convert
  from .exceptions import (
  CytoTableException,
cytotable/constants.py ADDED
@@ -0,0 +1,74 @@
+ """
+ CytoTable: constants - storing various constants to be used throughout cytotable.
+ """
+
+ import multiprocessing
+ import os
+ from typing import cast
+
+ from cytotable.utils import _get_cytotable_version
+
+ # read max threads from environment if necessary
+ # max threads will be used with default Parsl config and Duckdb
+ MAX_THREADS = (
+ multiprocessing.cpu_count()
+ if "CYTOTABLE_MAX_THREADS" not in os.environ
+ else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
+ )
+
+ # enables overriding default memory mapping behavior with pyarrow memory mapping
+ CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
+ os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
+ )
+
+ DDB_DATA_TYPE_SYNONYMS = {
+ "real": ["float32", "float4", "float"],
+ "double": ["float64", "float8", "numeric", "decimal"],
+ "integer": ["int32", "int4", "int", "signed"],
+ "bigint": ["int64", "int8", "long"],
+ }
+
+ # A reference dictionary for SQLite affinity and storage class types
+ # See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
+ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
+ "integer": [
+ "int",
+ "integer",
+ "tinyint",
+ "smallint",
+ "mediumint",
+ "bigint",
+ "unsigned big int",
+ "int2",
+ "int8",
+ ],
+ "text": [
+ "character",
+ "varchar",
+ "varying character",
+ "nchar",
+ "native character",
+ "nvarchar",
+ "text",
+ "clob",
+ ],
+ "blob": ["blob"],
+ "real": [
+ "real",
+ "double",
+ "double precision",
+ "float",
+ ],
+ "numeric": [
+ "numeric",
+ "decimal",
+ "boolean",
+ "date",
+ "datetime",
+ ],
+ }
+
+ CYTOTABLE_DEFAULT_PARQUET_METADATA = {
+ "data-producer": "https://github.com/cytomining/CytoTable",
+ "data-producer-version": str(_get_cytotable_version()),
+ }
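The two environment variables referenced above (`CYTOTABLE_MAX_THREADS` and `CYTOTABLE_ARROW_USE_MEMORY_MAPPING`) are read when `cytotable.constants` is first imported. A minimal sketch of overriding them, with illustrative values only:

```python
import os

# illustrative values: cap DuckDB/Parsl-related work at 4 threads and
# disable pyarrow memory mapping; set these before cytotable is imported
# so cytotable.constants picks them up when it is first evaluated
os.environ["CYTOTABLE_MAX_THREADS"] = "4"
os.environ["CYTOTABLE_ARROW_USE_MEMORY_MAPPING"] = "0"

import cytotable  # noqa: E402
```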
cytotable/convert.py CHANGED
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
  segment_type as column_dtype
  FROM pragma_storage_info('column_details')
  /* avoid duplicate entries in the form of VALIDITY segment_types */
- WHERE segment_type != 'VALIDITY';
+ WHERE segment_type != 'VALIDITY'
+ /* explicitly order the columns by their id to avoid inconsistent results */
+ ORDER BY column_id ASC;
  """
 
  # attempt to read the data to parquet from duckdb
@@ -175,8 +177,9 @@ def _prep_cast_column_data_types(
 
  @python_app
  def _get_table_chunk_offsets(
- source: Dict[str, Any],
  chunk_size: int,
+ source: Optional[Dict[str, Any]] = None,
+ sql_stmt: Optional[str] = None,
  ) -> Union[List[int], None]:
  """
  Get table data chunk offsets for later use in capturing segments
@@ -207,39 +210,54 @@ def _get_table_chunk_offsets(
 
  logger = logging.getLogger(__name__)
 
- table_name = source["table_name"] if "table_name" in source.keys() else None
- source_path = source["source_path"]
- source_type = str(pathlib.Path(source_path).suffix).lower()
+ if source is not None:
+ table_name = source["table_name"] if "table_name" in source.keys() else None
+ source_path = source["source_path"]
+ source_type = str(pathlib.Path(source_path).suffix).lower()
 
- try:
- # for csv's, check that we have more than one row (a header and data values)
- if (
- source_type == ".csv"
- and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
- ):
- raise NoInputDataException(
- f"Data file has 0 rows of values. Error in file: {source_path}"
+ try:
+ # for csv's, check that we have more than one row (a header and data values)
+ if (
+ source_type == ".csv"
+ and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
+ ):
+ raise NoInputDataException(
+ f"Data file has 0 rows of values. Error in file: {source_path}"
+ )
+
+ # gather the total rowcount from csv or sqlite data input sources
+ with _duckdb_reader() as ddb_reader:
+ rowcount = int(
+ ddb_reader.execute(
+ # nosec
+ f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
+ if source_type == ".csv"
+ else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+ ).fetchone()[0]
+ )
+
+ # catch input errors which will result in skipped files
+ except (
+ duckdb.InvalidInputException,
+ NoInputDataException,
+ ) as invalid_input_exc:
+ logger.warning(
+ msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
  )
 
+ return None
+
+ # find chunk offsets from sql statement
+ elif sql_stmt is not None:
  # gather the total rowcount from csv or sqlite data input sources
  with _duckdb_reader() as ddb_reader:
  rowcount = int(
  ddb_reader.execute(
  # nosec
- f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
- if source_type == ".csv"
- else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+ f"SELECT COUNT(*) FROM ({sql_stmt})"
  ).fetchone()[0]
  )
 
- # catch input errors which will result in skipped files
- except (duckdb.InvalidInputException, NoInputDataException) as invalid_input_exc:
- logger.warning(
- msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
- )
-
- return None
-
  return list(
  range(
  0,
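The offsets returned above are simply chunk starting positions that later tasks pair with `LIMIT`/`OFFSET` queries. A rough, self-contained sketch of the same computation (row count and chunk size are illustrative):

```python
# illustrative values standing in for the COUNT(*) result and user chunk size
rowcount = 2500
chunk_size = 1000

# one offset per chunk, mirroring the list(range(0, ...)) result shown above
offsets = list(range(0, rowcount, chunk_size))
print(offsets)  # [0, 1000, 2000]
```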
@@ -258,7 +276,6 @@ def _source_chunk_to_parquet(
  chunk_size: int,
  offset: int,
  dest_path: str,
- data_type_cast_map: Optional[Dict[str, str]] = None,
  ) -> str:
  """
  Export source data to chunked parquet file using chunk size and offsets.
@@ -287,7 +304,11 @@ def _source_chunk_to_parquet(
  from cloudpathlib import AnyPath
  from pyarrow import parquet
 
- from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
+ from cytotable.utils import (
+ _duckdb_reader,
+ _sqlite_mixed_type_query_to_parquet,
+ _write_parquet_table_with_metadata,
+ )
 
  # attempt to build dest_path
  source_dest_path = (
@@ -300,7 +321,7 @@
  select_columns = ",".join(
  [
  # here we cast the column to the specified type ensure the colname remains the same
- f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+ f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
  for column in source["columns"]
  ]
  )
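Quoting the column names in the generated `CAST` expressions matters for identifiers containing spaces or other special characters, such as the `OBJECT ID` and `WELL LABEL` columns in the `in-carta` preset added further below. A small illustration of the difference in the generated SQL fragment (the column dictionary is made up for this example):

```python
column = {"column_name": "OBJECT ID", "column_dtype": "BIGINT"}

# previous form: an unquoted identifier, which SQL engines parse as two tokens
old = f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"

# new form: a double-quoted identifier, valid for names containing spaces
new = f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""

print(old)  # CAST(OBJECT ID AS BIGINT) AS OBJECT ID
print(new)  # CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID"
```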
@@ -324,7 +345,7 @@
  # read data with chunk size + offset
  # and export to parquet
  with _duckdb_reader() as ddb_reader:
- parquet.write_table(
+ _write_parquet_table_with_metadata(
  table=ddb_reader.execute(
  f"""
  {base_query}
@@ -343,7 +364,7 @@
  "Mismatch Type Error" in str(e)
  and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
  ):
- parquet.write_table(
+ _write_parquet_table_with_metadata(
  # here we use sqlite instead of duckdb to extract
  # data for special cases where column and value types
  # may not align (which is valid functionality in SQLite).
@@ -395,14 +416,28 @@ def _prepend_column_name(
  Path to the modified file.
  """
 
+ import logging
  import pathlib
 
  import pyarrow.parquet as parquet
 
- from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+ from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+ from cytotable.utils import _write_parquet_table_with_metadata
+
+ logger = logging.getLogger(__name__)
 
  targets = tuple(metadata) + tuple(compartments)
 
+ # if we have no targets or metadata to work from, return the table unchanged
+ if len(targets) == 0:
+ logger.warning(
+ msg=(
+ "Skipping column name prepend operations"
+ "because no compartments or metadata were provided."
+ )
+ )
+ return table_path
+
  table = parquet.read_table(
  source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
  )
@@ -484,7 +519,7 @@
  updated_column_names.append(column_name)
 
  # perform table column name updates
- parquet.write_table(
+ _write_parquet_table_with_metadata(
  table=table.rename_columns(updated_column_names), where=table_path
  )
 
@@ -549,13 +584,18 @@
  Updated dictionary containing concatenated sources.
  """
 
+ import errno
  import pathlib
 
  import pyarrow as pa
  import pyarrow.parquet as parquet
 
+ from cytotable.constants import (
+ CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+ CYTOTABLE_DEFAULT_PARQUET_METADATA,
+ )
  from cytotable.exceptions import SchemaException
- from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+ from cytotable.utils import _write_parquet_table_with_metadata
 
  # build a result placeholder
  concatted: List[Dict[str, Any]] = [
@@ -585,7 +625,9 @@
  destination_path.parent.mkdir(parents=True, exist_ok=True)
 
  # build the schema for concatenation writer
- writer_schema = pa.schema(common_schema)
+ writer_schema = pa.schema(common_schema).with_metadata(
+ CYTOTABLE_DEFAULT_PARQUET_METADATA
+ )
 
  # build a parquet file writer which will be used to append files
  # as a single concatted parquet file, referencing the first file's schema
@@ -623,7 +665,7 @@
  pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
  except OSError as os_err:
  # raise only if we don't have a dir not empty errno
- if os_err.errno != 66:
+ if os_err.errno != errno.ENOTEMPTY:
  raise
 
  # return the concatted parquet filename
@@ -632,75 +674,51 @@
  return concatted
 
 
- @python_app
- def _get_join_chunks(
+ @python_app()
+ def _prepare_join_sql(
  sources: Dict[str, List[Dict[str, Any]]],
- metadata: Union[List[str], Tuple[str, ...]],
- chunk_columns: Union[List[str], Tuple[str, ...]],
- chunk_size: int,
- ) -> List[List[Dict[str, Any]]]:
+ joins: str,
+ ) -> str:
  """
- Build groups of join keys for later join operations
+ Prepare join SQL statement with actual locations of data based on the sources.
 
  Args:
- sources: Dict[List[Dict[str, Any]]]:
+ sources: Dict[str, List[Dict[str, Any]]]:
  Grouped datasets of files which will be used by other functions.
- metadata: Union[List[str], Tuple[str, ...]]:
- List of source data names which are used as metadata.
- chunk_columns: Union[List[str], Tuple[str, ...]]:
- Column names which appear in all compartments to use when performing join.
- chunk_size: int:
- Size of join chunks which is used to limit data size during join ops.
+ Includes the metadata concerning location of actual data.
+ joins: str:
+ DuckDB-compatible SQL which will be used to perform the join
+ operations using the join_group keys as a reference.
 
  Returns:
- List[List[Dict[str, Any]]]]:
- A list of lists with at most chunk size length that contain join keys.
+ str:
+ String representing the SQL to be used in later join work.
  """
-
  import pathlib
 
- import pyarrow.parquet as parquet
-
- from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
-
- # fetch the compartment concat result as the basis for join groups
- for key, source in sources.items():
- if any(name.lower() in pathlib.Path(key).stem.lower() for name in metadata):
- first_result = source
- break
-
- # gather the workflow result for basis if it's not yet returned
- basis = first_result
-
- # read only the table's chunk_columns
- join_column_rows = parquet.read_table(
- source=basis[0]["table"],
- columns=list(chunk_columns),
- memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
- ).to_pylist()
+ # replace with real location of sources for join sql
+ for key, val in sources.items():
+ if pathlib.Path(key).stem.lower() in joins.lower():
+ joins = joins.replace(
+ f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+ str([str(table) for table in val[0]["table"]]),
+ )
 
- # build and return the chunked join column rows
- return [
- join_column_rows[i : i + chunk_size]
- for i in range(0, len(join_column_rows), chunk_size)
- ]
+ return joins
 
 
  @python_app
  def _join_source_chunk(
- sources: Dict[str, List[Dict[str, Any]]],
  dest_path: str,
  joins: str,
- join_group: List[Dict[str, Any]],
+ chunk_size: int,
+ offset: int,
  drop_null: bool,
  ) -> str:
  """
  Join sources based on join group keys (group of specific join column values)
 
  Args:
- sources: Dict[str, List[Dict[str, Any]]]:
- Grouped datasets of files which will be used by other functions.
- Includes the metadata concerning location of actual data.
  dest_path: str:
  Destination path to write file-based content.
  joins: str:
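To make the substitution in `_prepare_join_sql` concrete, here is a small standalone sketch; the sources dictionary and the `read_parquet`-style join statement are illustrative stand-ins rather than values taken from this diff:

```python
import pathlib

# illustrative inputs: one concatenated compartment group plus join SQL that
# references it by a placeholder file name
sources = {"Cytoplasm.parquet": [{"table": ["/tmp/out/cytoplasm/cytoplasm.parquet"]}]}
joins = "SELECT * FROM read_parquet('cytoplasm.parquet') AS cytoplasm"

# the same replacement performed by _prepare_join_sql above
for key, val in sources.items():
    if pathlib.Path(key).stem.lower() in joins.lower():
        joins = joins.replace(
            f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
            str([str(table) for table in val[0]["table"]]),
        )

print(joins)
# SELECT * FROM read_parquet(['/tmp/out/cytoplasm/cytoplasm.parquet']) AS cytoplasm
```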
@@ -722,54 +740,20 @@
 
  import pyarrow.parquet as parquet
 
- from cytotable.utils import _duckdb_reader
-
- # replace with real location of sources for join sql
- for key, val in sources.items():
- if pathlib.Path(key).stem.lower() in joins.lower():
- joins = joins.replace(
- f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
- str([str(table) for table in val[0]["table"]]),
- )
-
- # update the join groups to include unique values per table
- updated_join_group = []
- for key in sources.keys():
- updated_join_group.extend(
- [
- {
- f"{str(pathlib.Path(key).stem)}.{join_key}": val
- for join_key, val in chunk.items()
- }
- for chunk in join_group
- ]
- )
-
- # form where clause for sql joins to filter the results
- joins += (
- "WHERE ("
- + ") OR (".join(
- [
- " AND ".join(
- [
- # create groups of join column filters where values always
- # are expected to equal those within the join_group together
- f"{join_column} = {join_column_value}"
- if not isinstance(join_column_value, str)
- # account for string values
- else (f"{join_column} = " f"'{join_column_value}'")
- for join_column, join_column_value in chunk.items()
- ]
- )
- for chunk in updated_join_group
- ]
- )
- + ")"
- )
+ from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
+ # Attempt to read the data to parquet file
+ # using duckdb for extraction and pyarrow for
+ # writing data to a parquet file.
+ # read data with chunk size + offset
+ # and export to parquet
  with _duckdb_reader() as ddb_reader:
- # perform compartment joins using duckdb over parquet files
- result = ddb_reader.execute(joins).arrow()
+ result = ddb_reader.execute(
+ f"""
+ {joins}
+ LIMIT {chunk_size} OFFSET {offset}
+ """
+ ).arrow()
 
  # drop nulls if specified
  if drop_null:
@@ -800,7 +784,7 @@
  )
 
  # write the result
- parquet.write_table(
+ _write_parquet_table_with_metadata(
  table=result,
  where=result_file_path,
  )
@@ -840,7 +824,11 @@
 
  import pyarrow.parquet as parquet
 
- from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+ from cytotable.constants import (
+ CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+ CYTOTABLE_DEFAULT_PARQUET_METADATA,
+ )
+ from cytotable.utils import _write_parquet_table_with_metadata
 
  # remove the unjoined concatted compartments to prepare final dest_path usage
  # (we now have joined results)
@@ -854,7 +842,7 @@
  shutil.rmtree(path=dest_path)
 
  # write the concatted result as a parquet file
- parquet.write_table(
+ _write_parquet_table_with_metadata(
  table=pa.concat_tables(
  tables=[
  parquet.read_table(
@@ -869,7 +857,9 @@
  # build a parquet file writer which will be used to append files
  # as a single concatted parquet file, referencing the first file's schema
  # (all must be the same schema)
- writer_schema = parquet.read_schema(join_sources[0])
+ writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
+ CYTOTABLE_DEFAULT_PARQUET_METADATA
+ )
  with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
  for table_path in join_sources:
  writer.write_table(
@@ -1012,7 +1002,6 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  concat: bool,
  join: bool,
  joins: Optional[str],
- chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
  chunk_size: Optional[int],
  infer_common_schema: bool,
  drop_null: bool,
@@ -1048,8 +1037,6 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  Whether to join the compartment data together into one dataset.
  joins: str:
  DuckDB-compatible SQL which will be used to perform the join operations.
- chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
- Column names which appear in all compartments to use when performing join.
  chunk_size: Optional[int],
  Size of join chunks which is used to limit data size during join ops.
  infer_common_schema: bool: (Default value = True)
@@ -1074,7 +1061,6 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  from cytotable.convert import (
  _concat_join_sources,
  _concat_source_group,
- _get_join_chunks,
  _get_table_chunk_offsets,
  _infer_source_group_common_schema,
  _join_source_chunk,
@@ -1161,7 +1147,6 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  chunk_size=chunk_size,
  offset=offset,
  dest_path=expanded_dest_path,
- data_type_cast_map=data_type_cast_map,
  ),
  source_group_name=source_group_name,
  identifying_columns=identifying_columns,
@@ -1210,6 +1195,8 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  # conditional section for merging
  # note: join implies a concat, but concat does not imply a join
  if join:
+ prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+
  # map joined results based on the join groups gathered above
  # note: after mapping we end up with a list of strings (task returns str)
  join_sources_result = [
@@ -1217,21 +1204,18 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  # gather the result of concatted sources prior to
  # join group merging as each mapped task run will need
  # full concat results
- sources=results,
  dest_path=expanded_dest_path,
- joins=joins,
- # get merging chunks by join columns
- join_group=join_group,
+ joins=prepared_joins_sql,
+ chunk_size=chunk_size,
+ offset=offset,
  drop_null=drop_null,
  ).result()
  # create join group for querying the concatenated
  # data in order to perform memory-safe joining
  # per user chunk size specification.
- for join_group in _get_join_chunks(
- sources=results,
- chunk_columns=chunk_columns,
+ for offset in _get_table_chunk_offsets(
+ sql_stmt=prepared_joins_sql,
  chunk_size=chunk_size,
- metadata=metadata,
  ).result()
  ]
 
@@ -1259,7 +1243,6 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  concat: bool = True,
  join: bool = True,
  joins: Optional[str] = None,
- chunk_columns: Optional[Union[List[str], Tuple[str, ...]]] = None,
  chunk_size: Optional[int] = None,
  infer_common_schema: bool = True,
  drop_null: bool = False,
@@ -1303,9 +1286,6 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  Whether to join the compartment data together into one dataset
  joins: str: (Default value = None):
  DuckDB-compatible SQL which will be used to perform the join operations.
- chunk_columns: Optional[Union[List[str], Tuple[str, ...]]]
- (Default value = None)
- Column names which appear in all compartments to use when performing join
  chunk_size: Optional[int] (Default value = None)
  Size of join chunks which is used to limit data size during join ops
  infer_common_schema: bool: (Default value = True)
@@ -1402,11 +1382,6 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  else identifying_columns
  )
  joins = cast(str, config[preset]["CONFIG_JOINS"]) if joins is None else joins
- chunk_columns = (
- cast(list, config[preset]["CONFIG_CHUNK_COLUMNS"])
- if chunk_columns is None
- else chunk_columns
- )
  chunk_size = (
  cast(int, config[preset]["CONFIG_CHUNK_SIZE"])
  if chunk_size is None
@@ -1425,7 +1400,6 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  concat=concat,
  join=join,
  joins=joins,
- chunk_columns=chunk_columns,
  chunk_size=chunk_size,
  infer_common_schema=infer_common_schema,
  drop_null=drop_null,
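With `chunk_columns` removed, joins are chunked purely by `chunk_size` over the prepared join SQL. A minimal, hedged sketch of a call under the new signature; the paths and preset name are illustrative, and parameters such as `source_path`, `dest_path`, and `dest_datatype` follow the broader CytoTable API rather than lines shown in this diff:

```python
import cytotable

# note: no chunk_columns argument under the 0.0.4 signature
result = cytotable.convert(
    source_path="./example.sqlite",   # illustrative input path
    dest_path="./example.parquet",    # illustrative output path
    dest_datatype="parquet",
    preset="cellprofiler_sqlite",     # assumed preset name
    join=True,
    chunk_size=1000,
)
```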
cytotable/presets.py CHANGED
@@ -1,5 +1,5 @@
  """
- Presets for common pycytominer-transform configurations.
+ Presets for common CytoTable configurations.
  """
 
  config = {
@@ -26,8 +26,6 @@ config = {
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
  "CONFIG_CHUNK_SIZE": 1000,
- # chunking columns to use along with chunk size for join operations
- "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
  # compartment and metadata joins performed using DuckDB SQL
  # and modified at runtime as needed
  "CONFIG_JOINS": """
@@ -73,8 +71,6 @@ config = {
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
  "CONFIG_CHUNK_SIZE": 1000,
- # chunking columns to use along with chunk size for join operations
- "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
  # compartment and metadata joins performed using DuckDB SQL
  # and modified at runtime as needed
  "CONFIG_JOINS": """
@@ -126,8 +122,6 @@ config = {
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
  "CONFIG_CHUNK_SIZE": 1000,
- # chunking columns to use along with chunk size for join operations
- "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
  # compartment and metadata joins performed using DuckDB SQL
  # and modified at runtime as needed
  "CONFIG_JOINS": """
@@ -181,8 +175,6 @@ config = {
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
  "CONFIG_CHUNK_SIZE": 1000,
- # chunking columns to use along with chunk size for join operations
- "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
  # compartment and metadata joins performed using DuckDB SQL
  # and modified at runtime as needed
  "CONFIG_JOINS": """
@@ -212,7 +204,35 @@
  AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
  """,
  },
+ "in-carta": {
+ # version specifications using related references
+ "CONFIG_SOURCE_VERSION": {
+ "in-carta": "v1.17.0412545",
+ },
+ # names of source table compartments (for ex. cells.csv, etc.)
+ "CONFIG_NAMES_COMPARTMENTS": tuple(),
+ # names of source table metadata (for ex. image.csv, etc.)
+ "CONFIG_NAMES_METADATA": tuple(),
+ # column names in any compartment or metadata tables which contain
+ # unique names to avoid renaming
+ "CONFIG_IDENTIFYING_COLUMNS": (
+ "OBJECT ID",
+ "Row",
+ "Column",
+ "FOV",
+ "WELL LABEL",
+ "Z",
+ "T",
+ ),
+ # chunk size to use for join operations to help with possible performance issues
+ # note: this number is an estimate and is may need changes contingent on data
+ # and system used by this library.
+ "CONFIG_CHUNK_SIZE": 1000,
+ # compartment and metadata joins performed using DuckDB SQL
+ # and modified at runtime as needed
+ "CONFIG_JOINS": "",
+ },
  }
  """
- Configuration presets for pycytominer-transform
+ Configuration presets for CytoTable
  """
cytotable/sources.py CHANGED
@@ -47,6 +47,7 @@ def _build_path(
  def _get_source_filepaths(
  path: Union[pathlib.Path, AnyPath],
  targets: List[str],
+ source_datatype: Optional[str] = None,
  ) -> Dict[str, List[Dict[str, Any]]]:
  """
  Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
  Either a directory path to seek filepaths within or a path directly to a file.
  targets: List[str]:
  Compartment and metadata names to seek within the provided path.
+ source_datatype: Optional[str]: (Default value = None)
+ The source datatype (extension) to use for reading the tables.
 
  Returns:
  Dict[str, List[Dict[str, Any]]]
  Data structure which groups related files based on the compartments.
  """
 
+ import os
  import pathlib
 
  from cloudpathlib import AnyPath
 
- from cytotable.exceptions import NoInputDataException
+ from cytotable.exceptions import DatatypeException, NoInputDataException
  from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader
 
+ if (targets is None or targets == []) and source_datatype is None:
+ raise DatatypeException(
+ f"A source_datatype must be specified when using undefined compartments and metadata names."
+ )
+
  # gathers files from provided path using compartments + metadata as a filter
  sources = [
  # build source_paths for all files
@@ -85,6 +94,7 @@ def _get_source_filepaths(
  # ensure the subpaths meet certain specifications
  if (
  targets is None
+ or targets == []
  # checks for name of the file from targets (compartment + metadata names)
  or str(subpath.stem).lower() in [target.lower() for target in targets]
  # checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(
 
  # group files together by similar filename for later data operations
  grouped_sources = {}
- for unique_source in set(source["source_path"].name for source in sources):
- grouped_sources[unique_source.capitalize()] = [
- # case for files besides sqlite
- source if source["source_path"].suffix.lower() != ".sqlite"
- # if we have sqlite entries, update the source_path to the parent
- # (the parent table database file) as grouped key name will now
- # encapsulate the table name details.
- else {
- "source_path": source["source_path"].parent,
- "table_name": source["table_name"],
- }
- for source in sources
- # focus only on entries which include the unique_source name
- if source["source_path"].name == unique_source
- ]
+
+ # if we have no targets, create a single group inferred from a common prefix and suffix
+ # note: this may apply for scenarios where no compartments or metadata are
+ # provided as input to CytoTable operations.
+ if targets is None or targets == []:
+ # gather a common prefix to use for the group
+ common_prefix = os.path.commonprefix(
+ [
+ source["source_path"].stem
+ for source in sources
+ if source["source_path"].suffix == f".{source_datatype}"
+ ]
+ )
+ grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+ # otherwise, use the unique names in the paths to determine source grouping
+ else:
+ for unique_source in set(source["source_path"].name for source in sources):
+ grouped_sources[unique_source.capitalize()] = [
+ # case for files besides sqlite
+ source if source["source_path"].suffix.lower() != ".sqlite"
+ # if we have sqlite entries, update the source_path to the parent
+ # (the parent table database file) as grouped key name will now
+ # encapsulate the table name details.
+ else {
+ "source_path": source["source_path"].parent,
+ "table_name": source["table_name"],
+ }
+ for source in sources
+ # focus only on entries which include the unique_source name
+ if source["source_path"].name == unique_source
+ ]
 
  return grouped_sources
 
@@ -190,7 +217,7 @@ def _infer_source_datatype(
  raise DatatypeException(
  (
  f"Unable to find source datatype {source_datatype} "
- "within files. Detected datatypes: {suffixes}"
+ f"within files. Detected datatypes: {suffixes}"
  )
  )
 
@@ -270,7 +297,9 @@ def _gather_sources(
  source_path = _build_path(path=source_path, **kwargs)
 
  # gather filepaths which will be used as the basis for this work
- sources = _get_source_filepaths(path=source_path, targets=targets)
+ sources = _get_source_filepaths(
+ path=source_path, targets=targets, source_datatype=source_datatype
+ )
 
  # infer or validate the source datatype based on source filepaths
  source_datatype = _infer_source_datatype(
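For orientation on the common-prefix grouping added above, `os.path.commonprefix` compares the file stems character by character, so a directory of similarly named exports collapses into one group keyed by the shared prefix plus the requested extension. A small illustrative sketch (file names are made up):

```python
import os

# illustrative file stems for the single-group (no targets) scenario
stems = ["ExperimentA_Well01", "ExperimentA_Well02", "ExperimentA_Well03"]
source_datatype = "csv"

common_prefix = os.path.commonprefix(stems)
group_key = f"{common_prefix}.{source_datatype}"
print(group_key)  # ExperimentA_Well0.csv
```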
cytotable/utils.py CHANGED
@@ -3,83 +3,22 @@ Utility functions for CytoTable
  """
 
  import logging
- import multiprocessing
  import os
  import pathlib
- from typing import Any, Dict, Union, cast
+ from typing import Any, Dict, Optional, Union, cast
 
  import duckdb
  import parsl
+ import pyarrow as pa
  from cloudpathlib import AnyPath, CloudPath
  from cloudpathlib.exceptions import InvalidPrefixError
  from parsl.app.app import AppBase
  from parsl.config import Config
- from parsl.errors import ConfigurationError
+ from parsl.errors import NoDataFlowKernelError
  from parsl.executors import HighThroughputExecutor
 
  logger = logging.getLogger(__name__)
 
- # read max threads from environment if necessary
- # max threads will be used with default Parsl config and Duckdb
- MAX_THREADS = (
- multiprocessing.cpu_count()
- if "CYTOTABLE_MAX_THREADS" not in os.environ
- else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
- )
-
- # enables overriding default memory mapping behavior with pyarrow memory mapping
- CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
- os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
- )
-
- DDB_DATA_TYPE_SYNONYMS = {
- "real": ["float32", "float4", "float"],
- "double": ["float64", "float8", "numeric", "decimal"],
- "integer": ["int32", "int4", "int", "signed"],
- "bigint": ["int64", "int8", "long"],
- }
-
- # A reference dictionary for SQLite affinity and storage class types
- # See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
- SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
- "integer": [
- "int",
- "integer",
- "tinyint",
- "smallint",
- "mediumint",
- "bigint",
- "unsigned big int",
- "int2",
- "int8",
- ],
- "text": [
- "character",
- "varchar",
- "varying character",
- "nchar",
- "native character",
- "nvarchar",
- "text",
- "clob",
- ],
- "blob": ["blob"],
- "real": [
- "real",
- "double",
- "double precision",
- "float",
- ],
- "numeric": [
- "numeric",
- "decimal",
- "boolean",
- "date",
- "datetime",
- ],
- }
-
-
  # reference the original init
  original_init = AppBase.__init__
 
@@ -108,15 +47,10 @@ def _parsl_loaded() -> bool:
  try:
  # try to reference Parsl dataflowkernel
  parsl.dfk()
- except ConfigurationError as pce:
- # if we detect a Parsl ConfigurationError that states we need to load config
+ except NoDataFlowKernelError:
+ # if we detect a Parsl NoDataFlowKernelError
  # return false to indicate parsl config has not yet been loaded.
- if pce.args[0] == "Must first load config":
- return False
-
- # otherwise we raise other ConfigurationError's
- else:
- raise
+ return False
 
  # otherwise we indicate parsl config has already been loaded
  return True
@@ -203,6 +137,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
  duckdb.DuckDBPyConnection
  """
 
+ import duckdb
+
+ from cytotable.constants import MAX_THREADS
+
  return duckdb.connect().execute(
  # note: we use an f-string here to
  # dynamically configure threads as appropriate
@@ -257,20 +195,25 @@ def _sqlite_mixed_type_query_to_parquet(
 
  import pyarrow as pa
 
+ from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
  from cytotable.exceptions import DatatypeException
- from cytotable.utils import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
 
  # open sqlite3 connection
  with sqlite3.connect(source_path) as conn:
  cursor = conn.cursor()
 
- # gather table column details including datatype
+ # Gather table column details including datatype.
+ # Note: uses SQLite pragma for table information.
+ # See the following for more information:
+ # https://sqlite.org/pragma.html#pragma_table_info
  cursor.execute(
  f"""
  SELECT :table_name as table_name,
  name as column_name,
  type as column_type
- FROM pragma_table_info(:table_name);
+ FROM pragma_table_info(:table_name)
+ /* explicit column ordering by 'cid' */
+ ORDER BY cid ASC;
  """,
  {"table_name": table_name},
  )
@@ -389,6 +332,9 @@ def _arrow_type_cast_if_specified(
  Dict[str, str]
  A potentially data type updated dictionary of column information
  """
+
+ from cytotable.constants import DDB_DATA_TYPE_SYNONYMS
+
  # for casting to new float type
  if "float" in data_type_cast_map.keys() and column["column_dtype"] in [
  "REAL",
@@ -458,3 +404,56 @@ def _expand_path(
  modifed_path = modifed_path.expanduser()
 
  return modifed_path.resolve()
+
+
+ def _get_cytotable_version() -> str:
+ """
+ Seeks the current version of CytoTable using either pkg_resources
+ or dunamai to determine the current version being used.
+
+ Returns:
+ str
+ A string representing the version of CytoTable currently being used.
+ """
+
+ try:
+ # attempt to gather the development version from dunamai
+ # for scenarios where cytotable from source is used.
+ import dunamai
+
+ return dunamai.Version.from_any_vcs().serialize()
+ except (RuntimeError, ModuleNotFoundError):
+ # else grab a static version from __init__.py
+ # for scenarios where the built/packaged cytotable is used.
+ import cytotable
+
+ return cytotable.__version__
+
+
+ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
+ """
+ Adds metadata to parquet output from CytoTable.
+ Note: this mostly wraps pyarrow.parquet.write_table
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
+
+ Args:
+ table: pa.Table:
+ Pyarrow table to be serialized as parquet table.
+ **kwargs: Any:
+ kwargs provided to this function roughly align with
+ pyarrow.parquet.write_table. The following might be
+ examples of what to expect here:
+ - where: str or pyarrow.NativeFile
+ """
+
+ from pyarrow import parquet
+
+ from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
+ from cytotable.utils import _get_cytotable_version
+
+ parquet.write_table(
+ table=table.replace_schema_metadata(
+ metadata=CYTOTABLE_DEFAULT_PARQUET_METADATA
+ ),
+ **kwargs,
+ )
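The metadata written by `_write_parquet_table_with_metadata` can be read back from any CytoTable output file with pyarrow; a short sketch, assuming an illustrative output path:

```python
from pyarrow import parquet

# read only the schema of a CytoTable-produced parquet file (illustrative path)
schema = parquet.read_schema("./example.parquet")

# pyarrow stores schema metadata keys and values as bytes
print(schema.metadata)
# e.g. {b'data-producer': b'https://github.com/cytomining/CytoTable',
#       b'data-producer-version': b'0.0.4'}
```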
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.2
+ Version: 0.0.4
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
@@ -13,10 +13,11 @@ Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
  Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
- Requires-Dist: duckdb (>=0.8.0,<0.9.0)
- Requires-Dist: parsl (>=2023.9.18)
- Requires-Dist: pyarrow (>=13.0.0,<14.0.0)
+ Requires-Dist: duckdb (>=0.8.0)
+ Requires-Dist: parsl (>=2023.9.25)
+ Requires-Dist: pyarrow (>=13.0.0)
  Project-URL: Documentation, https://cytomining.github.io/CytoTable/
  Project-URL: Repository, https://github.com/cytomining/CytoTable
  Description-Content-Type: text/markdown
@@ -25,20 +26,31 @@ Description-Content-Type: text/markdown
 
  # CytoTable
 
- ![dataflow](docs/source/_static/dataflow.svg)
+ ![dataflow](https://raw.githubusercontent.com/cytomining/cytotable/main/docs/source/_static/dataflow.svg?raw=true)
  _Diagram showing data flow relative to this project._
 
  ## Summary
 
- CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+ CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
  CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
  The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
 
+ The name for the project is inspired from:
+
+ - __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+ - __Table__:
+ - "1. Furniture with a top surface to accommodate a variety of uses."
+ - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
  ## Installation
 
- Install CytoTable with the following command:
+ Install CytoTable from [PyPI](https://pypi.org/) or from source:
 
  ```shell
+ # install from pypi
+ pip install cytotable
+
+ # install directly from source
  pip install git+https://github.com/cytomining/CytoTable.git
  ```
 
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=b0078yKBlAAnc7ms0n5nBRxK94xuKD52S4TFb4eTSiE,315
+ cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+ cytotable/convert.py,sha256=ORn2MmDmBUBEHDelDHc_j4J3LQgCEflXyzLouvf5h6Y,51971
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=SYZXh0-eK-2VRRd8I30GCQcZ4wDMmhGes8KdDsxpFqg,10771
+ cytotable/sources.py,sha256=M03pV0Z9YIiWs9pgoAFci3-S63uGCHq9HxvGLqhNV_0,11199
+ cytotable/utils.py,sha256=9zqLf_95-phH6IdsDgpK3g3NkDG4odx0NUWogQDs31k,14344
+ cytotable-0.0.4.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.4.dist-info/METADATA,sha256=fUPPn1ufKVe0nIvtHapwEBaNlr9di0hlmnsxh8n_BI0,3181
+ cytotable-0.0.4.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ cytotable-0.0.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.7.0
+ Generator: poetry-core 1.8.1
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,10 +0,0 @@
- cytotable/__init__.py,sha256=_rBEpjjZTru1zqcGCxbqKD0LS20jM_jEeLnBTQP1Afw,213
- cytotable/convert.py,sha256=09nx5eJbF9iWScz60CjjSZ05VoAC79lo3BzNlN2WRVU,53350
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=uDJzOIqVCVqT00GHccWcTo5Ud98NCfAD_bMFYMvILJY,10234
- cytotable/sources.py,sha256=jCzlm9jvezXABEeucfit6XRJ7HU3cKL5BQci-Oj-yzA,9910
- cytotable/utils.py,sha256=4dEdzWPGhziAxyzkdkgUwxX7rlVw1phDyOZVE1fOxjs,13949
- cytotable-0.0.2.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.2.dist-info/METADATA,sha256=7C__ynPX2YgEwIi_b-LxWNVqzZ5S54gMYXjJLymYs1g,2588
- cytotable-0.0.2.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
- cytotable-0.0.2.dist-info/RECORD,,