CytoTable 0.0.6 (py3-none-any.whl) → 0.0.8 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """
  
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.6"
+ __version__ = "0.0.8"
  
  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,6 +68,13 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
      ],
  }
  
+ # metadata column names and types for internal use within CytoTable
+ CYOTABLE_META_COLUMN_TYPES = {
+     "cytotable_meta_source_path": "VARCHAR",
+     "cytotable_meta_offset": "BIGINT",
+     "cytotable_meta_rownum": "BIGINT",
+ }
+ 
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
      "data-producer": "https://github.com/cytomining/CytoTable",
      "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -8,23 +8,26 @@ import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
  
  import parsl
- import pyarrow as pa
- from parsl.app.app import join_app, python_app
+ from parsl.app.app import python_app
  
  from cytotable.exceptions import CytoTableException
  from cytotable.presets import config
+ from cytotable.sources import _gather_sources
  from cytotable.utils import (
      _column_sort,
      _default_parsl_config,
      _expand_path,
      _parsl_loaded,
+     evaluate_futures,
  )
  
  logger = logging.getLogger(__name__)
  
  
  @python_app
- def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]:
+ def _get_table_columns_and_types(
+     source: Dict[str, Any], sort_output: bool
+ ) -> List[Dict[str, str]]:
      """
      Gather column data from table through duckdb.
  
@@ -32,6 +35,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
          source: Dict[str, Any]
              Contains the source data to be chunked. Represents a single
              file or table of some kind.
+         sort_output:
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          List[Dict[str, str]]
@@ -109,6 +114,8 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
          # offset is set to 0 start at first row
          # result from table
          offset=0,
+         add_cytotable_meta=False,
+         sort_output=sort_output,
      )
      with _duckdb_reader() as ddb_reader:
          return (
@@ -275,6 +282,7 @@ def _source_chunk_to_parquet(
      chunk_size: int,
      offset: int,
      dest_path: str,
+     sort_output: bool,
  ) -> str:
      """
      Export source data to chunked parquet file using chunk size and offsets.
@@ -291,6 +299,8 @@ def _source_chunk_to_parquet(
              The offset for chunking the data from source.
          dest_path: str
              Path to store the output data.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          str
@@ -303,6 +313,7 @@ def _source_chunk_to_parquet(
      from cloudpathlib import AnyPath
      from pyarrow import parquet
  
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import (
          _duckdb_reader,
          _sqlite_mixed_type_query_to_parquet,
@@ -316,13 +327,39 @@ def _source_chunk_to_parquet(
      )
      pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
  
+     source_path_str = (
+         source["source_path"]
+         if "table_name" not in source.keys()
+         else f"{source['source_path']}_table_{source['table_name']}"
+     )
      # build the column selection block of query
+ 
+     # add cytotable metadata columns
+     cytotable_metadata_cols = [
+         (
+             f"CAST( '{source_path_str}' "
+             f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
+             ' AS "cytotable_meta_source_path"'
+         ),
+         f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
+         (
+             f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
+             ' AS "cytotable_meta_rownum"'
+         ),
+     ]
+     # add source table columns
+     casted_source_cols = [
+         # here we cast the column to the specified type ensure the colname remains the same
+         f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
+         for column in source["columns"]
+     ]
+ 
+     # create selection statement from lists above
      select_columns = ",".join(
-         [
-             # here we cast the column to the specified type ensure the colname remains the same
-             f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
-             for column in source["columns"]
-         ]
+         # if we should sort the output, add the metadata_cols
+         cytotable_metadata_cols + casted_source_cols
+         if sort_output
+         else casted_source_cols
      )
  
      # build output query and filepath base
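To make the conditional selection above concrete, here is a self-contained rendering with hypothetical column metadata (the values are examples only): when `sort_output` is enabled, the generated SELECT list leads with the cytotable_meta_* casts; otherwise only the casted source columns remain.

```python
# Example data standing in for source["columns"]; not from the package.
cytotable_metadata_cols = ['CAST( 0 AS BIGINT) AS "cytotable_meta_offset"']
casted_source_cols = [
    f"CAST(\"{col['column_name']}\" AS {col['column_dtype']}) AS \"{col['column_name']}\""
    for col in [{"column_name": "AreaShape_Area", "column_dtype": "FLOAT"}]
]

for sort_output in (True, False):
    select_columns = ",".join(
        cytotable_metadata_cols + casted_source_cols
        if sort_output
        else casted_source_cols
    )
    print(f"SELECT {select_columns} FROM source_table")
```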
@@ -348,6 +385,13 @@ def _source_chunk_to_parquet(
              table=ddb_reader.execute(
                  f"""
                  {base_query}
+                 /* order by all columns for deterministic output */
+                 ORDER BY ALL
+                 LIMIT {chunk_size} OFFSET {offset}
+                 """
+                 if sort_output
+                 else f"""
+                 {base_query}
                  LIMIT {chunk_size} OFFSET {offset}
                  """
              ).arrow(),
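`ORDER BY ALL` is DuckDB shorthand for ordering by every selected column left to right; combined with LIMIT/OFFSET it makes each chunk reproducible, since SQL guarantees no row order otherwise and chunks could overlap or skip rows between runs. A small demonstration, assuming the `duckdb` package is installed:

```python
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE t AS SELECT * FROM range(10) AS r(i)")

chunk_size = 4
chunks = [
    con.execute(
        f"SELECT * FROM t ORDER BY ALL LIMIT {chunk_size} OFFSET {offset}"
    ).fetchall()
    for offset in (0, 4, 8)
]
print(chunks)  # stable, non-overlapping chunks on every run
```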
@@ -372,6 +416,8 @@ def _source_chunk_to_parquet(
                  table_name=str(source["table_name"]),
                  chunk_size=chunk_size,
                  offset=offset,
+                 add_cytotable_meta=True if sort_output else False,
+                 sort_output=sort_output,
              ),
              where=result_filepath,
          )
@@ -420,7 +466,10 @@ def _prepend_column_name(
  
      import pyarrow.parquet as parquet
  
-     from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+     )
      from cytotable.utils import _write_parquet_table_with_metadata
  
      logger = logging.getLogger(__name__)
@@ -468,8 +517,10 @@ def _prepend_column_name(
          # source_group_name_stem: 'Cells'
          # column_name: 'AreaShape_Area'
          # updated_column_name: 'Cells_AreaShape_Area'
-         if column_name not in identifying_columns and not column_name.startswith(
-             source_group_name_stem.capitalize()
+         if (
+             column_name not in identifying_columns
+             and not column_name.startswith(source_group_name_stem.capitalize())
+             and column_name not in CYOTABLE_META_COLUMN_TYPES
          ):
              updated_column_names.append(f"{source_group_name_stem}_{column_name}")
          # if-condition for prepending 'Metadata_' to column name
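Distilled, the renaming rule above: a compartment column gains the source stem prefix unless it is an identifying column, already starts with the stem, or is one of the new internal metadata columns. A compact restatement (the helper below is illustrative, not package code):

```python
def prepend_stem(column_name: str, stem: str, identifying: set, meta: set) -> str:
    """Illustrative restatement of the if-condition in _prepend_column_name."""
    if (
        column_name not in identifying
        and not column_name.startswith(stem.capitalize())
        and column_name not in meta
    ):
        return f"{stem}_{column_name}"
    return column_name


meta_cols = {
    "cytotable_meta_source_path",
    "cytotable_meta_offset",
    "cytotable_meta_rownum",
}
print(prepend_stem("AreaShape_Area", "Cells", {"ImageNumber"}, meta_cols))
# Cells_AreaShape_Area
print(prepend_stem("cytotable_meta_rownum", "Cells", {"ImageNumber"}, meta_cols))
# cytotable_meta_rownum (left untouched)
```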
@@ -677,6 +728,7 @@ def _concat_source_group(
  def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
      joins: str,
+     sort_output: bool,
  ) -> str:
      """
      Prepare join SQL statement with actual locations of data based on the sources.
@@ -688,6 +740,8 @@ def _prepare_join_sql(
          joins: str:
              DuckDB-compatible SQL which will be used to perform the join
              operations using the join_group keys as a reference.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
  
      Returns:
          str:
@@ -695,15 +749,30 @@ def _prepare_join_sql(
      """
      import pathlib
  
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
+ 
      # replace with real location of sources for join sql
+     order_by_tables = []
      for key, val in sources.items():
          if pathlib.Path(key).stem.lower() in joins.lower():
+             table_name = str(pathlib.Path(key).stem.lower())
              joins = joins.replace(
-                 f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                 f"'{table_name}.parquet'",
                  str([str(table) for table in val[0]["table"]]),
              )
+             order_by_tables.append(table_name)
+ 
+     # create order by statement with from all tables using cytotable metadata
+     order_by_sql = "ORDER BY " + ", ".join(
+         [
+             f"{table}.{meta_column}"
+             for table in order_by_tables
+             for meta_column in CYOTABLE_META_COLUMN_TYPES
+         ]
+     )
  
-     return joins
+     # add the order by statements to the join
+     return joins + order_by_sql if sort_output else joins
  
  
  @python_app
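The ORDER BY built above is the cartesian product of the matched table names and the three metadata columns, appended to the join SQL only when `sort_output` is set. Rendered with hypothetical table stems:

```python
CYOTABLE_META_COLUMN_TYPES = {
    "cytotable_meta_source_path": "VARCHAR",
    "cytotable_meta_offset": "BIGINT",
    "cytotable_meta_rownum": "BIGINT",
}
order_by_tables = ["cytoplasm", "cells", "nuclei"]  # example stems

order_by_sql = "ORDER BY " + ", ".join(
    [
        f"{table}.{meta_column}"
        for table in order_by_tables
        for meta_column in CYOTABLE_META_COLUMN_TYPES
    ]
)
print(order_by_sql)
# ORDER BY cytoplasm.cytotable_meta_source_path, cytoplasm.cytotable_meta_offset, ...
```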
@@ -737,8 +806,7 @@ def _join_source_chunk(
  
      import pathlib
  
-     import pyarrow.parquet as parquet
- 
+     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
  
      # Attempt to read the data to parquet file
@@ -746,11 +814,21 @@ def _join_source_chunk(
      # writing data to a parquet file.
      # read data with chunk size + offset
      # and export to parquet
+     exclude_meta_cols = [
+         f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
+     ]
      with _duckdb_reader() as ddb_reader:
          result = ddb_reader.execute(
              f"""
+             WITH joined AS (
                  {joins}
                  LIMIT {chunk_size} OFFSET {offset}
+             )
+             SELECT
+                 /* exclude metadata columns from the results
+                 by using a lambda on column names based on exclude_meta_cols. */
+                 COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
+             FROM joined;
              """
          ).arrow()
  
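The rewritten query wraps the join in a CTE and then uses DuckDB's lambda form of the `COLUMNS` expression to drop the bookkeeping columns by name prefix, so they steer ordering but never reach the output. A self-contained demonstration, assuming `duckdb` and `pyarrow` are installed:

```python
import duckdb

con = duckdb.connect()
con.execute(
    """
    CREATE TABLE joined AS
    SELECT
        1 AS feature_a,
        2 AS feature_b,
        'x.sqlite' AS cytotable_meta_source_path,
        0 AS cytotable_meta_offset
    """
)
exclude_meta_cols = [
    f"c NOT LIKE '{col}%'"
    for col in ("cytotable_meta_source_path", "cytotable_meta_offset")
]
table = con.execute(
    f"SELECT COLUMNS (c -> ({' AND '.join(exclude_meta_cols)})) FROM joined"
).arrow()
print(table.column_names)  # ['feature_a', 'feature_b']
```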
@@ -957,40 +1035,20 @@ def _infer_source_group_common_schema(
      )
  
  
- @python_app
- def _return_future(input: Any) -> Any:
-     """
-     This is a simple wrapper python_app to allow
-     the return of join_app-compliant output (must be a Parsl future)
- 
-     Args:
-         input: Any
-             Any input which will be used within the context of a
-             Parsl join_app future return.
- 
-     Returns:
-         Any
-             Returns the input as provided wrapped within the context
-             of a python_app for the purpose of a join_app.
-     """
- 
-     return input
- 
- 
- @join_app
  def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      source_path: str,
      dest_path: str,
      source_datatype: Optional[str],
-     metadata: Union[List[str], Tuple[str, ...]],
-     compartments: Union[List[str], Tuple[str, ...]],
-     identifying_columns: Union[List[str], Tuple[str, ...]],
+     metadata: Optional[Union[List[str], Tuple[str, ...]]],
+     compartments: Optional[Union[List[str], Tuple[str, ...]]],
+     identifying_columns: Optional[Union[List[str], Tuple[str, ...]]],
      concat: bool,
      join: bool,
      joins: Optional[str],
      chunk_size: Optional[int],
      infer_common_schema: bool,
      drop_null: bool,
+     sort_output: bool,
      data_type_cast_map: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1029,6 +1087,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              Whether to infer a common schema when concatenating sources.
          drop_null: bool:
              Whether to drop null results.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
          data_type_cast_map: Dict[str, str]
              A dictionary mapping data type groups to specific types.
              Roughly includes Arrow data types language from:
@@ -1044,24 +1104,15 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              result.
      """
  
-     from cytotable.convert import (
-         _concat_join_sources,
-         _concat_source_group,
-         _get_table_chunk_offsets,
-         _infer_source_group_common_schema,
-         _join_source_chunk,
-         _prepend_column_name,
-         _return_future,
-         _source_chunk_to_parquet,
-     )
-     from cytotable.sources import _gather_sources
-     from cytotable.utils import _expand_path
- 
      # gather sources to be processed
      sources = _gather_sources(
          source_path=source_path,
          source_datatype=source_datatype,
-         targets=list(metadata) + list(compartments),
+         targets=(
+             list(metadata) + list(compartments)
+             if metadata is not None and compartments is not None
+             else []
+         ),
          **kwargs,
      ).result()
  
@@ -1077,7 +1128,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  "offsets": _get_table_chunk_offsets(
                      source=source,
                      chunk_size=chunk_size,
-                 ).result()
+                 )
              },
          )
          for source in source_group_vals
@@ -1094,7 +1145,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              for source in source_group_vals
              if source["offsets"] is not None
          ]
-         for source_group_name, source_group_vals in offsets_prepared.items()
+         for source_group_name, source_group_vals in evaluate_futures(
+             offsets_prepared
+         ).items()
          # ensure we have source_groups with at least one source table
          if len(source_group_vals) > 0
      }
@@ -1107,10 +1160,10 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  **{
                      "columns": _prep_cast_column_data_types(
                          columns=_get_table_columns_and_types(
-                             source=source,
+                             source=source, sort_output=sort_output
                          ),
                          data_type_cast_map=data_type_cast_map,
-                     ).result()
+                     )
                  },
              )
              for source in source_group_vals
@@ -1133,33 +1186,40 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                              chunk_size=chunk_size,
                              offset=offset,
                              dest_path=expanded_dest_path,
+                             sort_output=sort_output,
                          ),
                          source_group_name=source_group_name,
                          identifying_columns=identifying_columns,
                          metadata=metadata,
                          compartments=compartments,
-                     ).result()
+                     )
                      for offset in source["offsets"]
                  ]
              },
          )
          for source in source_group_vals
      ]
-         for source_group_name, source_group_vals in column_names_and_types_gathered.items()
+         for source_group_name, source_group_vals in evaluate_futures(
+             column_names_and_types_gathered
+         ).items()
      }
  
      # if we're concatting or joining and need to infer the common schema
      if (concat or join) and infer_common_schema:
          # create a common schema for concatenation work
          common_schema_determined = {
-             source_group_name: {
-                 "sources": source_group_vals,
-                 "common_schema": _infer_source_group_common_schema(
-                     source_group=source_group_vals,
-                     data_type_cast_map=data_type_cast_map,
-                 ),
-             }
-             for source_group_name, source_group_vals in results.items()
+             source_group_name: [
+                 {
+                     "sources": source_group_vals,
+                     "common_schema": _infer_source_group_common_schema(
+                         source_group=source_group_vals,
+                         data_type_cast_map=data_type_cast_map,
+                     ),
+                 }
+             ]
+             for source_group_name, source_group_vals in evaluate_futures(
+                 results
+             ).items()
          }
  
      # if concat or join, concat the source groups
@@ -1171,17 +1231,24 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
          results = {
              source_group_name: _concat_source_group(
                  source_group_name=source_group_name,
-                 source_group=source_group_vals["sources"],
+                 source_group=source_group_vals[0]["sources"],
                  dest_path=expanded_dest_path,
-                 common_schema=source_group_vals["common_schema"],
-             ).result()
-             for source_group_name, source_group_vals in common_schema_determined.items()
+                 common_schema=source_group_vals[0]["common_schema"],
+             )
+             for source_group_name, source_group_vals in evaluate_futures(
+                 common_schema_determined
+             ).items()
          }
  
      # conditional section for merging
      # note: join implies a concat, but concat does not imply a join
      if join:
-         prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+         # evaluate the results as they're used multiple times below
+         evaluated_results = evaluate_futures(results)
+ 
+         prepared_joins_sql = _prepare_join_sql(
+             sources=evaluated_results, joins=joins, sort_output=sort_output
+         ).result()
  
          # map joined results based on the join groups gathered above
          # note: after mapping we end up with a list of strings (task returns str)
@@ -1195,7 +1262,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  chunk_size=chunk_size,
                  offset=offset,
                  drop_null=drop_null,
-             ).result()
+             )
              # create join group for querying the concatenated
              # data in order to perform memory-safe joining
              # per user chunk size specification.
@@ -1210,12 +1277,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
          # for lineage and debugging
          results = _concat_join_sources(
              dest_path=expanded_dest_path,
-             join_sources=join_sources_result,
-             sources=results,
-         ).result()
+             join_sources=[join.result() for join in join_sources_result],
+             sources=evaluated_results,
+         )
  
      # wrap the final result as a future and return
-     return _return_future(results)
+     return evaluate_futures(results)
  
  
  def convert(  # pylint: disable=too-many-arguments,too-many-locals
@@ -1233,6 +1300,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
      infer_common_schema: bool = True,
      drop_null: bool = False,
      data_type_cast_map: Optional[Dict[str, str]] = None,
+     sort_output: bool = True,
      preset: Optional[str] = "cellprofiler_csv",
      parsl_config: Optional[parsl.Config] = None,
      **kwargs,
@@ -1274,8 +1342,14 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
              DuckDB-compatible SQL which will be used to perform the join operations.
          chunk_size: Optional[int] (Default value = None)
              Size of join chunks which is used to limit data size during join ops
-         infer_common_schema: bool: (Default value = True)
+         infer_common_schema: bool (Default value = True)
              Whether to infer a common schema when concatenating sources.
+         data_type_cast_map: Dict[str, str], (Default value = None)
+             A dictionary mapping data type groups to specific types.
+             Roughly includes Arrow data types language from:
+             https://arrow.apache.org/docs/python/api/datatypes.html
+         sort_output: bool (Default value = True)
+             Specifies whether to sort cytotable output or not.
          drop_null: bool (Default value = False)
              Whether to drop nan/null values from results
          preset: str (Default value = "cellprofiler_csv")
@@ -1390,7 +1464,8 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
          infer_common_schema=infer_common_schema,
          drop_null=drop_null,
          data_type_cast_map=data_type_cast_map,
+         sort_output=sort_output,
          **kwargs,
-     ).result()
+     )
  
      return output
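From the caller's perspective the visible change is the new `sort_output` keyword on `convert`, defaulting to True. A hypothetical invocation (paths and preset chosen for illustration; `dest_datatype` per the package's documented API):

```python
from cytotable import convert

result = convert(
    source_path="./all_cellprofiler.sqlite",  # placeholder input path
    dest_path="./all_cellprofiler.parquet",   # placeholder output path
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    sort_output=True,  # default; emits deterministically ordered chunks
)
```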
cytotable/presets.py CHANGED
@@ -29,24 +29,18 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     /* seeks columns by name, avoiding failure if some do not exist */
-                     COLUMNS('^Metadata_ImageNumber$|^Image_Metadata_Well$|^Image_Metadata_Plate$')
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_ImageNumber,
+                 cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
+                 nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
              FROM
-                 Image_Filtered AS image
-                 LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
-                     cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 read_parquet('cytoplasm.parquet') AS cytoplasm
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
+             WHERE
+                 cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
@@ -74,25 +68,20 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
-                 Per_Image_Filtered AS per_image
-                 LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
-                     per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Cells_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Cells
                  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
          """,
      },
@@ -125,25 +114,20 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Per_Image_Filtered AS (
-                 SELECT
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('per_image.parquet')
-             )
              SELECT
-                 *
+                 per_image.Metadata_ImageNumber,
+                 per_image.Image_Metadata_Well,
+                 per_image.Image_Metadata_Plate,
+                 per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                 per_cells.* EXCLUDE (Metadata_ImageNumber),
+                 per_nuclei.* EXCLUDE (Metadata_ImageNumber)
              FROM
-                 Per_Image_Filtered AS per_image
-                 LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
-                     per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
-                     per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
-                     AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
-                     per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
+                 read_parquet('per_cytoplasm.parquet') AS per_cytoplasm
+                 LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)
+             WHERE
+                 per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
@@ -178,29 +162,21 @@ config = {
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
-             WITH Image_Filtered AS (
-                 SELECT
-                     Metadata_TableNumber,
-                     Metadata_ImageNumber,
-                     Image_Metadata_Well,
-                     Image_Metadata_Plate
-                 FROM
-                     read_parquet('image.parquet')
-             )
              SELECT
-                 *
+                 image.Metadata_TableNumber,
+                 image.Metadata_ImageNumber,
+                 image.Image_Metadata_Well,
+                 image.Image_Metadata_Plate,
+                 cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
+                 nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
              FROM
-                 Image_Filtered AS image
-                 LEFT JOIN read_parquet('cytoplasm.parquet') AS cytoplasm ON
-                     cytoplasm.Metadata_TableNumber = image.Metadata_TableNumber
-                     AND cytoplasm.Metadata_ImageNumber = image.Metadata_ImageNumber
-                 LEFT JOIN read_parquet('cells.parquet') AS cells ON
-                     cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
-                     AND cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
-                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
-                     nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber
-                     AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                 read_parquet('cytoplasm.parquet') AS cytoplasm
+                 LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_TableNumber, Metadata_ImageNumber)
+                 LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_TableNumber, Metadata_ImageNumber)
+             WHERE
+                 cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
                  AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
          """,
      },
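Across all four presets the pattern is the same: the image-filtering CTE disappears, the join now starts from the cytoplasm compartment, shared keys collapse into `USING (...)`, and the parent-object conditions move from the `ON` clauses into a trailing `WHERE`. One semantic consequence worth noting: filtering a LEFT JOIN's right-hand columns in `WHERE` discards unmatched rows, so the result behaves like an inner join on those relationships. A toy reproduction, assuming `duckdb`:

```python
import duckdb

con = duckdb.connect()
con.execute(
    "CREATE TABLE cytoplasm AS SELECT 1 AS Metadata_ImageNumber, "
    "10 AS Metadata_Cytoplasm_Parent_Cells"
)
con.execute(
    "CREATE TABLE cells AS SELECT 1 AS Metadata_ImageNumber, "
    "10 AS Metadata_ObjectNumber, 0.5 AS Cells_AreaShape_Area"
)
print(
    con.execute(
        """
        SELECT
            cytoplasm.*,
            cells.* EXCLUDE (Metadata_ImageNumber)
        FROM
            cytoplasm
            LEFT JOIN cells USING (Metadata_ImageNumber)
        WHERE
            cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
        """
    ).fetchall()
)
```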
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Union, cast
  
  import duckdb
  import parsl
@@ -171,6 +171,8 @@ def _sqlite_mixed_type_query_to_parquet(
      table_name: str,
      chunk_size: int,
      offset: int,
+     sort_output: bool,
+     add_cytotable_meta: bool = False,
  ) -> str:
      """
      Performs SQLite table data extraction where one or many
@@ -186,6 +188,10 @@ def _sqlite_mixed_type_query_to_parquet(
              Row count to use for chunked output.
          offset: int:
              The offset for chunking the data from source.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.
+         add_cytotable_meta: bool, default=False:
+             Whether to add CytoTable metadata fields or not
  
      Returns:
          pyarrow.Table:
@@ -195,7 +201,10 @@ def _sqlite_mixed_type_query_to_parquet(
  
      import pyarrow as pa
  
-     from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
+     from cytotable.constants import (
+         CYOTABLE_META_COLUMN_TYPES,
+         SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
+     )
      from cytotable.exceptions import DatatypeException
  
      # open sqlite3 connection
@@ -207,7 +216,7 @@ def _sqlite_mixed_type_query_to_parquet(
      # See the following for more information:
      # https://sqlite.org/pragma.html#pragma_table_info
      cursor.execute(
-         f"""
+         """
          SELECT :table_name as table_name,
                 name as column_name,
                 type as column_type
@@ -255,10 +264,45 @@ def _sqlite_mixed_type_query_to_parquet(
          for col in column_info
      ]
  
+     if add_cytotable_meta:
+         query_parts += [
+             (
+                 f"CAST( '{f'{source_path}_table_{table_name}'}' "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
+                 "AS cytotable_meta_source_path"
+             ),
+             (
+                 f"CAST( {offset} "
+                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
+                 "AS cytotable_meta_offset"
+             ),
+             (
+                 f"CAST( (ROW_NUMBER() OVER ()) AS "
+                 f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
+                 "AS cytotable_meta_rownum"
+             ),
+         ]
+ 
      # perform the select using the cases built above and using chunksize + offset
-     cursor.execute(
-         f'SELECT {", ".join(query_parts)} FROM {table_name} LIMIT {chunk_size} OFFSET {offset};'
+     sql_stmt = (
+         f"""
+         SELECT
+             {', '.join(query_parts)}
+         FROM {table_name}
+         ORDER BY {', '.join([col['column_name'] for col in column_info])}
+         LIMIT {chunk_size} OFFSET {offset};
+         """
+         if sort_output
+         else f"""
+         SELECT
+             {', '.join(query_parts)}
+         FROM {table_name}
+         LIMIT {chunk_size} OFFSET {offset};
+         """
      )
+ 
+     # execute the sql stmt
+     cursor.execute(sql_stmt)
      # collect the results and include the column name with values
      results = [
          dict(zip([desc[0] for desc in cursor.description], row))
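The same determinism fix lands on the SQLite mixed-type path: when `sort_output` is set, the query orders by every column before applying LIMIT/OFFSET, since SQLite, like DuckDB, guarantees no row order without ORDER BY. The pattern in isolation, using only the standard library:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE Cells (ImageNumber INTEGER, ObjectNumber INTEGER)")
con.executemany("INSERT INTO Cells VALUES (?, ?)", [(i % 3, i) for i in range(10)])

column_names = ["ImageNumber", "ObjectNumber"]
chunk_size, offset = 4, 4
rows = con.execute(
    f"""
    SELECT * FROM Cells
    ORDER BY {', '.join(column_names)}
    LIMIT {chunk_size} OFFSET {offset};
    """
).fetchall()
print(rows)  # the same middle chunk on every run
```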
@@ -457,3 +501,97 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
          ),
          **kwargs,
      )
+ 
+ 
+ def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
+     """
+     Helper function to unwrap futures from values or return values
+     where there are no futures.
+ 
+     Args:
+         val: Union[parsl.dataflow.futures.AppFuture, Any]
+             A value which may or may not be a Parsl future which
+             needs to be evaluated.
+ 
+     Returns:
+         Any
+             Returns the value as-is if there's no future, the future
+             result if Parsl futures are encountered.
+     """
+ 
+     # if we have a future value, evaluate the result
+     if isinstance(val, parsl.dataflow.futures.AppFuture):
+         return val.result()
+     elif isinstance(val, list):
+         # if we have a list of futures, return the results
+         if isinstance(val[0], parsl.dataflow.futures.AppFuture):
+             return [elem.result() for elem in val]
+ 
+     # otherwise return the value
+     return val
+ 
+ 
+ def _unwrap_source(
+     source: Union[
+         Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+         Union[parsl.dataflow.futures.AppFuture, Any],
+     ]
+ ) -> Union[Dict[str, Any], Any]:
+     """
+     Helper function to unwrap futures from sources.
+ 
+     Args:
+         source: Union[
+             Dict[str, Union[parsl.dataflow.futures.AppFuture, Any]],
+             Union[parsl.dataflow.futures.AppFuture, Any],
+         ]
+             A source is a portion of an internal data structure used by
+             CytoTable for processing and organizing data results.
+     Returns:
+         Union[Dict[str, Any], Any]
+             An evaluated dictionary or other value type.
+     """
+     # if we have a dictionary, unwrap any values which may be futures
+     if isinstance(source, dict):
+         return {key: _unwrap_value(val) for key, val in source.items()}
+     else:
+         # otherwise try to unwrap the source as-is without dictionary nesting
+         return _unwrap_value(source)
+ 
+ 
+ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+     """
+     Evaluates any Parsl futures for use within other tasks.
+     This enables a pattern of Parsl app usage as "tasks" and delayed
+     future result evaluation for concurrency.
+ 
+     Args:
+         sources: Union[Dict[str, List[Dict[str, Any]]], str]
+             Sources are an internal data structure used by CytoTable for
+             processing and organizing data results. They may include futures
+             which require asynchronous processing through Parsl, so we
+             process them through this function.
+ 
+     Returns:
+         Union[Dict[str, List[Dict[str, Any]]], str]
+             A data structure which includes evaluated futures where they were found.
+     """
+ 
+     return (
+         {
+             source_group_name: [
+                 # unwrap sources into future results
+                 _unwrap_source(source)
+                 for source in (
+                     source_group_vals.result()
+                     # if we have a future, return the result
+                     if isinstance(source_group_vals, parsl.dataflow.futures.AppFuture)
+                     # otherwise return the value
+                     else source_group_vals
+                 )
+             ]
+             for source_group_name, source_group_vals in sources.items()
+             # if we have a dict, use the above, otherwise unwrap the value in case of future
+         }
+         if isinstance(sources, dict)
+         else _unwrap_value(sources)
+     )
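These helpers replace the removed `_return_future`/`join_app` machinery: tasks now hand futures around freely, and `evaluate_futures` resolves whole nested structures only when results are needed. The shape of the pattern, sketched with `concurrent.futures` stand-ins for Parsl AppFutures:

```python
from concurrent.futures import Future


def _unwrap_value(val):
    """Resolve a future or a list of futures; pass other values through."""
    if isinstance(val, Future):
        return val.result()
    if isinstance(val, list) and val and isinstance(val[0], Future):
        return [elem.result() for elem in val]
    return val


done = Future()
done.set_result("chunk-0.parquet")
sources = {"cells.sqlite": [{"table": [done], "offsets": [0, 50]}]}

evaluated = {
    name: [{key: _unwrap_value(val) for key, val in src.items()} for src in vals]
    for name, vals in sources.items()
}
print(evaluated)
# {'cells.sqlite': [{'table': ['chunk-0.parquet'], 'offsets': [0, 50]}]}
```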
cytotable-0.0.8.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.6
+ Version: 0.0.8
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
@@ -14,10 +14,14 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
- Requires-Dist: duckdb (>=0.8.0,<0.10.0)
+ Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
+ Requires-Dist: duckdb (>=0.10.1)
+ Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
+ Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
  Requires-Dist: parsl (>=2023.9.25)
  Requires-Dist: pyarrow (>=13.0.0)
+ Requires-Dist: scipy (<1.12.0) ; python_version < "3.9"
+ Requires-Dist: scipy (>=1.12.0,<2.0.0) ; python_version >= "3.9"
  Project-URL: Documentation, https://cytomining.github.io/CytoTable/
  Project-URL: Repository, https://github.com/cytomining/CytoTable
  Description-Content-Type: text/markdown
cytotable-0.0.8.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
+ cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
+ cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
+ cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
+ cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
+ cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
+ cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ cytotable-0.0.8.dist-info/RECORD,,
cytotable-0.0.6.dist-info/RECORD REMOVED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=BRJhTCcugpwKD1ONkiYUFjZMyCeO4t8f9161lrboXKY,315
- cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
- cytotable/convert.py,sha256=dXvzQPBel4Yp1zs_LZWQR1ZTV19G9WXCkrlTSXV6eWQ,51590
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=SYZXh0-eK-2VRRd8I30GCQcZ4wDMmhGes8KdDsxpFqg,10771
- cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
- cytotable/utils.py,sha256=9zqLf_95-phH6IdsDgpK3g3NkDG4odx0NUWogQDs31k,14344
- cytotable-0.0.6.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.6.dist-info/METADATA,sha256=j-BSYzl7cjaxsSR74luw-zvpPofTCYXVEBO1JIetvY0,3189
- cytotable-0.0.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.6.dist-info/RECORD,,