PyPI - CytoTable - Versions diffs - 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl - Mend

CytoTable 0.0.10 (py3-none-any.whl) → 0.0.11 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

cytotable/__init__.py +1 -1
cytotable/convert.py +145 -14
cytotable/utils.py +71 -7
{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/METADATA +1 -1
cytotable-0.0.11.dist-info/RECORD +11 -0
{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/WHEEL +1 -1
cytotable-0.0.10.dist-info/RECORD +0 -11
{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/LICENSE +0 -0

cytotable/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@ __init__.py for cytotable
 """
 # note: version data is maintained by poetry-dynamic-versioning (do not edit)
-__version__ = "0.0.10"
+__version__ = "0.0.11"
 from .convert import convert
 from .exceptions import (

cytotable/convert.py CHANGED Viewed

@@ -173,6 +173,106 @@ def _prep_cast_column_data_types(
     return columns
+@python_app
+def _set_tablenumber(
+    sources: Dict[str, List[Dict[str, Any]]],
+    add_tablenumber: Optional[bool] = None,
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Gathers a "TableNumber" from the image table (if CSV) or
+    SQLite file (if SQLite source) which is a unique identifier
+    intended to help differentiate between imagenumbers
+    to create distinct records for single-cell profiles
+    referenced across multiple source data exports.
+    For example, ImageNumber column values from CellProfiler
+    will repeat across exports, meaning we may lose distinction
+    when combining multiple export files together through CytoTable.
+    Note:
+    - If using CSV data sources, the image.csv table is used for checksum
+      and sources are grouped by the parent directory of their source_path.
+    - If using SQLite data sources, the entire SQLite database is used for
+      checksum and sources are grouped by the SQLite file path itself.
+    Args:
+        sources: Dict[str, List[Dict[str, Any]]]
+            Contains metadata about data tables and related contents.
+            Each source dictionary must include a "source_path" key holding
+            a path-like object (with `suffix` and `parent` attributes).
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber.
+            - None: add a tablenumber only when more than one image table
+              group is detected.
+            - False: adds None as the tablenumber.
+            - True: always add a calculated tablenumber.
+    Returns:
+        Dict[str, List[Dict[str, Any]]]
+            New sources structure where each source has an added
+            "tablenumber" key. When tablenumbers are calculated, sources
+            which do not belong to a detected image table group are
+            left out of the result.
+    """
+    from cloudpathlib import AnyPath
+    from cytotable.utils import _gather_tablenumber_checksum
+    def _dataset_key(source_path: Any) -> str:
+        """
+        Build the key used to group sources into a single dataset.
+        """
+        # note: path suffixes include the leading dot (".sqlite"), so we
+        # compare with the dotted form; comparing with "sqlite" never matches
+        # and would group SQLite files by directory instead of by file.
+        if str(source_path.suffix).lower() == ".sqlite":
+            # SQLite: every table lives in the same file, use the file itself
+            return str(source_path)
+        # non-SQLite (for ex. CSV): tables of a dataset share a parent dir
+        return str(source_path.parent)
+    image_table_groups = {
+        # create a data structure with the dataset key for each dataset
+        # and the path which will be used to calculate the checksum.
+        _dataset_key(source["source_path"]): source["source_path"]
+        for source_group_name, source_group_vals in sources.items()
+        # use the image tables references only for the basis of
+        # these calculations.
+        if any(
+            value in str(AnyPath(source_group_name).stem).lower()
+            for value in ["image", "per_image"]
+        )
+        for source in source_group_vals
+    }
+    # determine if we need to add tablenumber data
+    if (
+        # case for detecting multiple image tables which need to be differentiated
+        add_tablenumber is None
+        and (len(image_table_groups) <= 1)
+    ) or (
+        # case for explicitly set no tablenumbers
+        add_tablenumber
+        is False
+    ):
+        return {
+            source_group_name: [
+                dict(
+                    source,
+                    **{
+                        "tablenumber": None,
+                    },
+                )
+                for source in source_group_vals
+            ]
+            for source_group_name, source_group_vals in sources.items()
+        }
+    # calculate one checksum per dataset key from the image table / SQLite file
+    tablenumber_table = {
+        group: _gather_tablenumber_checksum(path)
+        for group, path in image_table_groups.items()
+    }
+    # return a modified sources data structure with the tablenumber added.
+    # the same key function is used here as when building the groups above
+    # so that lookups always align with how the groups were formed.
+    return {
+        source_group_name: [
+            dict(
+                source,
+                **{"tablenumber": tablenumber_table[_dataset_key(source["source_path"])]},
+            )
+            for source in source_group_vals
+            if _dataset_key(source["source_path"]) in tablenumber_table
+        ]
+        for source_group_name, source_group_vals in sources.items()
+    }
 @python_app
 def _get_table_keyset_pagination_sets(
     chunk_size: int,
@@ -310,6 +410,18 @@ def _source_pageset_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
+    # build tablenumber segment addition (if necessary)
+    tablenumber_sql = (
+        # to become tablenumber in sql select later with bigint (8-byte integer)
+        # we cast here to bigint to avoid concat or join conflicts later due to
+        # misaligned automatic data typing.
+        f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
+        if source["tablenumber"] is not None
+        # don't introduce the column if we aren't supposed to add tablenumber
+        # as per parameter.
+        else ""
+    )
     # add source table columns
     casted_source_cols = [
         # here we cast the column to the specified type ensure the colname remains the same
@@ -317,8 +429,8 @@ def _source_pageset_to_parquet(
         for column in source["columns"]
     ]
-    # create selection statement from lists above
-    select_columns = ",".join(
+    # create selection statement from tablenumber_sql + lists above
+    select_columns = tablenumber_sql + ",".join(
         # if we should sort the output, add the metadata_cols
         casted_source_cols
         if sort_output
@@ -376,6 +488,7 @@ def _source_pageset_to_parquet(
                     page_key=source["page_key"],
                     pageset=pageset,
                     sort_output=sort_output,
+                    tablenumber=source["tablenumber"],
                 ),
                 where=result_filepath,
             )
@@ -994,8 +1107,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     sort_output: bool,
     page_keys: Dict[str, str],
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
     **kwargs,
-) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
     """
     Export data to parquet.
@@ -1137,6 +1251,12 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in invalid_files_dropped.items()
     }
+    # add tablenumber details, appending None if not add_tablenumber
+    tablenumber_prepared = _set_tablenumber(
+        sources=evaluate_futures(column_names_and_types_gathered),
+        add_tablenumber=add_tablenumber,
+    ).result()
     results = {
         source_group_name: [
             dict(
@@ -1165,7 +1285,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             for source in source_group_vals
         ]
         for source_group_name, source_group_vals in evaluate_futures(
-            column_names_and_types_gathered
+            tablenumber_prepared
         ).items()
     }
@@ -1244,15 +1364,19 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             ).result()
         ]
-        # concat our join chunks together as one cohesive dataset
-        # return results in common format which includes metadata
-        # for lineage and debugging
-        results = _concat_join_sources(
-            dest_path=expanded_dest_path,
-            join_sources=[join.result() for join in join_sources_result],
-            sources=evaluated_results,
-            sort_output=sort_output,
-        )
+        if concat:
+            # concat our join chunks together as one cohesive dataset
+            # return results in common format which includes metadata
+            # for lineage and debugging
+            results = _concat_join_sources(
+                dest_path=expanded_dest_path,
+                join_sources=[join.result() for join in join_sources_result],
+                sources=evaluated_results,
+                sort_output=sort_output,
+            )
+        else:
+            # else we leave the joined chunks as-is and return them
+            return evaluate_futures(join_sources_result)
     # wrap the final result as a future and return
     return evaluate_futures(results)
@@ -1273,12 +1397,13 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    add_tablenumber: Optional[bool] = None,
     page_keys: Optional[Dict[str, str]] = None,
     sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
     **kwargs,
-) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
     """
     Convert file-based data from various sources to Pycytominer-compatible standards.
@@ -1322,6 +1447,11 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
             https://arrow.apache.org/docs/python/api/datatypes.html
+        add_tablenumber: Optional[bool]
+            Whether to add a calculated tablenumber which helps differentiate
+            various repeated values (such as ObjectNumber) within source data.
+            Useful for processing multiple SQLite or CSV data sources together
+            to retain distinction from each dataset.
         page_keys: str:
             The table and column names to be used for key pagination.
             Uses the form: {"table_name":"column_name"}.
@@ -1462,6 +1592,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             infer_common_schema=infer_common_schema,
             drop_null=drop_null,
             data_type_cast_map=data_type_cast_map,
+            add_tablenumber=add_tablenumber,
             sort_output=sort_output,
             page_keys=cast(dict, page_keys),
             **kwargs,

cytotable/utils.py CHANGED Viewed

@@ -166,6 +166,12 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         https://duckdb.org/docs/sql/configuration#configuration-reference
         */
         PRAGMA preserve_insertion_order=FALSE;
+        /*
+        Disable progress bar from displaying (defaults to TRUE)
+        See earlier documentation references above for more information.
+        */
+        SET enable_progress_bar=FALSE;
         """,
     )
@@ -176,6 +182,7 @@ def _sqlite_mixed_type_query_to_parquet(
     page_key: str,
     pageset: Tuple[Union[int, float], Union[int, float]],
     sort_output: bool,
+    tablenumber: Optional[int] = None,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -195,6 +202,9 @@ def _sqlite_mixed_type_query_to_parquet(
             Specifies whether to sort cytotable output or not.
         add_cytotable_meta: bool, default=False:
             Whether to add CytoTable metadata fields or not
+        tablenumber: Optional[int], default=None:
+            An optional table number to append to the results.
+            Defaults to None.
     Returns:
         pyarrow.Table:
@@ -250,9 +260,19 @@ def _sqlite_mixed_type_query_to_parquet(
             # return the translated type for use in SQLite
             return translated_type[0]
+        # build tablenumber segment addition (if necessary)
+        tablenumber_sql = (
+            # to become tablenumber in sql select later with integer
+            f"CAST({tablenumber} AS INTEGER) as TableNumber, "
+            if tablenumber is not None
+            # if we don't have a tablenumber value, don't introduce the column
+            else ""
+        )
         # create cases for mixed-type handling in each column discovered above
-        query_parts = [
-            f"""
+        query_parts = tablenumber_sql + ", ".join(
+            [
+                f"""
             CASE
                 /* when the storage class type doesn't match the column, return nulltype */
                 WHEN typeof({col['column_name']}) !=
@@ -261,13 +281,14 @@ def _sqlite_mixed_type_query_to_parquet(
                 ELSE {col['column_name']}
             END AS {col['column_name']}
             """
-            for col in column_info
-        ]
+                for col in column_info
+            ]
+        )
         # perform the select using the cases built above and using chunksize + offset
         sql_stmt = f"""
             SELECT
-                {', '.join(query_parts)}
+                {query_parts}
             FROM {table_name}
             WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
             {"ORDER BY " + page_key if sort_output else ""};
@@ -476,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
     )
+def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
+    """
+    Compute a CRC-32 checksum of a file's contents for use as a unique
+    identifier across datasets. Approach referenced from cytominer-database:
+    https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129
+    Args:
+        pathname: str:
+            A path to the file which the checksum is generated from.
+        buffer_size: int:
+            Maximum number of bytes to read from the file per iteration.
+    Returns:
+        int
+            The checksum of the file contents as an unsigned 32-bit integer.
+    """
+    import os
+    import zlib
+    # never request more bytes per read than the file itself holds
+    chunk_size = min(buffer_size, os.path.getsize(pathname))
+    # the checksum of empty bytes is the starting point of the running value
+    checksum = zlib.crc32(b"")
+    with open(str(pathname), "rb") as stream:
+        # iter() with a sentinel stops as soon as a read returns no data
+        for chunk in iter(lambda: stream.read(chunk_size), b""):
+            # fold each chunk into the running checksum
+            checksum = zlib.crc32(chunk, checksum)
+    # mask the result to an unsigned 32-bit value
+    return checksum & 0xFFFFFFFF
 def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
     """
     Helper function to unwrap futures from values or return values
@@ -531,14 +593,16 @@ def _unwrap_source(
         return _unwrap_value(source)
-def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+def evaluate_futures(
+    sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
+) -> Any:
     """
     Evaluates any Parsl futures for use within other tasks.
     This enables a pattern of Parsl app usage as "tasks" and delayed
     future result evaluation for concurrency.
     Args:
-        sources: Union[Dict[str, List[Dict[str, Any]]], str]
+        sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
             Sources are an internal data structure used by CytoTable for
             processing and organizing data results. They may include futures
             which require asynchronous processing through Parsl, so we

{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.10
+Version: 0.0.11
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License

cytotable-0.0.11.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=KSVr7xOOrpmQ_ybzcsZkblTAzPIYEq7_bm-Cjc874FM,316
+cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+cytotable/convert.py,sha256=5VHnw0eGdfXTbSfeEoPAPVa-dtobM6VHkIJwscLe68M,60651
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=tywZg1Gr78ebLlOp8R7trkiV7jsQ4iiZt4B6qG6SrxY,22578
+cytotable-0.0.11.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.11.dist-info/METADATA,sha256=sOvdWxld2Ryyjd5bluZt8Z78uElg1CyWG0UIRJn0F8E,3424
+cytotable-0.0.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+cytotable-0.0.11.dist-info/RECORD,,

{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.0
+Generator: poetry-core 1.9.1
 Root-Is-Purelib: true
 Tag: py3-none-any

cytotable-0.0.10.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
-cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
-cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
-cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
-cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
-cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
-cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.10.dist-info/RECORD,,

{cytotable-0.0.10.dist-info → cytotable-0.0.11.dist-info}/LICENSE RENAMED Viewed

File without changes

CytoTable 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

CytoTable 0.0.10 (py3-none-any.whl) → 0.0.11 (py3-none-any.whl)