CytoTable 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/convert.py +11 -18
- cytotable/presets.py +48 -0
- cytotable/sources.py +45 -16
- cytotable/utils.py +12 -7
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/METADATA +2 -2
- cytotable-0.0.9.dist-info/RECORD +11 -0
- cytotable-0.0.8.dist-info/RECORD +0 -11
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/LICENSE +0 -0
- {cytotable-0.0.8.dist-info → cytotable-0.0.9.dist-info}/WHEEL +0 -0
cytotable/__init__.py
CHANGED
cytotable/convert.py
CHANGED
@@ -46,11 +46,12 @@ def _get_table_columns_and_types(
|
|
46
46
|
import pathlib
|
47
47
|
|
48
48
|
import duckdb
|
49
|
+
from cloudpathlib import AnyPath
|
49
50
|
|
50
51
|
from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
|
51
52
|
|
52
53
|
source_path = source["source_path"]
|
53
|
-
source_type = str(
|
54
|
+
source_type = str(source_path.suffix).lower()
|
54
55
|
|
55
56
|
# prepare the data source in the form of a duckdb query
|
56
57
|
select_source = (
|
@@ -209,7 +210,7 @@ def _get_table_chunk_offsets(
|
|
209
210
|
import pathlib
|
210
211
|
|
211
212
|
import duckdb
|
212
|
-
from cloudpathlib import AnyPath
|
213
|
+
from cloudpathlib import AnyPath, CloudPath
|
213
214
|
|
214
215
|
from cytotable.exceptions import NoInputDataException
|
215
216
|
from cytotable.utils import _duckdb_reader
|
@@ -219,18 +220,9 @@ def _get_table_chunk_offsets(
|
|
219
220
|
if source is not None:
|
220
221
|
table_name = source["table_name"] if "table_name" in source.keys() else None
|
221
222
|
source_path = source["source_path"]
|
222
|
-
source_type = str(
|
223
|
+
source_type = str(source_path.suffix).lower()
|
223
224
|
|
224
225
|
try:
|
225
|
-
# for csv's, check that we have more than one row (a header and data values)
|
226
|
-
if (
|
227
|
-
source_type == ".csv"
|
228
|
-
and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
|
229
|
-
):
|
230
|
-
raise NoInputDataException(
|
231
|
-
f"Data file has 0 rows of values. Error in file: {source_path}"
|
232
|
-
)
|
233
|
-
|
234
226
|
# gather the total rowcount from csv or sqlite data input sources
|
235
227
|
with _duckdb_reader() as ddb_reader:
|
236
228
|
rowcount = int(
|
@@ -322,8 +314,8 @@ def _source_chunk_to_parquet(
|
|
322
314
|
|
323
315
|
# attempt to build dest_path
|
324
316
|
source_dest_path = (
|
325
|
-
f"{dest_path}/{str(
|
326
|
-
f"{str(
|
317
|
+
f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
|
318
|
+
f"{str(source['source_path'].parent.name).lower()}"
|
327
319
|
)
|
328
320
|
pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
|
329
321
|
|
@@ -364,11 +356,11 @@ def _source_chunk_to_parquet(
|
|
364
356
|
|
365
357
|
# build output query and filepath base
|
366
358
|
# (chunked output will append offset to keep output paths unique)
|
367
|
-
if str(
|
359
|
+
if str(source["source_path"].suffix).lower() == ".csv":
|
368
360
|
base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
|
369
361
|
result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
|
370
362
|
|
371
|
-
elif str(
|
363
|
+
elif str(source["source_path"].suffix).lower() == ".sqlite":
|
372
364
|
base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
|
373
365
|
result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
|
374
366
|
|
@@ -405,7 +397,7 @@ def _source_chunk_to_parquet(
|
|
405
397
|
# to handle the mixed types
|
406
398
|
if (
|
407
399
|
"Mismatch Type Error" in str(e)
|
408
|
-
and str(
|
400
|
+
and str(source["source_path"].suffix).lower() == ".sqlite"
|
409
401
|
):
|
410
402
|
_write_parquet_table_with_metadata(
|
411
403
|
# here we use sqlite instead of duckdb to extract
|
@@ -817,6 +809,7 @@ def _join_source_chunk(
|
|
817
809
|
exclude_meta_cols = [
|
818
810
|
f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
|
819
811
|
]
|
812
|
+
|
820
813
|
with _duckdb_reader() as ddb_reader:
|
821
814
|
result = ddb_reader.execute(
|
822
815
|
f"""
|
@@ -1114,7 +1107,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
|
|
1114
1107
|
else []
|
1115
1108
|
),
|
1116
1109
|
**kwargs,
|
1117
|
-
)
|
1110
|
+
)
|
1118
1111
|
|
1119
1112
|
# expand the destination path
|
1120
1113
|
expanded_dest_path = _expand_path(path=dest_path)
|
cytotable/presets.py
CHANGED
@@ -85,6 +85,54 @@ config = {
|
|
85
85
|
AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
|
86
86
|
""",
|
87
87
|
},
|
88
|
+
"cellprofiler_sqlite_cpg0016_jump": {
|
89
|
+
# version specifications using related references
|
90
|
+
"CONFIG_SOURCE_VERSION": {
|
91
|
+
"cellprofiler": "v4.0.0",
|
92
|
+
},
|
93
|
+
# names of source table compartments (for ex. cells.csv, etc.)
|
94
|
+
"CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
|
95
|
+
# names of source table metadata (for ex. image.csv, etc.)
|
96
|
+
"CONFIG_NAMES_METADATA": ("image",),
|
97
|
+
# column names in any compartment or metadata tables which contain
|
98
|
+
# unique names to avoid renaming
|
99
|
+
"CONFIG_IDENTIFYING_COLUMNS": (
|
100
|
+
"ImageNumber",
|
101
|
+
"ObjectNumber",
|
102
|
+
"Metadata_Well",
|
103
|
+
"Metadata_Plate",
|
104
|
+
"Parent_Cells",
|
105
|
+
"Parent_Nuclei",
|
106
|
+
),
|
107
|
+
# chunk size to use for join operations to help with possible performance issues
|
108
|
+
# note: this number is an estimate and may need changes contingent on data
|
109
|
+
# and system used by this library.
|
110
|
+
"CONFIG_CHUNK_SIZE": 1000,
|
111
|
+
# compartment and metadata joins performed using DuckDB SQL
|
112
|
+
# and modified at runtime as needed
|
113
|
+
"CONFIG_JOINS": """
|
114
|
+
SELECT
|
115
|
+
image.Image_TableNumber,
|
116
|
+
image.Metadata_ImageNumber,
|
117
|
+
image.Metadata_Plate,
|
118
|
+
image.Metadata_Well,
|
119
|
+
image.Image_Metadata_Site,
|
120
|
+
image.Image_Metadata_Row,
|
121
|
+
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
|
122
|
+
cells.* EXCLUDE (Metadata_ImageNumber),
|
123
|
+
nuclei.* EXCLUDE (Metadata_ImageNumber)
|
124
|
+
FROM
|
125
|
+
read_parquet('cytoplasm.parquet') AS cytoplasm
|
126
|
+
LEFT JOIN read_parquet('cells.parquet') AS cells ON
|
127
|
+
cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
128
|
+
AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
|
129
|
+
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
|
130
|
+
nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
131
|
+
AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
|
132
|
+
LEFT JOIN read_parquet('image.parquet') AS image ON
|
133
|
+
image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
|
134
|
+
""",
|
135
|
+
},
|
88
136
|
"cellprofiler_sqlite_pycytominer": {
|
89
137
|
# version specifications using related references
|
90
138
|
"CONFIG_SOURCE_VERSION": {
|
cytotable/sources.py
CHANGED
@@ -7,13 +7,11 @@ import pathlib
|
|
7
7
|
from typing import Any, Dict, List, Optional, Union
|
8
8
|
|
9
9
|
from cloudpathlib import AnyPath
|
10
|
-
from parsl.app.app import join_app, python_app
|
11
10
|
|
11
|
+
from cytotable.exceptions import NoInputDataException
|
12
12
|
|
13
|
-
|
14
|
-
def _build_path(
|
15
|
-
path: Union[str, pathlib.Path, AnyPath], **kwargs
|
16
|
-
) -> Union[pathlib.Path, AnyPath]:
|
13
|
+
|
14
|
+
def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
|
17
15
|
"""
|
18
16
|
Build a path client or return local path.
|
19
17
|
|
@@ -43,10 +41,9 @@ def _build_path(
|
|
43
41
|
return processed_path
|
44
42
|
|
45
43
|
|
46
|
-
@python_app
|
47
44
|
def _get_source_filepaths(
|
48
45
|
path: Union[pathlib.Path, AnyPath],
|
49
|
-
targets: List[str],
|
46
|
+
targets: Optional[List[str]] = None,
|
50
47
|
source_datatype: Optional[str] = None,
|
51
48
|
) -> Dict[str, List[Dict[str, Any]]]:
|
52
49
|
"""
|
@@ -75,7 +72,7 @@ def _get_source_filepaths(
|
|
75
72
|
|
76
73
|
if (targets is None or targets == []) and source_datatype is None:
|
77
74
|
raise DatatypeException(
|
78
|
-
|
75
|
+
"A source_datatype must be specified when using undefined compartments and metadata names."
|
79
76
|
)
|
80
77
|
|
81
78
|
# gathers files from provided path using compartments + metadata as a filter
|
@@ -87,9 +84,9 @@ def _get_source_filepaths(
|
|
87
84
|
for subpath in (
|
88
85
|
(path,)
|
89
86
|
# used if the source path is a single file
|
90
|
-
if
|
87
|
+
if path.is_file()
|
91
88
|
# iterates through a source directory
|
92
|
-
else (x for x in
|
89
|
+
else (x for x in path.glob("**/*") if x.is_file())
|
93
90
|
)
|
94
91
|
# ensure the subpaths meet certain specifications
|
95
92
|
if (
|
@@ -129,7 +126,8 @@ def _get_source_filepaths(
|
|
129
126
|
.arrow()["table_name"]
|
130
127
|
.to_pylist()
|
131
128
|
# make sure the table names match with compartment + metadata names
|
132
|
-
if
|
129
|
+
if targets is not None
|
130
|
+
and any(target.lower() in table_name.lower() for target in targets)
|
133
131
|
]
|
134
132
|
else:
|
135
133
|
# if we don't have sqlite source, append the existing element
|
@@ -181,7 +179,6 @@ def _get_source_filepaths(
|
|
181
179
|
return grouped_sources
|
182
180
|
|
183
181
|
|
184
|
-
@python_app
|
185
182
|
def _infer_source_datatype(
|
186
183
|
sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
|
187
184
|
) -> str:
|
@@ -230,7 +227,6 @@ def _infer_source_datatype(
|
|
230
227
|
return source_datatype
|
231
228
|
|
232
229
|
|
233
|
-
@python_app
|
234
230
|
def _filter_source_filepaths(
|
235
231
|
sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
|
236
232
|
) -> Dict[str, List[Dict[str, Any]]]:
|
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
|
|
260
256
|
if file["source_path"].stat().st_size > 0
|
261
257
|
# ensure the datatype matches the source datatype
|
262
258
|
and file["source_path"].suffix == f".{source_datatype}"
|
259
|
+
and _file_is_more_than_one_line(path=file["source_path"])
|
263
260
|
]
|
264
261
|
for filegroup, files in sources.items()
|
265
262
|
}
|
266
263
|
|
267
264
|
|
268
|
-
|
265
|
+
def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
|
266
|
+
"""
|
267
|
+
Check if the file has more than one line.
|
268
|
+
|
269
|
+
Args:
|
270
|
+
path (Union[pathlib.Path, AnyPath]):
|
271
|
+
The path to the file.
|
272
|
+
|
273
|
+
Returns:
|
274
|
+
bool:
|
275
|
+
True if the file has more than one line, False otherwise.
|
276
|
+
|
277
|
+
Raises:
|
278
|
+
NoInputDataException: If the file has zero lines.
|
279
|
+
"""
|
280
|
+
|
281
|
+
# if we don't have a sqlite file
|
282
|
+
# (we can't check sqlite files for lines)
|
283
|
+
if path.suffix.lower() != ".sqlite":
|
284
|
+
with path.open("r") as f:
|
285
|
+
try:
|
286
|
+
# read two lines, if the second is empty return false
|
287
|
+
return bool(f.readline() and f.readline())
|
288
|
+
|
289
|
+
except StopIteration:
|
290
|
+
# If we encounter the end of the file, it has only one line
|
291
|
+
raise NoInputDataException(
|
292
|
+
f"Data file has 0 rows of values. Error in file: {path}"
|
293
|
+
)
|
294
|
+
else:
|
295
|
+
return True
|
296
|
+
|
297
|
+
|
269
298
|
def _gather_sources(
|
270
299
|
source_path: str,
|
271
300
|
source_datatype: Optional[str] = None,
|
@@ -295,11 +324,11 @@ def _gather_sources(
|
|
295
324
|
_infer_source_datatype,
|
296
325
|
)
|
297
326
|
|
298
|
-
|
327
|
+
built_path = _build_path(path=source_path, **kwargs)
|
299
328
|
|
300
329
|
# gather filepaths which will be used as the basis for this work
|
301
330
|
sources = _get_source_filepaths(
|
302
|
-
path=
|
331
|
+
path=built_path, targets=targets, source_datatype=source_datatype
|
303
332
|
)
|
304
333
|
|
305
334
|
# infer or validate the source datatype based on source filepaths
|
cytotable/utils.py
CHANGED
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
|
|
149
149
|
INSTALL sqlite_scanner;
|
150
150
|
LOAD sqlite_scanner;
|
151
151
|
|
152
|
+
/* Install httpfs plugin to avoid error
|
153
|
+
https://github.com/duckdb/duckdb/issues/3243 */
|
154
|
+
INSTALL httpfs;
|
155
|
+
|
152
156
|
/*
|
153
157
|
Set threads available to duckdb
|
154
158
|
See the following for more information:
|
@@ -322,7 +326,7 @@ def _sqlite_mixed_type_query_to_parquet(
|
|
322
326
|
return pa.Table.from_pylist(results)
|
323
327
|
|
324
328
|
|
325
|
-
def _cache_cloudpath_to_local(path:
|
329
|
+
def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
|
326
330
|
"""
|
327
331
|
Takes a cloudpath and uses cache to convert to a local copy
|
328
332
|
for use in scenarios where remote work is not possible (sqlite).
|
@@ -337,24 +341,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
|
|
337
341
|
A local pathlib.Path to cached version of cloudpath file.
|
338
342
|
"""
|
339
343
|
|
340
|
-
candidate_path = AnyPath(path)
|
341
|
-
|
342
344
|
# check that the path is a file (caching won't work with a dir)
|
343
345
|
# and check that the file is of sqlite type
|
344
346
|
# (other file types will be handled remotely in cloud)
|
345
|
-
if
|
347
|
+
if (
|
348
|
+
isinstance(path, CloudPath)
|
349
|
+
and path.is_file()
|
350
|
+
and path.suffix.lower() == ".sqlite"
|
351
|
+
):
|
346
352
|
try:
|
347
353
|
# update the path to be the local filepath for reference in CytoTable ops
|
348
354
|
# note: incurs a data read which will trigger caching of the file
|
349
|
-
path =
|
355
|
+
path = pathlib.Path(path.fspath)
|
350
356
|
except InvalidPrefixError:
|
351
357
|
# share information about not finding a cloud path
|
352
358
|
logger.info(
|
353
359
|
"Did not detect a cloud path based on prefix. Defaulting to use local path operations."
|
354
360
|
)
|
355
361
|
|
356
|
-
|
357
|
-
return pathlib.Path(path)
|
362
|
+
return path
|
358
363
|
|
359
364
|
|
360
365
|
def _arrow_type_cast_if_specified(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: CytoTable
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.9
|
4
4
|
Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
|
5
5
|
Home-page: https://github.com/cytomining/CytoTable
|
6
6
|
License: BSD-3-Clause License
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.10
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
17
|
-
Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
|
17
|
+
Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
|
18
18
|
Requires-Dist: duckdb (>=0.10.1)
|
19
19
|
Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
|
20
20
|
Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
|
2
|
+
cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
|
3
|
+
cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
|
4
|
+
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
+
cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
|
6
|
+
cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
|
7
|
+
cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
|
8
|
+
cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
+
cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
|
10
|
+
cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
11
|
+
cytotable-0.0.9.dist-info/RECORD,,
|
cytotable-0.0.8.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
|
2
|
-
cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
|
3
|
-
cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
|
4
|
-
cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
|
5
|
-
cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
|
6
|
-
cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
|
7
|
-
cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
|
8
|
-
cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
|
9
|
-
cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
|
10
|
-
cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
11
|
-
cytotable-0.0.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|