CytoTable 0.0.8__tar.gz → 0.0.10__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: CytoTable
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
5
5
  Home-page: https://github.com/cytomining/CytoTable
6
6
  License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
- Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
17
+ Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
18
18
  Requires-Dist: duckdb (>=0.10.1)
19
19
  Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
20
20
  Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -3,7 +3,7 @@ __init__.py for cytotable
3
3
  """
4
4
 
5
5
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
6
- __version__ = "0.0.8"
6
+ __version__ = "0.0.10"
7
7
 
8
8
  from .convert import convert
9
9
  from .exceptions import (
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
68
68
  ],
69
69
  }
70
70
 
71
- # metadata column names and types for internal use within CytoTable
72
- CYOTABLE_META_COLUMN_TYPES = {
73
- "cytotable_meta_source_path": "VARCHAR",
74
- "cytotable_meta_offset": "BIGINT",
75
- "cytotable_meta_rownum": "BIGINT",
76
- }
77
-
78
71
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
79
72
  "data-producer": "https://github.com/cytomining/CytoTable",
80
73
  "data-producer-version": str(_get_cytotable_version()),
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.
4
4
 
5
5
  import itertools
6
6
  import logging
7
- import uuid
8
7
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
9
8
 
10
9
  import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
33
32
 
34
33
  Args:
35
34
  source: Dict[str, Any]
36
- Contains the source data to be chunked. Represents a single
35
+ Contains source data details. Represents a single
37
36
  file or table of some kind.
38
37
  sort_output:
39
38
  Specifies whether to sort cytotable output or not.
@@ -43,14 +42,12 @@ def _get_table_columns_and_types(
43
42
  list of dictionaries which each include column level information
44
43
  """
45
44
 
46
- import pathlib
47
-
48
45
  import duckdb
49
46
 
50
47
  from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
51
48
 
52
49
  source_path = source["source_path"]
53
- source_type = str(pathlib.Path(source_path).suffix).lower()
50
+ source_type = str(source_path.suffix).lower()
54
51
 
55
52
  # prepare the data source in the form of a duckdb query
56
53
  select_source = (
@@ -88,7 +85,7 @@ def _get_table_columns_and_types(
88
85
  # with exception handling to read mixed-type data
89
86
  # using sqlite3 and special utility function
90
87
  try:
91
- # isolate using new connection to read data with chunk size + offset
88
+ # isolate using new connection to read data based on pageset
92
89
  # and export directly to parquet via duckdb (avoiding need to return data to python)
93
90
  # perform the query and create a list of dictionaries with the column data for table
94
91
  with _duckdb_reader() as ddb_reader:
@@ -108,13 +105,8 @@ def _get_table_columns_and_types(
108
105
  arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
109
106
  source_path=str(source["source_path"]),
110
107
  table_name=str(source["table_name"]),
111
- # chunk size is set to 5 as a limit similar
112
- # to above SQL within select_query variable
113
- chunk_size=5,
114
- # offset is set to 0 start at first row
115
- # result from table
116
- offset=0,
117
- add_cytotable_meta=False,
108
+ page_key=source["page_key"],
109
+ pageset=source["pagesets"][0],
118
110
  sort_output=sort_output,
119
111
  )
120
112
  with _duckdb_reader() as ddb_reader:
@@ -182,13 +174,14 @@ def _prep_cast_column_data_types(
182
174
 
183
175
 
184
176
  @python_app
185
- def _get_table_chunk_offsets(
177
+ def _get_table_keyset_pagination_sets(
186
178
  chunk_size: int,
179
+ page_key: str,
187
180
  source: Optional[Dict[str, Any]] = None,
188
181
  sql_stmt: Optional[str] = None,
189
- ) -> Union[List[int], None]:
182
+ ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
190
183
  """
191
- Get table data chunk offsets for later use in capturing segments
184
+ Get table data chunk keys for later use in capturing segments
192
185
  of values. This work also provides a chance to catch problematic
193
186
  input data which will be ignored with warnings.
194
187
 
@@ -198,51 +191,59 @@ def _get_table_chunk_offsets(
198
191
  file or table of some kind.
199
192
  chunk_size: int
200
193
  The size in rowcount of the chunks to create.
194
+ page_key: str
195
+ The column name to be used to identify pagination chunks.
196
+ Expected to be of numeric type (int, float) for ordering.
197
+ sql_stmt:
198
+ Optional sql statement to form the pagination set from.
199
+ Default behavior extracts pagination sets from the full
200
+ data source.
201
201
 
202
202
  Returns:
203
- List[int]
204
- List of integers which represent offsets to use for reading
205
- the data later on.
203
+ List[Any]
204
+ List of keys to use for reading the data later on.
206
205
  """
207
206
 
208
207
  import logging
209
- import pathlib
208
+ import sqlite3
209
+ from contextlib import closing
210
210
 
211
211
  import duckdb
212
- from cloudpathlib import AnyPath
213
212
 
214
213
  from cytotable.exceptions import NoInputDataException
215
- from cytotable.utils import _duckdb_reader
214
+ from cytotable.utils import _duckdb_reader, _generate_pagesets
216
215
 
217
216
  logger = logging.getLogger(__name__)
218
217
 
219
218
  if source is not None:
220
219
  table_name = source["table_name"] if "table_name" in source.keys() else None
221
220
  source_path = source["source_path"]
222
- source_type = str(pathlib.Path(source_path).suffix).lower()
221
+ source_type = str(source_path.suffix).lower()
223
222
 
224
223
  try:
225
- # for csv's, check that we have more than one row (a header and data values)
226
- if (
227
- source_type == ".csv"
228
- and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
229
- ):
230
- raise NoInputDataException(
231
- f"Data file has 0 rows of values. Error in file: {source_path}"
232
- )
233
-
234
- # gather the total rowcount from csv or sqlite data input sources
235
224
  with _duckdb_reader() as ddb_reader:
236
- rowcount = int(
237
- ddb_reader.execute(
238
- # nosec
239
- f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
240
- if source_type == ".csv"
241
- else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
242
- ).fetchone()[0]
243
- )
225
+ if source_type == ".csv":
226
+ sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
227
+ else:
228
+ sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
229
+
230
+ page_keys = [
231
+ results[0] for results in ddb_reader.execute(sql_query).fetchall()
232
+ ]
233
+
234
+ # exception case for when we have mixed types
235
+ # (i.e. integer col with string and ints) in a sqlite column
236
+ except duckdb.TypeMismatchException:
237
+ with closing(sqlite3.connect(source_path)) as cx:
238
+ with cx:
239
+ page_keys = [
240
+ key[0]
241
+ for key in cx.execute(
242
+ f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
243
+ ).fetchall()
244
+ if isinstance(key[0], (int, float))
245
+ ]
244
246
 
245
- # catch input errors which will result in skipped files
246
247
  except (
247
248
  duckdb.InvalidInputException,
248
249
  NoInputDataException,
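For orientation, a minimal editorial sketch of the page-key extraction pattern used above for a CSV source; the file path and column name are hypothetical placeholders, not values from this diff.

```python
# Hypothetical page-key extraction for a CSV source, mirroring the
# read_csv_auto query above (file path and column name are assumptions).
import duckdb

source_path = "per_cells.csv"   # assumed input file
page_key = "ObjectNumber"       # assumed numeric pagination column

ddb = duckdb.connect()
sql_query = (
    f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') "
    f"ORDER BY {page_key}"
)
# ordered key values later grouped into pagesets of chunk_size
page_keys = [row[0] for row in ddb.execute(sql_query).fetchall()]
ddb.close()
```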
@@ -253,34 +254,20 @@ def _get_table_chunk_offsets(
253
254
 
254
255
  return None
255
256
 
256
- # find chunk offsets from sql statement
257
257
  elif sql_stmt is not None:
258
- # gather the total rowcount from csv or sqlite data input sources
259
258
  with _duckdb_reader() as ddb_reader:
260
- rowcount = int(
261
- ddb_reader.execute(
262
- # nosec
263
- f"SELECT COUNT(*) FROM ({sql_stmt})"
264
- ).fetchone()[0]
265
- )
259
+ sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
260
+ page_keys = ddb_reader.execute(sql_query).fetchall()
261
+ page_keys = [key[0] for key in page_keys]
266
262
 
267
- return list(
268
- range(
269
- 0,
270
- # gather rowcount from table and use as maximum for range
271
- rowcount,
272
- # step through using chunk size
273
- chunk_size,
274
- )
275
- )
263
+ return _generate_pagesets(page_keys, chunk_size)
276
264
 
277
265
 
278
266
  @python_app
279
- def _source_chunk_to_parquet(
267
+ def _source_pageset_to_parquet(
280
268
  source_group_name: str,
281
269
  source: Dict[str, Any],
282
- chunk_size: int,
283
- offset: int,
270
+ pageset: Tuple[Union[int, float], Union[int, float]],
284
271
  dest_path: str,
285
272
  sort_output: bool,
286
273
  ) -> str:
@@ -293,10 +280,8 @@ def _source_chunk_to_parquet(
293
280
  source: Dict[str, Any]
294
281
  Contains the source data to be chunked. Represents a single
295
282
  file or table of some kind along with collected information about table.
296
- chunk_size: int
297
- Row count to use for chunked output.
298
- offset: int
299
- The offset for chunking the data from source.
283
+ pageset: Tuple[int, int]
284
+ The pageset for chunking the data from source.
300
285
  dest_path: str
301
286
  Path to store the output data.
302
287
  sort_output: bool
@@ -311,9 +296,7 @@ def _source_chunk_to_parquet(
311
296
 
312
297
  import duckdb
313
298
  from cloudpathlib import AnyPath
314
- from pyarrow import parquet
315
299
 
316
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
317
300
  from cytotable.utils import (
318
301
  _duckdb_reader,
319
302
  _sqlite_mixed_type_query_to_parquet,
@@ -322,31 +305,11 @@ def _source_chunk_to_parquet(
322
305
 
323
306
  # attempt to build dest_path
324
307
  source_dest_path = (
325
- f"{dest_path}/{str(pathlib.Path(source_group_name).stem).lower()}/"
326
- f"{str(pathlib.Path(source['source_path']).parent.name).lower()}"
308
+ f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
309
+ f"{str(source['source_path'].parent.name).lower()}"
327
310
  )
328
311
  pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
329
312
 
330
- source_path_str = (
331
- source["source_path"]
332
- if "table_name" not in source.keys()
333
- else f"{source['source_path']}_table_{source['table_name']}"
334
- )
335
- # build the column selection block of query
336
-
337
- # add cytotable metadata columns
338
- cytotable_metadata_cols = [
339
- (
340
- f"CAST( '{source_path_str}' "
341
- f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
342
- ' AS "cytotable_meta_source_path"'
343
- ),
344
- f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
345
- (
346
- f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
347
- ' AS "cytotable_meta_rownum"'
348
- ),
349
- ]
350
313
  # add source table columns
351
314
  casted_source_cols = [
352
315
  # here we cast the column to the specified type ensure the colname remains the same
@@ -357,22 +320,23 @@ def _source_chunk_to_parquet(
357
320
  # create selection statement from lists above
358
321
  select_columns = ",".join(
359
322
  # if we should sort the output, add the metadata_cols
360
- cytotable_metadata_cols + casted_source_cols
323
+ casted_source_cols
361
324
  if sort_output
362
325
  else casted_source_cols
363
326
  )
364
327
 
365
328
  # build output query and filepath base
366
329
  # (chunked output will append offset to keep output paths unique)
367
- if str(AnyPath(source["source_path"]).suffix).lower() == ".csv":
330
+ if str(source["source_path"].suffix).lower() == ".csv":
368
331
  base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
369
332
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
370
333
 
371
- elif str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite":
334
+ elif str(source["source_path"].suffix).lower() == ".sqlite":
372
335
  base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
373
336
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
374
337
 
375
- result_filepath = f"{result_filepath_base}-{offset}.parquet"
338
+ # form a filepath which indicates the pageset
339
+ result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"
376
340
 
377
341
  # Attempt to read the data to parquet file
378
342
  # using duckdb for extraction and pyarrow for
@@ -385,14 +349,9 @@ def _source_chunk_to_parquet(
385
349
  table=ddb_reader.execute(
386
350
  f"""
387
351
  {base_query}
388
- /* order by all columns for deterministic output */
389
- ORDER BY ALL
390
- LIMIT {chunk_size} OFFSET {offset}
391
- """
392
- if sort_output
393
- else f"""
394
- {base_query}
395
- LIMIT {chunk_size} OFFSET {offset}
352
+ WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
353
+ /* optional ordering per pageset */
354
+ {"ORDER BY " + source['page_key'] if sort_output else ""};
396
355
  """
397
356
  ).arrow(),
398
357
  where=result_filepath,
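As a side note, a small sketch of how the pageset filter above renders into a concrete query for a single page; the table name, key column, and range below are hypothetical placeholders.

```python
# Hypothetical rendering of the pageset-filtered query built above.
base_query = "SELECT * FROM sqlite_scan('example.sqlite', 'Per_Cells')"  # assumed source
page_key = "ObjectNumber"   # assumed pagination column
pageset = (1, 1000)         # assumed (start_key, end_key) range
sort_output = True

print(
    f"""
    {base_query}
    WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
    /* optional ordering per pageset */
    {"ORDER BY " + page_key if sort_output else ""};
    """
)
```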
@@ -405,7 +364,7 @@ def _source_chunk_to_parquet(
405
364
  # to handle the mixed types
406
365
  if (
407
366
  "Mismatch Type Error" in str(e)
408
- and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
367
+ and str(source["source_path"].suffix).lower() == ".sqlite"
409
368
  ):
410
369
  _write_parquet_table_with_metadata(
411
370
  # here we use sqlite instead of duckdb to extract
@@ -414,9 +373,8 @@ def _source_chunk_to_parquet(
414
373
  table=_sqlite_mixed_type_query_to_parquet(
415
374
  source_path=str(source["source_path"]),
416
375
  table_name=str(source["table_name"]),
417
- chunk_size=chunk_size,
418
- offset=offset,
419
- add_cytotable_meta=True if sort_output else False,
376
+ page_key=source["page_key"],
377
+ pageset=pageset,
420
378
  sort_output=sort_output,
421
379
  ),
422
380
  where=result_filepath,
@@ -466,10 +424,7 @@ def _prepend_column_name(
466
424
 
467
425
  import pyarrow.parquet as parquet
468
426
 
469
- from cytotable.constants import (
470
- CYOTABLE_META_COLUMN_TYPES,
471
- CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
472
- )
427
+ from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
473
428
  from cytotable.utils import _write_parquet_table_with_metadata
474
429
 
475
430
  logger = logging.getLogger(__name__)
@@ -480,7 +435,7 @@ def _prepend_column_name(
480
435
  if len(targets) == 0:
481
436
  logger.warning(
482
437
  msg=(
483
- "Skipping column name prepend operations"
438
+ "Skipping column name prepend operations "
484
439
  "because no compartments or metadata were provided."
485
440
  )
486
441
  )
@@ -517,10 +472,8 @@ def _prepend_column_name(
517
472
  # source_group_name_stem: 'Cells'
518
473
  # column_name: 'AreaShape_Area'
519
474
  # updated_column_name: 'Cells_AreaShape_Area'
520
- if (
521
- column_name not in identifying_columns
522
- and not column_name.startswith(source_group_name_stem.capitalize())
523
- and column_name not in CYOTABLE_META_COLUMN_TYPES
475
+ if column_name not in identifying_columns and not column_name.startswith(
476
+ source_group_name_stem.capitalize()
524
477
  ):
525
478
  updated_column_names.append(f"{source_group_name_stem}_{column_name}")
526
479
  # if-condition for prepending 'Metadata_' to column name
@@ -582,6 +535,7 @@ def _concat_source_group(
582
535
  source_group: List[Dict[str, Any]],
583
536
  dest_path: str,
584
537
  common_schema: Optional[List[Tuple[str, str]]] = None,
538
+ sort_output: bool = True,
585
539
  ) -> List[Dict[str, Any]]:
586
540
  """
587
541
  Concatenate group of source data together as single file.
@@ -628,6 +582,8 @@ def _concat_source_group(
628
582
  common_schema: List[Tuple[str, str]] (Default value = None)
629
583
  Common schema to use for concatenation amongst arrow tables
630
584
  which may have slightly different but compatible schema.
585
+ sort_output: bool
586
+ Specifies whether to sort cytotable output or not.
631
587
 
632
588
  Returns:
633
589
  List[Dict[str, Any]]
@@ -645,7 +601,7 @@ def _concat_source_group(
645
601
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
646
602
  )
647
603
  from cytotable.exceptions import SchemaException
648
- from cytotable.utils import _write_parquet_table_with_metadata
604
+ from cytotable.utils import _natural_sort
649
605
 
650
606
  # build a result placeholder
651
607
  concatted: List[Dict[str, Any]] = [
@@ -684,7 +640,10 @@ def _concat_source_group(
684
640
  # (all must be the same schema)
685
641
  with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
686
642
  for source in source_group:
687
- for table in [table for table in source["table"]]:
643
+ tables = [table for table in source["table"]]
644
+ if sort_output:
645
+ tables = _natural_sort(tables)
646
+ for table in tables:
688
647
  # if we haven't inferred the common schema
689
648
  # check that our file matches the expected schema, otherwise raise an error
690
649
  if common_schema is None and not writer_schema.equals(
@@ -728,7 +687,6 @@ def _concat_source_group(
728
687
  def _prepare_join_sql(
729
688
  sources: Dict[str, List[Dict[str, Any]]],
730
689
  joins: str,
731
- sort_output: bool,
732
690
  ) -> str:
733
691
  """
734
692
  Prepare join SQL statement with actual locations of data based on the sources.
@@ -749,8 +707,6 @@ def _prepare_join_sql(
749
707
  """
750
708
  import pathlib
751
709
 
752
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
753
-
754
710
  # replace with real location of sources for join sql
755
711
  order_by_tables = []
756
712
  for key, val in sources.items():
@@ -762,25 +718,17 @@ def _prepare_join_sql(
762
718
  )
763
719
  order_by_tables.append(table_name)
764
720
 
765
- # create order by statement with from all tables using cytotable metadata
766
- order_by_sql = "ORDER BY " + ", ".join(
767
- [
768
- f"{table}.{meta_column}"
769
- for table in order_by_tables
770
- for meta_column in CYOTABLE_META_COLUMN_TYPES
771
- ]
772
- )
773
-
774
721
  # add the order by statements to the join
775
- return joins + order_by_sql if sort_output else joins
722
+ return joins
776
723
 
777
724
 
778
725
  @python_app
779
- def _join_source_chunk(
726
+ def _join_source_pageset(
780
727
  dest_path: str,
781
728
  joins: str,
782
- chunk_size: int,
783
- offset: int,
729
+ page_key: str,
730
+ pageset: Tuple[int, int],
731
+ sort_output: bool,
784
732
  drop_null: bool,
785
733
  ) -> str:
786
734
  """
@@ -806,30 +754,20 @@ def _join_source_chunk(
806
754
 
807
755
  import pathlib
808
756
 
809
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
810
757
  from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
811
758
 
812
- # Attempt to read the data to parquet file
813
- # using duckdb for extraction and pyarrow for
814
- # writing data to a parquet file.
815
- # read data with chunk size + offset
816
- # and export to parquet
817
- exclude_meta_cols = [
818
- f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
819
- ]
820
759
  with _duckdb_reader() as ddb_reader:
821
760
  result = ddb_reader.execute(
822
761
  f"""
823
- WITH joined AS (
762
+ WITH joined AS (
824
763
  {joins}
825
- LIMIT {chunk_size} OFFSET {offset}
826
- )
827
- SELECT
828
- /* exclude metadata columns from the results
829
- by using a lambda on column names based on exclude_meta_cols. */
830
- COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
831
- FROM joined;
832
- """
764
+ )
765
+ SELECT *
766
+ FROM joined
767
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
768
+ /* optional sorting per pageset */
769
+ {"ORDER BY " + page_key if sort_output else ""};
770
+ """
833
771
  ).arrow()
834
772
 
835
773
  # drop nulls if specified
@@ -854,10 +792,8 @@ def _join_source_chunk(
854
792
  f"{str(pathlib.Path(dest_path).parent)}/"
855
793
  # use the dest_path stem in the name
856
794
  f"{str(pathlib.Path(dest_path).stem)}-"
857
- # give the join chunk result a unique to arbitrarily
858
- # differentiate from other chunk groups which are mapped
859
- # and before they are brought together as one dataset
860
- f"{str(uuid.uuid4().hex)}.parquet"
795
+ # add the pageset indication to the filename
796
+ f"{pageset[0]}-{pageset[1]}.parquet"
861
797
  )
862
798
 
863
799
  # write the result
@@ -874,6 +810,7 @@ def _concat_join_sources(
874
810
  sources: Dict[str, List[Dict[str, Any]]],
875
811
  dest_path: str,
876
812
  join_sources: List[str],
813
+ sort_output: bool = True,
877
814
  ) -> str:
878
815
  """
879
816
  Concatenate join sources from parquet-based chunks.
@@ -890,6 +827,8 @@ def _concat_join_sources(
890
827
  join_sources: List[str]:
891
828
  List of local filepath destination for join source chunks
892
829
  which will be concatenated.
830
+ sort_output: bool
831
+ Specifies whether to sort cytotable output or not.
893
832
 
894
833
  Returns:
895
834
  str
@@ -905,7 +844,7 @@ def _concat_join_sources(
905
844
  CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
906
845
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
907
846
  )
908
- from cytotable.utils import _write_parquet_table_with_metadata
847
+ from cytotable.utils import _natural_sort
909
848
 
910
849
  # remove the unjoined concatted compartments to prepare final dest_path usage
911
850
  # (we now have joined results)
@@ -925,7 +864,11 @@ def _concat_join_sources(
925
864
  CYTOTABLE_DEFAULT_PARQUET_METADATA
926
865
  )
927
866
  with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
928
- for table_path in join_sources:
867
+ for table_path in (
868
+ join_sources
869
+ if not sort_output
870
+ else _natural_sort(list_to_sort=join_sources)
871
+ ):
929
872
  writer.write_table(
930
873
  parquet.read_table(
931
874
  table_path,
@@ -1049,6 +992,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1049
992
  infer_common_schema: bool,
1050
993
  drop_null: bool,
1051
994
  sort_output: bool,
995
+ page_keys: Dict[str, str],
1052
996
  data_type_cast_map: Optional[Dict[str, str]] = None,
1053
997
  **kwargs,
1054
998
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1089,6 +1033,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1089
1033
  Whether to drop null results.
1090
1034
  sort_output: bool
1091
1035
  Specifies whether to sort cytotable output or not.
1036
+ page_keys: Dict[str, str]
1037
+ A dictionary which defines which column names are used for keyset pagination
1038
+ in order to perform data extraction.
1092
1039
  data_type_cast_map: Dict[str, str]
1093
1040
  A dictionary mapping data type groups to specific types.
1094
1041
  Roughly includes Arrow data types language from:
@@ -1114,21 +1061,40 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1114
1061
  else []
1115
1062
  ),
1116
1063
  **kwargs,
1117
- ).result()
1064
+ )
1118
1065
 
1119
1066
  # expand the destination path
1120
1067
  expanded_dest_path = _expand_path(path=dest_path)
1121
1068
 
1122
- # prepare offsets for chunked data export from source tables
1123
- offsets_prepared = {
1069
+ # check that each source group name has a pagination key
1070
+ for source_group_name in sources.keys():
1071
+ matching_keys = [
1072
+ key for key in page_keys.keys() if key.lower() in source_group_name.lower()
1073
+ ]
1074
+ if not matching_keys:
1075
+ raise CytoTableException(
1076
+ f"No matching key found in page_keys for source_group_name: {source_group_name}."
1077
+ "Please include a pagination key based on a column name from the table."
1078
+ )
1079
+
1080
+ # prepare pagesets for chunked data export from source tables
1081
+ pagesets_prepared = {
1124
1082
  source_group_name: [
1125
1083
  dict(
1126
1084
  source,
1127
1085
  **{
1128
- "offsets": _get_table_chunk_offsets(
1086
+ "page_key": (
1087
+ page_key := [
1088
+ value
1089
+ for key, value in page_keys.items()
1090
+ if key.lower() in source_group_name.lower()
1091
+ ][0]
1092
+ ),
1093
+ "pagesets": _get_table_keyset_pagination_sets(
1129
1094
  source=source,
1130
1095
  chunk_size=chunk_size,
1131
- )
1096
+ page_key=page_key,
1097
+ ),
1132
1098
  },
1133
1099
  )
1134
1100
  for source in source_group_vals
@@ -1136,17 +1102,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1136
1102
  for source_group_name, source_group_vals in sources.items()
1137
1103
  }
1138
1104
 
1139
- # if offsets is none and we haven't halted, remove the file as there
1105
+ # if pagesets is none and we haven't halted, remove the file as there
1140
1106
  # were input formatting errors which will create challenges downstream
1141
1107
  invalid_files_dropped = {
1142
1108
  source_group_name: [
1143
- # ensure we have offsets
1109
+ # ensure we have pagesets
1144
1110
  source
1145
1111
  for source in source_group_vals
1146
- if source["offsets"] is not None
1112
+ if source["pagesets"] is not None
1147
1113
  ]
1148
1114
  for source_group_name, source_group_vals in evaluate_futures(
1149
- offsets_prepared
1115
+ pagesets_prepared
1150
1116
  ).items()
1151
1117
  # ensure we have source_groups with at least one source table
1152
1118
  if len(source_group_vals) > 0
@@ -1179,12 +1145,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1179
1145
  "table": [
1180
1146
  # perform column renaming and create potential return result
1181
1147
  _prepend_column_name(
1182
- # perform chunked data export to parquet using offsets
1183
- table_path=_source_chunk_to_parquet(
1148
+ # perform chunked data export to parquet using pagesets
1149
+ table_path=_source_pageset_to_parquet(
1184
1150
  source_group_name=source_group_name,
1185
1151
  source=source,
1186
- chunk_size=chunk_size,
1187
- offset=offset,
1152
+ pageset=pageset,
1188
1153
  dest_path=expanded_dest_path,
1189
1154
  sort_output=sort_output,
1190
1155
  ),
@@ -1193,7 +1158,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1193
1158
  metadata=metadata,
1194
1159
  compartments=compartments,
1195
1160
  )
1196
- for offset in source["offsets"]
1161
+ for pageset in source["pagesets"]
1197
1162
  ]
1198
1163
  },
1199
1164
  )
@@ -1234,6 +1199,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1234
1199
  source_group=source_group_vals[0]["sources"],
1235
1200
  dest_path=expanded_dest_path,
1236
1201
  common_schema=source_group_vals[0]["common_schema"],
1202
+ sort_output=sort_output,
1237
1203
  )
1238
1204
  for source_group_name, source_group_vals in evaluate_futures(
1239
1205
  common_schema_determined
@@ -1247,28 +1213,34 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1247
1213
  evaluated_results = evaluate_futures(results)
1248
1214
 
1249
1215
  prepared_joins_sql = _prepare_join_sql(
1250
- sources=evaluated_results, joins=joins, sort_output=sort_output
1216
+ sources=evaluated_results, joins=joins
1251
1217
  ).result()
1252
1218
 
1219
+ page_key_join = [
1220
+ value for key, value in page_keys.items() if key.lower() == "join"
1221
+ ][0]
1222
+
1253
1223
  # map joined results based on the join groups gathered above
1254
1224
  # note: after mapping we end up with a list of strings (task returns str)
1255
1225
  join_sources_result = [
1256
- _join_source_chunk(
1226
+ _join_source_pageset(
1257
1227
  # gather the result of concatted sources prior to
1258
1228
  # join group merging as each mapped task run will need
1259
1229
  # full concat results
1260
1230
  dest_path=expanded_dest_path,
1261
1231
  joins=prepared_joins_sql,
1262
- chunk_size=chunk_size,
1263
- offset=offset,
1232
+ page_key=page_key_join,
1233
+ pageset=pageset,
1234
+ sort_output=sort_output,
1264
1235
  drop_null=drop_null,
1265
1236
  )
1266
1237
  # create join group for querying the concatenated
1267
1238
  # data in order to perform memory-safe joining
1268
1239
  # per user chunk size specification.
1269
- for offset in _get_table_chunk_offsets(
1240
+ for pageset in _get_table_keyset_pagination_sets(
1270
1241
  sql_stmt=prepared_joins_sql,
1271
1242
  chunk_size=chunk_size,
1243
+ page_key=page_key_join,
1272
1244
  ).result()
1273
1245
  ]
1274
1246
 
@@ -1279,6 +1251,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1279
1251
  dest_path=expanded_dest_path,
1280
1252
  join_sources=[join.result() for join in join_sources_result],
1281
1253
  sources=evaluated_results,
1254
+ sort_output=sort_output,
1282
1255
  )
1283
1256
 
1284
1257
  # wrap the final result as a future and return
@@ -1300,6 +1273,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1300
1273
  infer_common_schema: bool = True,
1301
1274
  drop_null: bool = False,
1302
1275
  data_type_cast_map: Optional[Dict[str, str]] = None,
1276
+ page_keys: Optional[Dict[str, str]] = None,
1303
1277
  sort_output: bool = True,
1304
1278
  preset: Optional[str] = "cellprofiler_csv",
1305
1279
  parsl_config: Optional[parsl.Config] = None,
@@ -1348,6 +1322,12 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1348
1322
  A dictionary mapping data type groups to specific types.
1349
1323
  Roughly includes Arrow data types language from:
1350
1324
  https://arrow.apache.org/docs/python/api/datatypes.html
1325
+ page_keys: Optional[Dict[str, str]] (Default value = None)
1326
+ The table and column names to be used for key pagination.
1327
+ Uses the form: {"table_name":"column_name"}.
1328
+ Expects columns to include numeric data (ints or floats).
1329
+ Interacts with the `chunk_size` parameter to form
1330
+ pages of `chunk_size`.
1351
1331
  sort_output: bool (Default value = True)
1352
1332
  Specifies whether to sort cytotable output or not.
1353
1333
  drop_null: bool (Default value = False)
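To make the new parameter concrete, a hedged usage sketch follows; the file paths are placeholders, and the page_keys values mirror the cellprofiler_sqlite_pycytominer preset shown later in this diff.

```python
# Minimal sketch of convert() with explicit pagination keys
# (paths are placeholders; key values mirror the preset config in this diff).
import cytotable

cytotable.convert(
    source_path="example.sqlite",
    dest_path="example.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    chunk_size=1000,
    page_keys={
        "image": "ImageNumber",
        "cells": "Cells_Number_Object_Number",
        "nuclei": "Nuclei_Number_Object_Number",
        "cytoplasm": "Cytoplasm_Number_Object_Number",
        "join": "Cytoplasm_Number_Object_Number",
    },
)
```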
@@ -1447,6 +1427,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1447
1427
  if chunk_size is None
1448
1428
  else chunk_size
1449
1429
  )
1430
+ page_keys = (
1431
+ cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
1432
+ if page_keys is None
1433
+ else page_keys
1434
+ )
1435
+
1436
+ # Raise an exception for scenarios where one configures CytoTable to join
1437
+ # but does not provide a pagination key for the joins.
1438
+ if join and (page_keys is None or "join" not in page_keys.keys()):
1439
+ raise CytoTableException(
1440
+ (
1441
+ "When using join=True one must pass a 'join' pagination key "
1442
+ "in the page_keys parameter. The 'join' pagination key is a column "
1443
+ "name found within the joined results based on the SQL provided from "
1444
+ "the joins parameter. This special key is required as not all columns "
1445
+ "from the source tables might not be included."
1446
+ )
1447
+ )
1450
1448
 
1451
1449
  # send sources to be written to parquet if selected
1452
1450
  if dest_datatype == "parquet":
@@ -1465,6 +1463,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1465
1463
  drop_null=drop_null,
1466
1464
  data_type_cast_map=data_type_cast_map,
1467
1465
  sort_output=sort_output,
1466
+ page_keys=cast(dict, page_keys),
1468
1467
  **kwargs,
1469
1468
  )
1470
1469
 
@@ -22,6 +22,16 @@ config = {
22
22
  "Parent_Cells",
23
23
  "Parent_Nuclei",
24
24
  ),
25
+ # pagination keys for use with this data
26
+ # of the rough format "table" -> "column".
27
+ # note: page keys are expected to be numeric (int, float)
28
+ "CONFIG_PAGE_KEYS": {
29
+ "image": "ImageNumber",
30
+ "cells": "ObjectNumber",
31
+ "nuclei": "ObjectNumber",
32
+ "cytoplasm": "ObjectNumber",
33
+ "join": "Cytoplasm_Number_Object_Number",
34
+ },
25
35
  # chunk size to use for join operations to help with possible performance issues
26
36
  # note: this number is an estimate and is may need changes contingent on data
27
37
  # and system used by this library.
@@ -61,6 +71,16 @@ config = {
61
71
  "Parent_Cells",
62
72
  "Parent_Nuclei",
63
73
  ),
74
+ # pagination keys for use with this data
75
+ # of the rough format "table" -> "column".
76
+ # note: page keys are expected to be numeric (int, float)
77
+ "CONFIG_PAGE_KEYS": {
78
+ "image": "ImageNumber",
79
+ "cells": "Cells_Number_Object_Number",
80
+ "nuclei": "Nuclei_Number_Object_Number",
81
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
82
+ "join": "Cytoplasm_Number_Object_Number",
83
+ },
64
84
  # chunk size to use for join operations to help with possible performance issues
65
85
  # note: this number is an estimate and is may need changes contingent on data
66
86
  # and system used by this library.
@@ -85,6 +105,64 @@ config = {
85
105
  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
86
106
  """,
87
107
  },
108
+ "cellprofiler_sqlite_cpg0016_jump": {
109
+ # version specifications using related references
110
+ "CONFIG_SOURCE_VERSION": {
111
+ "cellprofiler": "v4.0.0",
112
+ },
113
+ # names of source table compartments (for ex. cells.csv, etc.)
114
+ "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
115
+ # names of source table metadata (for ex. image.csv, etc.)
116
+ "CONFIG_NAMES_METADATA": ("image",),
117
+ # column names in any compartment or metadata tables which contain
118
+ # unique names to avoid renaming
119
+ "CONFIG_IDENTIFYING_COLUMNS": (
120
+ "ImageNumber",
121
+ "ObjectNumber",
122
+ "Metadata_Well",
123
+ "Metadata_Plate",
124
+ "Parent_Cells",
125
+ "Parent_Nuclei",
126
+ ),
127
+ # pagination keys for use with this data
128
+ # of the rough format "table" -> "column".
129
+ # note: page keys are expected to be numeric (int, float)
130
+ "CONFIG_PAGE_KEYS": {
131
+ "image": "ImageNumber",
132
+ "cells": "ObjectNumber",
133
+ "nuclei": "ObjectNumber",
134
+ "cytoplasm": "ObjectNumber",
135
+ "join": "Cytoplasm_Number_Object_Number",
136
+ },
137
+ # chunk size to use for join operations to help with possible performance issues
138
+ # note: this number is an estimate and is may need changes contingent on data
139
+ # and system used by this library.
140
+ "CONFIG_CHUNK_SIZE": 1000,
141
+ # compartment and metadata joins performed using DuckDB SQL
142
+ # and modified at runtime as needed
143
+ "CONFIG_JOINS": """
144
+ SELECT
145
+ image.Image_TableNumber,
146
+ image.Metadata_ImageNumber,
147
+ image.Metadata_Plate,
148
+ image.Metadata_Well,
149
+ image.Image_Metadata_Site,
150
+ image.Image_Metadata_Row,
151
+ cytoplasm.* EXCLUDE (Metadata_ImageNumber),
152
+ cells.* EXCLUDE (Metadata_ImageNumber),
153
+ nuclei.* EXCLUDE (Metadata_ImageNumber)
154
+ FROM
155
+ read_parquet('cytoplasm.parquet') AS cytoplasm
156
+ LEFT JOIN read_parquet('cells.parquet') AS cells ON
157
+ cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
158
+ AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
159
+ LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
160
+ nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
161
+ AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
162
+ LEFT JOIN read_parquet('image.parquet') AS image ON
163
+ image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
164
+ """,
165
+ },
88
166
  "cellprofiler_sqlite_pycytominer": {
89
167
  # version specifications using related references
90
168
  "CONFIG_SOURCE_VERSION": {
@@ -107,6 +185,16 @@ config = {
107
185
  "Cells_Number_Object_Number",
108
186
  "Nuclei_Number_Object_Number",
109
187
  ),
188
+ # pagination keys for use with this data
189
+ # of the rough format "table" -> "column".
190
+ # note: page keys are expected to be numeric (int, float)
191
+ "CONFIG_PAGE_KEYS": {
192
+ "image": "ImageNumber",
193
+ "cells": "Cells_Number_Object_Number",
194
+ "nuclei": "Nuclei_Number_Object_Number",
195
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
196
+ "join": "Cytoplasm_Number_Object_Number",
197
+ },
110
198
  # chunk size to use for join operations to help with possible performance issues
111
199
  # note: this number is an estimate and is may need changes contingent on data
112
200
  # and system used by this library.
@@ -155,6 +243,16 @@ config = {
155
243
  "Cells_ObjectNumber",
156
244
  "Nuclei_ObjectNumber",
157
245
  ),
246
+ # pagination keys for use with this data
247
+ # of the rough format "table" -> "column".
248
+ # note: page keys are expected to be numeric (int, float)
249
+ "CONFIG_PAGE_KEYS": {
250
+ "image": "ImageNumber",
251
+ "cells": "ObjectNumber",
252
+ "nuclei": "ObjectNumber",
253
+ "cytoplasm": "ObjectNumber",
254
+ "join": "Cytoplasm_Number_Object_Number",
255
+ },
158
256
  # chunk size to use for join operations to help with possible performance issues
159
257
  # note: this number is an estimate and is may need changes contingent on data
160
258
  # and system used by this library.
@@ -200,6 +298,12 @@ config = {
200
298
  "Z",
201
299
  "T",
202
300
  ),
301
+ # pagination keys for use with this data
302
+ # of the rough format "table" -> "column".
303
+ # note: page keys are expected to be numeric (int, float)
304
+ "CONFIG_PAGE_KEYS": {
305
+ "test": '"OBJECT ID"',
306
+ },
203
307
  # chunk size to use for join operations to help with possible performance issues
204
308
  # note: this number is an estimate and is may need changes contingent on data
205
309
  # and system used by this library.
@@ -7,13 +7,11 @@ import pathlib
7
7
  from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from cloudpathlib import AnyPath
10
- from parsl.app.app import join_app, python_app
11
10
 
11
+ from cytotable.exceptions import NoInputDataException
12
12
 
13
- @python_app
14
- def _build_path(
15
- path: Union[str, pathlib.Path, AnyPath], **kwargs
16
- ) -> Union[pathlib.Path, AnyPath]:
13
+
14
+ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
17
15
  """
18
16
  Build a path client or return local path.
19
17
 
@@ -43,10 +41,9 @@ def _build_path(
43
41
  return processed_path
44
42
 
45
43
 
46
- @python_app
47
44
  def _get_source_filepaths(
48
45
  path: Union[pathlib.Path, AnyPath],
49
- targets: List[str],
46
+ targets: Optional[List[str]] = None,
50
47
  source_datatype: Optional[str] = None,
51
48
  ) -> Dict[str, List[Dict[str, Any]]]:
52
49
  """
@@ -75,7 +72,7 @@ def _get_source_filepaths(
75
72
 
76
73
  if (targets is None or targets == []) and source_datatype is None:
77
74
  raise DatatypeException(
78
- f"A source_datatype must be specified when using undefined compartments and metadata names."
75
+ "A source_datatype must be specified when using undefined compartments and metadata names."
79
76
  )
80
77
 
81
78
  # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@ def _get_source_filepaths(
87
84
  for subpath in (
88
85
  (path,)
89
86
  # used if the source path is a single file
90
- if AnyPath(path).is_file()
87
+ if path.is_file()
91
88
  # iterates through a source directory
92
- else (x for x in AnyPath(path).glob("**/*") if AnyPath(x).is_file())
89
+ else (x for x in path.glob("**/*") if x.is_file())
93
90
  )
94
91
  # ensure the subpaths meet certain specifications
95
92
  if (
@@ -129,7 +126,8 @@ def _get_source_filepaths(
129
126
  .arrow()["table_name"]
130
127
  .to_pylist()
131
128
  # make sure the table names match with compartment + metadata names
132
- if any(target.lower() in table_name.lower() for target in targets)
129
+ if targets is not None
130
+ and any(target.lower() in table_name.lower() for target in targets)
133
131
  ]
134
132
  else:
135
133
  # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@ def _get_source_filepaths(
181
179
  return grouped_sources
182
180
 
183
181
 
184
- @python_app
185
182
  def _infer_source_datatype(
186
183
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
187
184
  ) -> str:
@@ -230,7 +227,6 @@ def _infer_source_datatype(
230
227
  return source_datatype
231
228
 
232
229
 
233
- @python_app
234
230
  def _filter_source_filepaths(
235
231
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
236
232
  ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
260
256
  if file["source_path"].stat().st_size > 0
261
257
  # ensure the datatype matches the source datatype
262
258
  and file["source_path"].suffix == f".{source_datatype}"
259
+ and _file_is_more_than_one_line(path=file["source_path"])
263
260
  ]
264
261
  for filegroup, files in sources.items()
265
262
  }
266
263
 
267
264
 
268
- @join_app
265
+ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
266
+ """
267
+ Check if the file has more than one line.
268
+
269
+ Args:
270
+ path (Union[pathlib.Path, AnyPath]):
271
+ The path to the file.
272
+
273
+ Returns:
274
+ bool:
275
+ True if the file has more than one line, False otherwise.
276
+
277
+ Raises:
278
+ NoInputDataException: If the file has zero lines.
279
+ """
280
+
281
+ # if we don't have a sqlite file
282
+ # (we can't check sqlite files for lines)
283
+ if path.suffix.lower() != ".sqlite":
284
+ with path.open("r") as f:
285
+ try:
286
+ # read two lines, if the second is empty return false
287
+ return bool(f.readline() and f.readline())
288
+
289
+ except StopIteration:
290
+ # If we encounter the end of the file, it has only one line
291
+ raise NoInputDataException(
292
+ f"Data file has 0 rows of values. Error in file: {path}"
293
+ )
294
+ else:
295
+ return True
296
+
297
+
269
298
  def _gather_sources(
270
299
  source_path: str,
271
300
  source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
295
324
  _infer_source_datatype,
296
325
  )
297
326
 
298
- source_path = _build_path(path=source_path, **kwargs)
327
+ built_path = _build_path(path=source_path, **kwargs)
299
328
 
300
329
  # gather filepaths which will be used as the basis for this work
301
330
  sources = _get_source_filepaths(
302
- path=source_path, targets=targets, source_datatype=source_datatype
331
+ path=built_path, targets=targets, source_datatype=source_datatype
303
332
  )
304
333
 
305
334
  # infer or validate the source datatype based on source filepaths
@@ -5,7 +5,7 @@ Utility functions for CytoTable
5
5
  import logging
6
6
  import os
7
7
  import pathlib
8
- from typing import Any, Dict, List, Optional, Union, cast
8
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
9
9
 
10
10
  import duckdb
11
11
  import parsl
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
149
149
  INSTALL sqlite_scanner;
150
150
  LOAD sqlite_scanner;
151
151
 
152
+ /* Install httpfs plugin to avoid error
153
+ https://github.com/duckdb/duckdb/issues/3243 */
154
+ INSTALL httpfs;
155
+
152
156
  /*
153
157
  Set threads available to duckdb
154
158
  See the following for more information:
@@ -169,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
169
173
  def _sqlite_mixed_type_query_to_parquet(
170
174
  source_path: str,
171
175
  table_name: str,
172
- chunk_size: int,
173
- offset: int,
176
+ page_key: str,
177
+ pageset: Tuple[Union[int, float], Union[int, float]],
174
178
  sort_output: bool,
175
- add_cytotable_meta: bool = False,
176
179
  ) -> str:
177
180
  """
178
181
  Performs SQLite table data extraction where one or many
@@ -184,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
184
187
  A str which is a path to a SQLite database file.
185
188
  table_name: str:
186
189
  The name of the table being queried.
187
- chunk_size: int:
188
- Row count to use for chunked output.
189
- offset: int:
190
- The offset for chunking the data from source.
190
+ page_key: str:
191
+ The column name to be used to identify pagination chunks.
192
+ pageset: Tuple[int, int]:
193
+ The range for values used for paginating data from source.
191
194
  sort_output: bool
192
195
  Specifies whether to sort cytotable output or not.
193
196
  add_cytotable_meta: bool, default=False:
@@ -201,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(
201
204
 
202
205
  import pyarrow as pa
203
206
 
204
- from cytotable.constants import (
205
- CYOTABLE_META_COLUMN_TYPES,
206
- SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
207
- )
207
+ from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
208
208
  from cytotable.exceptions import DatatypeException
209
209
 
210
210
  # open sqlite3 connection
@@ -264,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
264
264
  for col in column_info
265
265
  ]
266
266
 
267
- if add_cytotable_meta:
268
- query_parts += [
269
- (
270
- f"CAST( '{f'{source_path}_table_{table_name}'}' "
271
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
272
- "AS cytotable_meta_source_path"
273
- ),
274
- (
275
- f"CAST( {offset} "
276
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
277
- "AS cytotable_meta_offset"
278
- ),
279
- (
280
- f"CAST( (ROW_NUMBER() OVER ()) AS "
281
- f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
282
- "AS cytotable_meta_rownum"
283
- ),
284
- ]
285
-
286
267
  # perform the select using the cases built above and using chunksize + offset
287
- sql_stmt = (
288
- f"""
289
- SELECT
290
- {', '.join(query_parts)}
291
- FROM {table_name}
292
- ORDER BY {', '.join([col['column_name'] for col in column_info])}
293
- LIMIT {chunk_size} OFFSET {offset};
294
- """
295
- if sort_output
296
- else f"""
268
+ sql_stmt = f"""
297
269
  SELECT
298
270
  {', '.join(query_parts)}
299
271
  FROM {table_name}
300
- LIMIT {chunk_size} OFFSET {offset};
272
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
273
+ {"ORDER BY " + page_key if sort_output else ""};
301
274
  """
302
- )
303
275
 
304
276
  # execute the sql stmt
305
277
  cursor.execute(sql_stmt)
@@ -322,7 +294,7 @@ def _sqlite_mixed_type_query_to_parquet(
322
294
  return pa.Table.from_pylist(results)
323
295
 
324
296
 
325
- def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
297
+ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
326
298
  """
327
299
  Takes a cloudpath and uses cache to convert to a local copy
328
300
  for use in scenarios where remote work is not possible (sqlite).
@@ -337,24 +309,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
337
309
  A local pathlib.Path to cached version of cloudpath file.
338
310
  """
339
311
 
340
- candidate_path = AnyPath(path)
341
-
342
312
  # check that the path is a file (caching won't work with a dir)
343
313
  # and check that the file is of sqlite type
344
314
  # (other file types will be handled remotely in cloud)
345
- if candidate_path.is_file() and candidate_path.suffix.lower() == ".sqlite":
315
+ if (
316
+ isinstance(path, CloudPath)
317
+ and path.is_file()
318
+ and path.suffix.lower() == ".sqlite"
319
+ ):
346
320
  try:
347
321
  # update the path to be the local filepath for reference in CytoTable ops
348
322
  # note: incurs a data read which will trigger caching of the file
349
- path = CloudPath(path).fspath
323
+ path = pathlib.Path(path.fspath)
350
324
  except InvalidPrefixError:
351
325
  # share information about not finding a cloud path
352
326
  logger.info(
353
327
  "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
354
328
  )
355
329
 
356
- # cast the result as a pathlib.Path
357
- return pathlib.Path(path)
330
+ return path
358
331
 
359
332
 
360
333
  def _arrow_type_cast_if_specified(
@@ -595,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> An
595
568
  if isinstance(sources, dict)
596
569
  else _unwrap_value(sources)
597
570
  )
571
+
572
+
573
+ def _generate_pagesets(
574
+ keys: List[Union[int, float]], chunk_size: int
575
+ ) -> List[Tuple[Union[int, float], Union[int, float]]]:
576
+ """
577
+ Generate a pageset (keyset pagination) from a list of keys.
578
+
579
+ Parameters:
580
+ keys List[Union[int, float]]:
581
+ List of keys to paginate.
582
+ chunk_size int:
583
+ Size of each chunk/page.
584
+
585
+ Returns:
586
+ List[Tuple[Union[int, float], Union[int, float]]]:
587
+ List of (start_key, end_key) tuples representing each page.
588
+ """
589
+
590
+ # Initialize an empty list to store the chunks/pages
591
+ chunks = []
592
+
593
+ # Start index for iteration through the keys
594
+ i = 0
595
+
596
+ while i < len(keys):
597
+ # Get the start key for the current chunk
598
+ start_key = keys[i]
599
+
600
+ # Calculate the end index for the current chunk
601
+ end_index = min(i + chunk_size, len(keys)) - 1
602
+
603
+ # Get the end key for the current chunk
604
+ end_key = keys[end_index]
605
+
606
+ # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
607
+ while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
608
+ end_index += 1
609
+
610
+ # Append the current chunk (start_key, end_key) to the list of chunks
611
+ chunks.append((start_key, end_key))
612
+
613
+ # Update the index to start from the next chunk
614
+ i = end_index + 1
615
+
616
+ # Return the list of chunks/pages
617
+ return chunks
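A quick illustration of the page boundaries this helper produces, using hypothetical key values:

```python
# Hypothetical, already-sorted key values; duplicate keys at a chunk
# boundary are absorbed into the same page so ranges never overlap.
from cytotable.utils import _generate_pagesets

keys = [1, 1, 2, 3, 4, 4, 4, 5]
print(_generate_pagesets(keys, chunk_size=3))
# [(1, 2), (3, 4), (5, 5)]
```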
618
+
619
+
620
+ def _natural_sort(list_to_sort):
621
+ """
622
+ Sorts the given iterable using natural sort adapted from approach
623
+ provided by the following link:
624
+ https://stackoverflow.com/a/4836734
625
+
626
+ Args:
627
+ list_to_sort: List:
628
+ The list to sort.
629
+
630
+ Returns:
631
+ List: The sorted list.
632
+ """
633
+ import re
634
+
635
+ return sorted(
636
+ list_to_sort,
637
+ # use a custom key to sort the list
638
+ key=lambda key: [
639
+ # use integer of c if it's a digit, otherwise str
640
+ int(c) if c.isdigit() else c
641
+ # Split the key into parts, separating numbers from alphabetic characters
642
+ for c in re.split("([0-9]+)", str(key))
643
+ ],
644
+ )
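And a short example of why natural sorting matters for the pageset-named parquet chunks concatenated earlier; the filenames are hypothetical:

```python
# Hypothetical chunk filenames; a plain lexicographic sort would place
# '10' before '2', while natural sort preserves numeric page order.
from cytotable.utils import _natural_sort

files = ["cells-10-19.parquet", "cells-2-9.parquet", "cells-100-109.parquet"]
print(_natural_sort(files))
# ['cells-2-9.parquet', 'cells-10-19.parquet', 'cells-100-109.parquet']
```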
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "CytoTable"
3
3
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
4
- version = "0.0.8"
4
+ version = "0.0.10"
5
5
  description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
6
6
  authors = ["Cytomining Community"]
7
7
  license = "BSD-3-Clause License"
@@ -25,7 +25,7 @@ build-backend = "poetry_dynamic_versioning.backend"
25
25
  [tool.poetry.dependencies]
26
26
  python = ">=3.8,<3.13"
27
27
  pyarrow = ">=13.0.0"
28
- cloudpathlib = {extras = ["all"], version = "^0.18.0"}
28
+ cloudpathlib = {extras = ["all", "s3"], version = "^0.18.0"}
29
29
  duckdb = ">=0.8.0,!=0.10.0,>=0.10.1"
30
30
  parsl = ">=2023.9.25"
31
31
  numpy = [
@@ -43,10 +43,10 @@ pytest-cov = "^4.1.0"
43
43
  Sphinx = "^6.0.0"
44
44
  myst-parser = "^2.0.0"
45
45
  sphinxcontrib-mermaid = "^0.9.0"
46
- moto = {extras = ["s3", "server"], version = "^4.0.0"}
47
46
  cytominer-database = "^0.3.4"
48
47
  pycytominer = "^1.1.0"
49
48
  dunamai = "^1.19.0"
49
+ botocore = "^1.34.133" # added to help avoid dependency reolution issues
50
50
 
51
51
  [tool.vulture]
52
52
  min_confidence = 80
@@ -69,3 +69,6 @@ filterwarnings = [
69
69
  "ignore::DeprecationWarning:pkg_resources",
70
70
  "ignore::DeprecationWarning:cytominer_database",
71
71
  ]
72
+ markers = [
73
+ "large_data_tests: tests which involve the use of large data.",
74
+ ]