CytoTable 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """

  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.9"
+ __version__ = "0.0.10"

  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
      ],
  }

- # metadata column names and types for internal use within CytoTable
- CYOTABLE_META_COLUMN_TYPES = {
-     "cytotable_meta_source_path": "VARCHAR",
-     "cytotable_meta_offset": "BIGINT",
-     "cytotable_meta_rownum": "BIGINT",
- }
-
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
      "data-producer": "https://github.com/cytomining/CytoTable",
      "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pycytominer.

  import itertools
  import logging
- import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast

  import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
 
      Args:
          source: Dict[str, Any]
-             Contains the source data to be chunked. Represents a single
+             Contains source data details. Represents a single
              file or table of some kind.
          sort_output:
              Specifies whether to sort cytotable output or not.
@@ -43,10 +42,7 @@ def _get_table_columns_and_types(
              list of dictionaries which each include column level information
      """

-     import pathlib
-
      import duckdb
-     from cloudpathlib import AnyPath

      from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet

@@ -89,7 +85,7 @@ def _get_table_columns_and_types(
      # with exception handling to read mixed-type data
      # using sqlite3 and special utility function
      try:
-         # isolate using new connection to read data with chunk size + offset
+         # isolate using new connection to read data based on pageset
          # and export directly to parquet via duckdb (avoiding need to return data to python)
          # perform the query and create a list of dictionaries with the column data for table
          with _duckdb_reader() as ddb_reader:
@@ -109,13 +105,8 @@ def _get_table_columns_and_types(
          arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
              source_path=str(source["source_path"]),
              table_name=str(source["table_name"]),
-             # chunk size is set to 5 as a limit similar
-             # to above SQL within select_query variable
-             chunk_size=5,
-             # offset is set to 0 start at first row
-             # result from table
-             offset=0,
-             add_cytotable_meta=False,
+             page_key=source["page_key"],
+             pageset=source["pagesets"][0],
              sort_output=sort_output,
          )
          with _duckdb_reader() as ddb_reader:
@@ -183,13 +174,14 @@ def _prep_cast_column_data_types(


  @python_app
- def _get_table_chunk_offsets(
+ def _get_table_keyset_pagination_sets(
      chunk_size: int,
+     page_key: str,
      source: Optional[Dict[str, Any]] = None,
      sql_stmt: Optional[str] = None,
- ) -> Union[List[int], None]:
+ ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
      """
-     Get table data chunk offsets for later use in capturing segments
+     Get table data chunk keys for later use in capturing segments
      of values. This work also provides a chance to catch problematic
      input data which will be ignored with warnings.

@@ -199,21 +191,27 @@ def _get_table_chunk_offsets(
              file or table of some kind.
          chunk_size: int
              The size in rowcount of the chunks to create.
+         page_key: str
+             The column name to be used to identify pagination chunks.
+             Expected to be of numeric type (int, float) for ordering.
+         sql_stmt:
+             Optional sql statement to form the pagination set from.
+             Default behavior extracts pagination sets from the full
+             data source.

      Returns:
-         List[int]
-             List of integers which represent offsets to use for reading
-             the data later on.
+         List[Any]
+             List of keys to use for reading the data later on.
      """

      import logging
-     import pathlib
+     import sqlite3
+     from contextlib import closing

      import duckdb
-     from cloudpathlib import AnyPath, CloudPath

      from cytotable.exceptions import NoInputDataException
-     from cytotable.utils import _duckdb_reader
+     from cytotable.utils import _duckdb_reader, _generate_pagesets

      logger = logging.getLogger(__name__)

@@ -223,18 +221,29 @@ def _get_table_chunk_offsets(
          source_type = str(source_path.suffix).lower()

          try:
-             # gather the total rowcount from csv or sqlite data input sources
              with _duckdb_reader() as ddb_reader:
-                 rowcount = int(
-                     ddb_reader.execute(
-                         # nosec
-                         f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
-                         if source_type == ".csv"
-                         else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
-                     ).fetchone()[0]
-                 )
+                 if source_type == ".csv":
+                     sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+                 else:
+                     sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+                 page_keys = [
+                     results[0] for results in ddb_reader.execute(sql_query).fetchall()
+                 ]
+
+         # exception case for when we have mixed types
+         # (i.e. integer col with string and ints) in a sqlite column
+         except duckdb.TypeMismatchException:
+             with closing(sqlite3.connect(source_path)) as cx:
+                 with cx:
+                     page_keys = [
+                         key[0]
+                         for key in cx.execute(
+                             f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+                         ).fetchall()
+                         if isinstance(key[0], (int, float))
+                     ]

-         # catch input errors which will result in skipped files
          except (
              duckdb.InvalidInputException,
              NoInputDataException,
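
A note on the new fallback above: DuckDB's sqlite_scan raises a TypeMismatchException when a SQLite column mixes storage classes (for example, integers alongside strings), so the page keys are re-read through the sqlite3 module and non-numeric values are skipped. A minimal, self-contained sketch of that filtering, using a hypothetical in-memory table and column names:

    import sqlite3
    from contextlib import closing

    # build a table whose key column mixes ints with a stray string,
    # mimicking the mixed-type data this fallback tolerates
    with closing(sqlite3.connect(":memory:")) as cx:
        cx.execute("CREATE TABLE example (TableNumber, value)")
        cx.executemany(
            "INSERT INTO example VALUES (?, ?)",
            [(1, "a"), (2, "b"), ("oops", "c"), (3, "d")],
        )
        # keep only keys usable for numeric keyset pagination
        page_keys = [
            key[0]
            for key in cx.execute(
                "SELECT TableNumber FROM example ORDER BY TableNumber;"
            ).fetchall()
            if isinstance(key[0], (int, float))
        ]

    print(page_keys)  # [1, 2, 3] (the string key is skipped)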
@@ -245,34 +254,20 @@ def _get_table_chunk_offsets(

              return None

-     # find chunk offsets from sql statement
      elif sql_stmt is not None:
-         # gather the total rowcount from csv or sqlite data input sources
          with _duckdb_reader() as ddb_reader:
-             rowcount = int(
-                 ddb_reader.execute(
-                     # nosec
-                     f"SELECT COUNT(*) FROM ({sql_stmt})"
-                 ).fetchone()[0]
-             )
+             sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+             page_keys = ddb_reader.execute(sql_query).fetchall()
+             page_keys = [key[0] for key in page_keys]

-     return list(
-         range(
-             0,
-             # gather rowcount from table and use as maximum for range
-             rowcount,
-             # step through using chunk size
-             chunk_size,
-         )
-     )
+     return _generate_pagesets(page_keys, chunk_size)


  @python_app
- def _source_chunk_to_parquet(
+ def _source_pageset_to_parquet(
      source_group_name: str,
      source: Dict[str, Any],
-     chunk_size: int,
-     offset: int,
+     pageset: Tuple[Union[int, float], Union[int, float]],
      dest_path: str,
      sort_output: bool,
  ) -> str:
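
The return value changes shape here: instead of integer offsets for LIMIT/OFFSET queries, the task now returns inclusive (start_key, end_key) ranges over a real column. A rough illustration of the difference, simplified to ignore the duplicate-key handling that `_generate_pagesets` adds (see utils.py below):

    # 0.0.9 style: offsets stepping through 10 rows, chunk_size=4
    offsets = list(range(0, 10, 4))
    print(offsets)  # [0, 4, 8]

    # 0.0.10 style: inclusive key ranges, assuming keys are simply 1..10
    keys = list(range(1, 11))
    pagesets = [
        (keys[i], keys[min(i + 4, len(keys)) - 1])
        for i in range(0, len(keys), 4)
    ]
    print(pagesets)  # [(1, 4), (5, 8), (9, 10)]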
@@ -285,10 +280,8 @@ def _source_chunk_to_parquet(
          source: Dict[str, Any]
              Contains the source data to be chunked. Represents a single
              file or table of some kind along with collected information about table.
-         chunk_size: int
-             Row count to use for chunked output.
-         offset: int
-             The offset for chunking the data from source.
+         pageset: Tuple[int, int]
+             The pageset for chunking the data from source.
          dest_path: str
              Path to store the output data.
          sort_output: bool
@@ -303,9 +296,7 @@ def _source_chunk_to_parquet(

      import duckdb
      from cloudpathlib import AnyPath
-     from pyarrow import parquet

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import (
          _duckdb_reader,
          _sqlite_mixed_type_query_to_parquet,
@@ -319,26 +310,6 @@ def _source_chunk_to_parquet(
      )
      pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

-     source_path_str = (
-         source["source_path"]
-         if "table_name" not in source.keys()
-         else f"{source['source_path']}_table_{source['table_name']}"
-     )
-     # build the column selection block of query
-
-     # add cytotable metadata columns
-     cytotable_metadata_cols = [
-         (
-             f"CAST( '{source_path_str}' "
-             f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
-             ' AS "cytotable_meta_source_path"'
-         ),
-         f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
-         (
-             f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
-             ' AS "cytotable_meta_rownum"'
-         ),
-     ]
      # add source table columns
      casted_source_cols = [
          # here we cast the column to the specified type to ensure the colname remains the same
@@ -349,7 +320,7 @@ def _source_chunk_to_parquet(
      # create selection statement from lists above
      select_columns = ",".join(
          # if we should sort the output, add the metadata_cols
-         cytotable_metadata_cols + casted_source_cols
+         casted_source_cols
          if sort_output
          else casted_source_cols
      )
@@ -364,7 +335,8 @@ def _source_chunk_to_parquet(
          base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
          result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"

-     result_filepath = f"{result_filepath_base}-{offset}.parquet"
+     # form a filepath which indicates the pageset
+     result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"

      # Attempt to read the data to parquet file
      # using duckdb for extraction and pyarrow for
@@ -377,14 +349,9 @@ def _source_chunk_to_parquet(
                  table=ddb_reader.execute(
                      f"""
                      {base_query}
-                     /* order by all columns for deterministic output */
-                     ORDER BY ALL
-                     LIMIT {chunk_size} OFFSET {offset}
-                     """
-                     if sort_output
-                     else f"""
-                     {base_query}
-                     LIMIT {chunk_size} OFFSET {offset}
+                     WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+                     /* optional ordering per pageset */
+                     {"ORDER BY " + source['page_key'] if sort_output else ""};
                      """
                  ).arrow(),
                  where=result_filepath,
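
Each page is now extracted with an inclusive BETWEEN predicate on the page key rather than LIMIT/OFFSET, so DuckDB only materializes rows within that key range. A small runnable sketch of the same pattern (table contents and names are illustrative, not from the package):

    import duckdb

    page_key = "ImageNumber"
    pageset = (3, 4)  # inclusive (start_key, end_key)

    ddb = duckdb.connect()
    ddb.execute(
        "CREATE TABLE image AS "
        "SELECT * FROM (VALUES (1), (2), (3), (4), (5)) t(ImageNumber)"
    )
    tbl = ddb.execute(
        f"""
        SELECT ImageNumber FROM image
        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
        ORDER BY {page_key};
        """
    ).arrow()
    print(tbl["ImageNumber"].to_pylist())  # [3, 4]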
@@ -406,9 +373,8 @@ def _source_chunk_to_parquet(
                  table=_sqlite_mixed_type_query_to_parquet(
                      source_path=str(source["source_path"]),
                      table_name=str(source["table_name"]),
-                     chunk_size=chunk_size,
-                     offset=offset,
-                     add_cytotable_meta=True if sort_output else False,
+                     page_key=source["page_key"],
+                     pageset=pageset,
                      sort_output=sort_output,
                  ),
                  where=result_filepath,
@@ -458,10 +424,7 @@ def _prepend_column_name(

      import pyarrow.parquet as parquet

-     from cytotable.constants import (
-         CYOTABLE_META_COLUMN_TYPES,
-         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-     )
+     from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
      from cytotable.utils import _write_parquet_table_with_metadata

      logger = logging.getLogger(__name__)
@@ -472,7 +435,7 @@ def _prepend_column_name(
      if len(targets) == 0:
          logger.warning(
              msg=(
-                 "Skipping column name prepend operations"
+                 "Skipping column name prepend operations "
                  "because no compartments or metadata were provided."
              )
          )
@@ -509,10 +472,8 @@ def _prepend_column_name(
          # source_group_name_stem: 'Cells'
          # column_name: 'AreaShape_Area'
          # updated_column_name: 'Cells_AreaShape_Area'
-         if (
-             column_name not in identifying_columns
-             and not column_name.startswith(source_group_name_stem.capitalize())
-             and column_name not in CYOTABLE_META_COLUMN_TYPES
+         if column_name not in identifying_columns and not column_name.startswith(
+             source_group_name_stem.capitalize()
          ):
              updated_column_names.append(f"{source_group_name_stem}_{column_name}")
          # if-condition for prepending 'Metadata_' to column name
@@ -574,6 +535,7 @@ def _concat_source_group(
      source_group: List[Dict[str, Any]],
      dest_path: str,
      common_schema: Optional[List[Tuple[str, str]]] = None,
+     sort_output: bool = True,
  ) -> List[Dict[str, Any]]:
      """
      Concatenate group of source data together as single file.
@@ -620,6 +582,8 @@ def _concat_source_group(
          common_schema: List[Tuple[str, str]] (Default value = None)
              Common schema to use for concatenation amongst arrow tables
              which may have slightly different but compatible schema.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.

      Returns:
          List[Dict[str, Any]]
@@ -637,7 +601,7 @@ def _concat_source_group(
          CYTOTABLE_DEFAULT_PARQUET_METADATA,
      )
      from cytotable.exceptions import SchemaException
-     from cytotable.utils import _write_parquet_table_with_metadata
+     from cytotable.utils import _natural_sort

      # build a result placeholder
      concatted: List[Dict[str, Any]] = [
@@ -676,7 +640,10 @@ def _concat_source_group(
      # (all must be the same schema)
      with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
          for source in source_group:
-             for table in [table for table in source["table"]]:
+             tables = [table for table in source["table"]]
+             if sort_output:
+                 tables = _natural_sort(tables)
+             for table in tables:
                  # if we haven't inferred the common schema
                  # check that our file matches the expected schema, otherwise raise an error
                  if common_schema is None and not writer_schema.equals(
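
Sorting the chunk files with natural sort matters because the new pageset filenames embed numeric key ranges: plain lexicographic ordering would place "cells-10-19.parquet" before "cells-2-9.parquet". A sketch mirroring the `_natural_sort` helper added in utils.py (filenames hypothetical):

    import re

    def natural_sort(items):
        # split names into digit and non-digit runs so numeric parts
        # compare as integers rather than strings
        return sorted(
            items,
            key=lambda item: [
                int(part) if part.isdigit() else part
                for part in re.split("([0-9]+)", str(item))
            ],
        )

    files = ["cells-10-19.parquet", "cells-2-9.parquet", "cells-1-1.parquet"]
    print(sorted(files))
    # ['cells-1-1.parquet', 'cells-10-19.parquet', 'cells-2-9.parquet']
    print(natural_sort(files))
    # ['cells-1-1.parquet', 'cells-2-9.parquet', 'cells-10-19.parquet']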
@@ -720,7 +687,6 @@ def _concat_source_group(
  def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
      joins: str,
-     sort_output: bool,
  ) -> str:
      """
      Prepare join SQL statement with actual locations of data based on the sources.
@@ -741,8 +707,6 @@ def _prepare_join_sql(
      """
      import pathlib

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
      # replace with real location of sources for join sql
      order_by_tables = []
      for key, val in sources.items():
@@ -754,25 +718,17 @@ def _prepare_join_sql(
              )
              order_by_tables.append(table_name)

-     # create order by statement with from all tables using cytotable metadata
-     order_by_sql = "ORDER BY " + ", ".join(
-         [
-             f"{table}.{meta_column}"
-             for table in order_by_tables
-             for meta_column in CYOTABLE_META_COLUMN_TYPES
-         ]
-     )
-
      # add the order by statements to the join
-     return joins + order_by_sql if sort_output else joins
+     return joins


  @python_app
- def _join_source_chunk(
+ def _join_source_pageset(
      dest_path: str,
      joins: str,
-     chunk_size: int,
-     offset: int,
+     page_key: str,
+     pageset: Tuple[int, int],
+     sort_output: bool,
      drop_null: bool,
  ) -> str:
      """
@@ -798,31 +754,20 @@ def _join_source_chunk(

      import pathlib

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

-     # Attempt to read the data to parquet file
-     # using duckdb for extraction and pyarrow for
-     # writing data to a parquet file.
-     # read data with chunk size + offset
-     # and export to parquet
-     exclude_meta_cols = [
-         f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
-     ]
-
      with _duckdb_reader() as ddb_reader:
          result = ddb_reader.execute(
              f"""
-             WITH joined AS (
+             WITH joined AS (
                  {joins}
-                 LIMIT {chunk_size} OFFSET {offset}
-             )
-             SELECT
-                 /* exclude metadata columns from the results
-                 by using a lambda on column names based on exclude_meta_cols. */
-                 COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
-             FROM joined;
-             """
+             )
+             SELECT *
+             FROM joined
+             WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+             /* optional sorting per pageset */
+             {"ORDER BY " + page_key if sort_output else ""};
+             """
          ).arrow()

      # drop nulls if specified
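
With the metadata-column machinery gone, the join task now slices the joined result purely by the "join" page key. Rendering the statement above with hypothetical inputs shows the final SQL each task executes:

    joins = "SELECT * FROM cytoplasm LEFT JOIN cells USING (ImageNumber)"
    page_key = "Cytoplasm_Number_Object_Number"
    pageset = (1, 1000)
    sort_output = True

    print(
        f"""
        WITH joined AS (
            {joins}
        )
        SELECT *
        FROM joined
        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
        {"ORDER BY " + page_key if sort_output else ""};
        """
    )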
@@ -847,10 +792,8 @@ def _join_source_chunk(
          f"{str(pathlib.Path(dest_path).parent)}/"
          # use the dest_path stem in the name
          f"{str(pathlib.Path(dest_path).stem)}-"
-         # give the join chunk result a unique to arbitrarily
-         # differentiate from other chunk groups which are mapped
-         # and before they are brought together as one dataset
-         f"{str(uuid.uuid4().hex)}.parquet"
+         # add the pageset indication to the filename
+         f"{pageset[0]}-{pageset[1]}.parquet"
      )

      # write the result
@@ -867,6 +810,7 @@ def _concat_join_sources(
      sources: Dict[str, List[Dict[str, Any]]],
      dest_path: str,
      join_sources: List[str],
+     sort_output: bool = True,
  ) -> str:
      """
      Concatenate join sources from parquet-based chunks.
@@ -883,6 +827,8 @@ def _concat_join_sources(
          join_sources: List[str]:
              List of local filepath destination for join source chunks
              which will be concatenated.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.

      Returns:
          str
@@ -898,7 +844,7 @@ def _concat_join_sources(
          CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
          CYTOTABLE_DEFAULT_PARQUET_METADATA,
      )
-     from cytotable.utils import _write_parquet_table_with_metadata
+     from cytotable.utils import _natural_sort

      # remove the unjoined concatted compartments to prepare final dest_path usage
      # (we now have joined results)
@@ -918,7 +864,11 @@ def _concat_join_sources(
          CYTOTABLE_DEFAULT_PARQUET_METADATA
      )
      with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
-         for table_path in join_sources:
+         for table_path in (
+             join_sources
+             if not sort_output
+             else _natural_sort(list_to_sort=join_sources)
+         ):
              writer.write_table(
                  parquet.read_table(
                      table_path,
@@ -1042,6 +992,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      infer_common_schema: bool,
      drop_null: bool,
      sort_output: bool,
+     page_keys: Dict[str, str],
      data_type_cast_map: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1082,6 +1033,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              Whether to drop null results.
          sort_output: bool
              Specifies whether to sort cytotable output or not.
+         page_keys: Dict[str, str]
+             A dictionary which defines which column names are used for keyset pagination
+             in order to perform data extraction.
          data_type_cast_map: Dict[str, str]
              A dictionary mapping data type groups to specific types.
              Roughly includes Arrow data types language from:
@@ -1112,16 +1066,35 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      # expand the destination path
      expanded_dest_path = _expand_path(path=dest_path)

-     # prepare offsets for chunked data export from source tables
-     offsets_prepared = {
+     # check that each source group name has a pagination key
+     for source_group_name in sources.keys():
+         matching_keys = [
+             key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+         ]
+         if not matching_keys:
+             raise CytoTableException(
+                 f"No matching key found in page_keys for source_group_name: {source_group_name}. "
+                 "Please include a pagination key based on a column name from the table."
+             )
+
+     # prepare pagesets for chunked data export from source tables
+     pagesets_prepared = {
          source_group_name: [
              dict(
                  source,
                  **{
-                     "offsets": _get_table_chunk_offsets(
+                     "page_key": (
+                         page_key := [
+                             value
+                             for key, value in page_keys.items()
+                             if key.lower() in source_group_name.lower()
+                         ][0]
+                     ),
+                     "pagesets": _get_table_keyset_pagination_sets(
                          source=source,
                          chunk_size=chunk_size,
-                     )
+                         page_key=page_key,
+                     ),
                  },
              )
              for source in source_group_vals
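
The page key for each source group is resolved by a case-insensitive substring match between the page_keys entries and the group name, so one "cytoplasm" entry covers names like "Per_Cytoplasm.sqlite". A small demonstration of the same lookup (names hypothetical):

    page_keys = {
        "image": "ImageNumber",
        "cells": "ObjectNumber",
        "cytoplasm": "ObjectNumber",
        "join": "Cytoplasm_Number_Object_Number",
    }
    source_group_name = "Per_Cytoplasm.sqlite"

    page_key = [
        value
        for key, value in page_keys.items()
        if key.lower() in source_group_name.lower()
    ][0]
    print(page_key)  # ObjectNumber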
@@ -1129,17 +1102,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
          for source_group_name, source_group_vals in sources.items()
      }

-     # if offsets is none and we haven't halted, remove the file as there
+     # if pagesets is none and we haven't halted, remove the file as there
      # were input formatting errors which will create challenges downstream
      invalid_files_dropped = {
          source_group_name: [
-             # ensure we have offsets
+             # ensure we have pagesets
              source
              for source in source_group_vals
-             if source["offsets"] is not None
+             if source["pagesets"] is not None
          ]
          for source_group_name, source_group_vals in evaluate_futures(
-             offsets_prepared
+             pagesets_prepared
          ).items()
          # ensure we have source_groups with at least one source table
          if len(source_group_vals) > 0
@@ -1172,12 +1145,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                  "table": [
                      # perform column renaming and create potential return result
                      _prepend_column_name(
-                         # perform chunked data export to parquet using offsets
-                         table_path=_source_chunk_to_parquet(
+                         # perform chunked data export to parquet using pagesets
+                         table_path=_source_pageset_to_parquet(
                              source_group_name=source_group_name,
                              source=source,
-                             chunk_size=chunk_size,
-                             offset=offset,
+                             pageset=pageset,
                              dest_path=expanded_dest_path,
                              sort_output=sort_output,
                          ),
@@ -1186,7 +1158,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                          metadata=metadata,
                          compartments=compartments,
                      )
-                     for offset in source["offsets"]
+                     for pageset in source["pagesets"]
                  ]
              },
          )
@@ -1227,6 +1199,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              source_group=source_group_vals[0]["sources"],
              dest_path=expanded_dest_path,
              common_schema=source_group_vals[0]["common_schema"],
+             sort_output=sort_output,
          )
          for source_group_name, source_group_vals in evaluate_futures(
              common_schema_determined
@@ -1240,28 +1213,34 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
          evaluated_results = evaluate_futures(results)

          prepared_joins_sql = _prepare_join_sql(
-             sources=evaluated_results, joins=joins, sort_output=sort_output
+             sources=evaluated_results, joins=joins
          ).result()

+         page_key_join = [
+             value for key, value in page_keys.items() if key.lower() == "join"
+         ][0]
+
          # map joined results based on the join groups gathered above
          # note: after mapping we end up with a list of strings (task returns str)
          join_sources_result = [
-             _join_source_chunk(
+             _join_source_pageset(
                  # gather the result of concatted sources prior to
                  # join group merging as each mapped task run will need
                  # full concat results
                  dest_path=expanded_dest_path,
                  joins=prepared_joins_sql,
-                 chunk_size=chunk_size,
-                 offset=offset,
+                 page_key=page_key_join,
+                 pageset=pageset,
+                 sort_output=sort_output,
                  drop_null=drop_null,
              )
              # create join group for querying the concatenated
              # data in order to perform memory-safe joining
              # per user chunk size specification.
-             for offset in _get_table_chunk_offsets(
+             for pageset in _get_table_keyset_pagination_sets(
                  sql_stmt=prepared_joins_sql,
                  chunk_size=chunk_size,
+                 page_key=page_key_join,
              ).result()
          ]

@@ -1272,6 +1251,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              dest_path=expanded_dest_path,
              join_sources=[join.result() for join in join_sources_result],
              sources=evaluated_results,
+             sort_output=sort_output,
          )

      # wrap the final result as a future and return
@@ -1293,6 +1273,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
      infer_common_schema: bool = True,
      drop_null: bool = False,
      data_type_cast_map: Optional[Dict[str, str]] = None,
+     page_keys: Optional[Dict[str, str]] = None,
      sort_output: bool = True,
      preset: Optional[str] = "cellprofiler_csv",
      parsl_config: Optional[parsl.Config] = None,
@@ -1341,6 +1322,12 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
          A dictionary mapping data type groups to specific types.
          Roughly includes Arrow data types language from:
          https://arrow.apache.org/docs/python/api/datatypes.html
+     page_keys: Dict[str, str]:
+         The table and column names to be used for key pagination.
+         Uses the form: {"table_name":"column_name"}.
+         Expects columns to include numeric data (ints or floats).
+         Interacts with the `chunk_size` parameter to form
+         pages of up to `chunk_size` rows.
      sort_output: bool (Default value = True)
          Specifies whether to sort cytotable output or not.
      drop_null: bool (Default value = False)
@@ -1440,6 +1427,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
          if chunk_size is None
          else chunk_size
      )
+     page_keys = (
+         cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+         if page_keys is None
+         else page_keys
+     )
+
+     # Raise an exception for scenarios where one configures CytoTable to join
+     # but does not provide a pagination key for the joins.
+     if join and (page_keys is None or "join" not in page_keys.keys()):
+         raise CytoTableException(
+             (
+                 "When using join=True one must pass a 'join' pagination key "
+                 "in the page_keys parameter. The 'join' pagination key is a column "
+                 "name found within the joined results based on the SQL provided from "
+                 "the joins parameter. This special key is required as not all columns "
+                 "from the source tables might be included."
+             )
+         )

      # send sources to be written to parquet if selected
      if dest_datatype == "parquet":
@@ -1458,6 +1463,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
              drop_null=drop_null,
              data_type_cast_map=data_type_cast_map,
              sort_output=sort_output,
+             page_keys=cast(dict, page_keys),
              **kwargs,
          )

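Taken together, these changes surface as one new user-facing parameter on convert(). A sketch of a call with explicit pagination keys (paths are hypothetical; the mapping mirrors the CONFIG_PAGE_KEYS preset entries added below, including the special "join" key required when join=True):

    from cytotable import convert

    result = convert(
        source_path="./data/all_cellprofiler.sqlite",  # hypothetical path
        dest_path="./converted.parquet",
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
        chunk_size=1000,
        page_keys={
            "image": "ImageNumber",
            "cells": "Cells_Number_Object_Number",
            "nuclei": "Nuclei_Number_Object_Number",
            "cytoplasm": "Cytoplasm_Number_Object_Number",
            "join": "Cytoplasm_Number_Object_Number",
        },
    )
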
cytotable/presets.py CHANGED
@@ -22,6 +22,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -61,6 +71,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "Cells_Number_Object_Number",
+             "nuclei": "Nuclei_Number_Object_Number",
+             "cytoplasm": "Cytoplasm_Number_Object_Number",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -104,6 +124,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -155,6 +185,16 @@ config = {
              "Cells_Number_Object_Number",
              "Nuclei_Number_Object_Number",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "Cells_Number_Object_Number",
+             "nuclei": "Nuclei_Number_Object_Number",
+             "cytoplasm": "Cytoplasm_Number_Object_Number",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -203,6 +243,16 @@ config = {
              "Cells_ObjectNumber",
              "Nuclei_ObjectNumber",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -248,6 +298,12 @@ config = {
              "Z",
              "T",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "test": '"OBJECT ID"',
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
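
One detail worth noting in the final preset above: a page key may carry its own double quotes (here '"OBJECT ID"') so that an identifier containing a space interpolates into the generated SQL as a quoted identifier. A hypothetical rendering:

    page_key = '"OBJECT ID"'
    pageset = (0, 99)
    print(
        f"SELECT {page_key} FROM test "
        f"WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}"
    )
    # SELECT "OBJECT ID" FROM test WHERE "OBJECT ID" BETWEEN 0 AND 99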
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, List, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast

  import duckdb
  import parsl
@@ -173,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
  def _sqlite_mixed_type_query_to_parquet(
      source_path: str,
      table_name: str,
-     chunk_size: int,
-     offset: int,
+     page_key: str,
+     pageset: Tuple[Union[int, float], Union[int, float]],
      sort_output: bool,
-     add_cytotable_meta: bool = False,
  ) -> str:
      """
      Performs SQLite table data extraction where one or many
@@ -188,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
              A str which is a path to a SQLite database file.
          table_name: str:
              The name of the table being queried.
-         chunk_size: int:
-             Row count to use for chunked output.
-         offset: int:
-             The offset for chunking the data from source.
+         page_key: str:
+             The column name to be used to identify pagination chunks.
+         pageset: Tuple[int, int]:
+             The range for values used for paginating data from source.
          sort_output: bool
              Specifies whether to sort cytotable output or not.
          add_cytotable_meta: bool, default=False:
@@ -205,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(

      import pyarrow as pa

-     from cytotable.constants import (
-         CYOTABLE_META_COLUMN_TYPES,
-         SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
-     )
+     from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
      from cytotable.exceptions import DatatypeException

      # open sqlite3 connection
@@ -268,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
          for col in column_info
      ]

-     if add_cytotable_meta:
-         query_parts += [
-             (
-                 f"CAST( '{f'{source_path}_table_{table_name}'}' "
-                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
-                 "AS cytotable_meta_source_path"
-             ),
-             (
-                 f"CAST( {offset} "
-                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
-                 "AS cytotable_meta_offset"
-             ),
-             (
-                 f"CAST( (ROW_NUMBER() OVER ()) AS "
-                 f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
-                 "AS cytotable_meta_rownum"
-             ),
-         ]
-
      # perform the select using the cases built above and the pagination range
-     sql_stmt = (
-         f"""
+     sql_stmt = f"""
          SELECT
          {', '.join(query_parts)}
          FROM {table_name}
-         ORDER BY {', '.join([col['column_name'] for col in column_info])}
-         LIMIT {chunk_size} OFFSET {offset};
+         WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+         {"ORDER BY " + page_key if sort_output else ""};
          """
-         if sort_output
-         else f"""
-         SELECT
-         {', '.join(query_parts)}
-         FROM {table_name}
-         LIMIT {chunk_size} OFFSET {offset};
-         """
-     )

      # execute the sql stmt
      cursor.execute(sql_stmt)
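
As with the DuckDB path, the mixed-type SQLite query now pages by BETWEEN and only orders when requested. Rendering the statement for both sort settings, with hypothetical query parts, shows the two shapes it can take:

    table_name = "Image"
    query_parts = ["CAST(ImageNumber AS INTEGER) AS ImageNumber"]
    page_key, pageset = "ImageNumber", (1, 50)

    for sort_output in (True, False):
        sql_stmt = f"""
            SELECT
            {', '.join(query_parts)}
            FROM {table_name}
            WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
            {"ORDER BY " + page_key if sort_output else ""};
            """
        print(sql_stmt)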
@@ -600,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> An
          if isinstance(sources, dict)
          else _unwrap_value(sources)
      )
+
+
+ def _generate_pagesets(
+     keys: List[Union[int, float]], chunk_size: int
+ ) -> List[Tuple[Union[int, float], Union[int, float]]]:
+     """
+     Generate a pageset (keyset pagination) from a list of keys.
+
+     Parameters:
+         keys List[Union[int, float]]:
+             List of keys to paginate.
+         chunk_size int:
+             Size of each chunk/page.
+
+     Returns:
+         List[Tuple[Union[int, float], Union[int, float]]]:
+             List of (start_key, end_key) tuples representing each page.
+     """
+
+     # Initialize an empty list to store the chunks/pages
+     chunks = []
+
+     # Start index for iteration through the keys
+     i = 0
+
+     while i < len(keys):
+         # Get the start key for the current chunk
+         start_key = keys[i]
+
+         # Calculate the end index for the current chunk
+         end_index = min(i + chunk_size, len(keys)) - 1
+
+         # Get the end key for the current chunk
+         end_key = keys[end_index]
+
+         # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+         while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+             end_index += 1
+
+         # Append the current chunk (start_key, end_key) to the list of chunks
+         chunks.append((start_key, end_key))
+
+         # Update the index to start from the next chunk
+         i = end_index + 1
+
+     # Return the list of chunks/pages
+     return chunks
+
+
+ def _natural_sort(list_to_sort):
+     """
+     Sorts the given iterable using natural sort adapted from approach
+     provided by the following link:
+     https://stackoverflow.com/a/4836734
+
+     Args:
+         list_to_sort: List:
+             The list to sort.
+
+     Returns:
+         List: The sorted list.
+     """
+     import re
+
+     return sorted(
+         list_to_sort,
+         # use a custom key to sort the list
+         key=lambda key: [
+             # use integer of c if it's a digit, otherwise str
+             int(c) if c.isdigit() else c
+             # Split the key into parts, separating numbers from alphabetic characters
+             for c in re.split("([0-9]+)", str(key))
+         ],
+     )
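
`_generate_pagesets` deserves a quick illustration, since the inner while loop is what keeps duplicate keys from straddling two pages (the BETWEEN queries are inclusive on both ends). Assuming the function as defined above, with made-up keys:

    keys = [1, 2, 3, 3, 3, 4, 5, 6]

    # chunk_size=2: the page that ends on key 3 absorbs every trailing
    # duplicate of 3, so no key value appears in two pagesets
    print(_generate_pagesets(keys, chunk_size=2))
    # [(1, 2), (3, 3), (4, 5), (6, 6)]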
cytotable-0.0.10.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.9
+ Version: 0.0.10
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
cytotable-0.0.10.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
+ cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+ cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+ cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+ cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
+ cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
+ cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ cytotable-0.0.10.dist-info/RECORD,,
cytotable-0.0.9.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
- cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
- cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
- cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
- cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
- cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
- cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.9.dist-info/RECORD,,