CytoTable 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """

  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.9"
+ __version__ = "0.0.10"

  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
      ],
  }

- # metadata column names and types for internal use within CytoTable
- CYOTABLE_META_COLUMN_TYPES = {
-     "cytotable_meta_source_path": "VARCHAR",
-     "cytotable_meta_offset": "BIGINT",
-     "cytotable_meta_rownum": "BIGINT",
- }
-
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
      "data-producer": "https://github.com/cytomining/CytoTable",
      "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pycytominer.

  import itertools
  import logging
- import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast

  import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
 
      Args:
          source: Dict[str, Any]
-             Contains the source data to be chunked. Represents a single
+             Contains source data details. Represents a single
              file or table of some kind.
          sort_output:
              Specifies whether to sort cytotable output or not.
@@ -43,10 +42,7 @@ def _get_table_columns_and_types(
              list of dictionaries which each include column level information
      """

-     import pathlib
-
      import duckdb
-     from cloudpathlib import AnyPath

      from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet

@@ -89,7 +85,7 @@ def _get_table_columns_and_types(
      # with exception handling to read mixed-type data
      # using sqlite3 and special utility function
      try:
-         # isolate using new connection to read data with chunk size + offset
+         # isolate using new connection to read data based on pageset
          # and export directly to parquet via duckdb (avoiding need to return data to python)
          # perform the query and create a list of dictionaries with the column data for table
          with _duckdb_reader() as ddb_reader:
@@ -109,13 +105,8 @@ def _get_table_columns_and_types(
          arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
              source_path=str(source["source_path"]),
              table_name=str(source["table_name"]),
-             # chunk size is set to 5 as a limit similar
-             # to above SQL within select_query variable
-             chunk_size=5,
-             # offset is set to 0 start at first row
-             # result from table
-             offset=0,
-             add_cytotable_meta=False,
+             page_key=source["page_key"],
+             pageset=source["pagesets"][0],
              sort_output=sort_output,
          )
          with _duckdb_reader() as ddb_reader:
@@ -183,13 +174,14 @@ def _prep_cast_column_data_types(


  @python_app
- def _get_table_chunk_offsets(
+ def _get_table_keyset_pagination_sets(
      chunk_size: int,
+     page_key: str,
      source: Optional[Dict[str, Any]] = None,
      sql_stmt: Optional[str] = None,
- ) -> Union[List[int], None]:
+ ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
      """
-     Get table data chunk offsets for later use in capturing segments
+     Get table data chunk keys for later use in capturing segments
      of values. This work also provides a chance to catch problematic
      input data which will be ignored with warnings.

@@ -199,21 +191,27 @@ def _get_table_chunk_offsets(
              file or table of some kind.
          chunk_size: int
              The size in rowcount of the chunks to create.
+         page_key: str
+             The column name to be used to identify pagination chunks.
+             Expected to be of numeric type (int, float) for ordering.
+         sql_stmt:
+             Optional sql statement to form the pagination set from.
+             Default behavior extracts pagination sets from the full
+             data source.

      Returns:
-         List[int]
-             List of integers which represent offsets to use for reading
-             the data later on.
+         List[Any]
+             List of keys to use for reading the data later on.
      """

      import logging
-     import pathlib
+     import sqlite3
+     from contextlib import closing

      import duckdb
-     from cloudpathlib import AnyPath, CloudPath

      from cytotable.exceptions import NoInputDataException
-     from cytotable.utils import _duckdb_reader
+     from cytotable.utils import _duckdb_reader, _generate_pagesets

      logger = logging.getLogger(__name__)

@@ -223,18 +221,29 @@ def _get_table_chunk_offsets(
          source_type = str(source_path.suffix).lower()

          try:
-             # gather the total rowcount from csv or sqlite data input sources
              with _duckdb_reader() as ddb_reader:
-                 rowcount = int(
-                     ddb_reader.execute(
-                         # nosec
-                         f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
-                         if source_type == ".csv"
-                         else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
-                     ).fetchone()[0]
-                 )
+                 if source_type == ".csv":
+                     sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+                 else:
+                     sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+                 page_keys = [
+                     results[0] for results in ddb_reader.execute(sql_query).fetchall()
+                 ]
+
+         # exception case for when we have mixed types
+         # (i.e. integer col with string and ints) in a sqlite column
+         except duckdb.TypeMismatchException:
+             with closing(sqlite3.connect(source_path)) as cx:
+                 with cx:
+                     page_keys = [
+                         key[0]
+                         for key in cx.execute(
+                             f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+                         ).fetchall()
+                         if isinstance(key[0], (int, float))
+                     ]

-         # catch input errors which will result in skipped files
          except (
              duckdb.InvalidInputException,
              NoInputDataException,
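
A note on the new fallback above: DuckDB's sqlite_scan raises a TypeMismatchException when a SQLite column mixes storage classes (for example, integers alongside strings), so the page keys are re-read through the sqlite3 module and non-numeric values are skipped. A minimal, self-contained sketch of that filtering, using a hypothetical in-memory table and column names:

    import sqlite3
    from contextlib import closing

    # build a table whose key column mixes ints with a stray string,
    # mimicking the mixed-type data this fallback tolerates
    with closing(sqlite3.connect(":memory:")) as cx:
        cx.execute("CREATE TABLE example (TableNumber, value)")
        cx.executemany(
            "INSERT INTO example VALUES (?, ?)",
            [(1, "a"), (2, "b"), ("oops", "c"), (3, "d")],
        )
        # keep only keys usable for numeric keyset pagination
        page_keys = [
            key[0]
            for key in cx.execute(
                "SELECT TableNumber FROM example ORDER BY TableNumber;"
            ).fetchall()
            if isinstance(key[0], (int, float))
        ]

    print(page_keys)  # [1, 2, 3] (the string key is skipped)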
@@ -245,34 +254,20 @@ def _get_table_chunk_offsets(

              return None

-     # find chunk offsets from sql statement
      elif sql_stmt is not None:
-         # gather the total rowcount from csv or sqlite data input sources
          with _duckdb_reader() as ddb_reader:
-             rowcount = int(
-                 ddb_reader.execute(
-                     # nosec
-                     f"SELECT COUNT(*) FROM ({sql_stmt})"
-                 ).fetchone()[0]
-             )
+             sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+             page_keys = ddb_reader.execute(sql_query).fetchall()
+             page_keys = [key[0] for key in page_keys]

-     return list(
-         range(
-             0,
-             # gather rowcount from table and use as maximum for range
-             rowcount,
-             # step through using chunk size
-             chunk_size,
-         )
-     )
+     return _generate_pagesets(page_keys, chunk_size)


  @python_app
- def _source_chunk_to_parquet(
+ def _source_pageset_to_parquet(
      source_group_name: str,
      source: Dict[str, Any],
-     chunk_size: int,
-     offset: int,
+     pageset: Tuple[Union[int, float], Union[int, float]],
      dest_path: str,
      sort_output: bool,
  ) -> str:
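
The return value changes shape here: instead of integer offsets for LIMIT/OFFSET queries, the task now returns inclusive (start_key, end_key) ranges over a real column. A rough illustration of the difference, simplified to ignore the duplicate-key handling that `_generate_pagesets` adds (see utils.py below):

    # 0.0.9 style: offsets stepping through 10 rows, chunk_size=4
    offsets = list(range(0, 10, 4))
    print(offsets)  # [0, 4, 8]

    # 0.0.10 style: inclusive key ranges, assuming keys are simply 1..10
    keys = list(range(1, 11))
    pagesets = [
        (keys[i], keys[min(i + 4, len(keys)) - 1])
        for i in range(0, len(keys), 4)
    ]
    print(pagesets)  # [(1, 4), (5, 8), (9, 10)]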
@@ -285,10 +280,8 @@ def _source_chunk_to_parquet(
          source: Dict[str, Any]
              Contains the source data to be chunked. Represents a single
              file or table of some kind along with collected information about table.
-         chunk_size: int
-             Row count to use for chunked output.
-         offset: int
-             The offset for chunking the data from source.
+         pageset: Tuple[int, int]
+             The pageset for chunking the data from source.
          dest_path: str
              Path to store the output data.
          sort_output: bool
@@ -303,9 +296,7 @@ def _source_chunk_to_parquet(

      import duckdb
      from cloudpathlib import AnyPath
-     from pyarrow import parquet

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import (
          _duckdb_reader,
          _sqlite_mixed_type_query_to_parquet,
@@ -319,26 +310,6 @@ def _source_chunk_to_parquet(
      )
      pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

-     source_path_str = (
-         source["source_path"]
-         if "table_name" not in source.keys()
-         else f"{source['source_path']}_table_{source['table_name']}"
-     )
-     # build the column selection block of query
-
-     # add cytotable metadata columns
-     cytotable_metadata_cols = [
-         (
-             f"CAST( '{source_path_str}' "
-             f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
-             ' AS "cytotable_meta_source_path"'
-         ),
-         f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
-         (
-             f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
-             ' AS "cytotable_meta_rownum"'
-         ),
-     ]
      # add source table columns
      casted_source_cols = [
          # here we cast the column to the specified type to ensure the colname remains the same
@@ -349,7 +320,7 @@ def _source_chunk_to_parquet(
      # create selection statement from lists above
      select_columns = ",".join(
          # if we should sort the output, add the metadata_cols
-         cytotable_metadata_cols + casted_source_cols
+         casted_source_cols
          if sort_output
          else casted_source_cols
      )
@@ -364,7 +335,8 @@ def _source_chunk_to_parquet(
          base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
          result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"

-     result_filepath = f"{result_filepath_base}-{offset}.parquet"
+     # form a filepath which indicates the pageset
+     result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"

      # Attempt to read the data to parquet file
      # using duckdb for extraction and pyarrow for
@@ -377,14 +349,9 @@ def _source_chunk_to_parquet(
                  table=ddb_reader.execute(
                      f"""
                      {base_query}
-                     /* order by all columns for deterministic output */
-                     ORDER BY ALL
-                     LIMIT {chunk_size} OFFSET {offset}
-                     """
-                     if sort_output
-                     else f"""
-                     {base_query}
-                     LIMIT {chunk_size} OFFSET {offset}
+                     WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+                     /* optional ordering per pageset */
+                     {"ORDER BY " + source['page_key'] if sort_output else ""};
                      """
                  ).arrow(),
                  where=result_filepath,
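
Each page is now extracted with an inclusive BETWEEN predicate on the page key rather than LIMIT/OFFSET, so DuckDB only materializes rows within that key range. A small runnable sketch of the same pattern (table contents and names are illustrative, not from the package):

    import duckdb

    page_key = "ImageNumber"
    pageset = (3, 4)  # inclusive (start_key, end_key)

    ddb = duckdb.connect()
    ddb.execute(
        "CREATE TABLE image AS "
        "SELECT * FROM (VALUES (1), (2), (3), (4), (5)) t(ImageNumber)"
    )
    tbl = ddb.execute(
        f"""
        SELECT ImageNumber FROM image
        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
        ORDER BY {page_key};
        """
    ).arrow()
    print(tbl["ImageNumber"].to_pylist())  # [3, 4]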
@@ -406,9 +373,8 @@ def _source_chunk_to_parquet(
                  table=_sqlite_mixed_type_query_to_parquet(
                      source_path=str(source["source_path"]),
                      table_name=str(source["table_name"]),
-                     chunk_size=chunk_size,
-                     offset=offset,
-                     add_cytotable_meta=True if sort_output else False,
+                     page_key=source["page_key"],
+                     pageset=pageset,
                      sort_output=sort_output,
                  ),
                  where=result_filepath,
@@ -458,10 +424,7 @@ def _prepend_column_name(

      import pyarrow.parquet as parquet

-     from cytotable.constants import (
-         CYOTABLE_META_COLUMN_TYPES,
-         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-     )
+     from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
      from cytotable.utils import _write_parquet_table_with_metadata

      logger = logging.getLogger(__name__)
@@ -472,7 +435,7 @@ def _prepend_column_name(
      if len(targets) == 0:
          logger.warning(
              msg=(
-                 "Skipping column name prepend operations"
+                 "Skipping column name prepend operations "
                  "because no compartments or metadata were provided."
              )
          )
@@ -509,10 +472,8 @@ def _prepend_column_name(
          # source_group_name_stem: 'Cells'
          # column_name: 'AreaShape_Area'
          # updated_column_name: 'Cells_AreaShape_Area'
-         if (
-             column_name not in identifying_columns
-             and not column_name.startswith(source_group_name_stem.capitalize())
-             and column_name not in CYOTABLE_META_COLUMN_TYPES
+         if column_name not in identifying_columns and not column_name.startswith(
+             source_group_name_stem.capitalize()
          ):
              updated_column_names.append(f"{source_group_name_stem}_{column_name}")
          # if-condition for prepending 'Metadata_' to column name
@@ -574,6 +535,7 @@ def _concat_source_group(
      source_group: List[Dict[str, Any]],
      dest_path: str,
      common_schema: Optional[List[Tuple[str, str]]] = None,
+     sort_output: bool = True,
  ) -> List[Dict[str, Any]]:
      """
      Concatenate group of source data together as single file.
@@ -620,6 +582,8 @@ def _concat_source_group(
          common_schema: List[Tuple[str, str]] (Default value = None)
              Common schema to use for concatenation amongst arrow tables
              which may have slightly different but compatible schema.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.

      Returns:
          List[Dict[str, Any]]
@@ -637,7 +601,7 @@ def _concat_source_group(
          CYTOTABLE_DEFAULT_PARQUET_METADATA,
      )
      from cytotable.exceptions import SchemaException
-     from cytotable.utils import _write_parquet_table_with_metadata
+     from cytotable.utils import _natural_sort

      # build a result placeholder
      concatted: List[Dict[str, Any]] = [
@@ -676,7 +640,10 @@ def _concat_source_group(
      # (all must be the same schema)
      with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
          for source in source_group:
-             for table in [table for table in source["table"]]:
+             tables = [table for table in source["table"]]
+             if sort_output:
+                 tables = _natural_sort(tables)
+             for table in tables:
                  # if we haven't inferred the common schema
                  # check that our file matches the expected schema, otherwise raise an error
                  if common_schema is None and not writer_schema.equals(
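
Sorting the chunk files with natural sort matters because the new pageset filenames embed numeric key ranges: plain lexicographic ordering would place "cells-10-19.parquet" before "cells-2-9.parquet". A sketch mirroring the `_natural_sort` helper added in utils.py (filenames hypothetical):

    import re

    def natural_sort(items):
        # split names into digit and non-digit runs so numeric parts
        # compare as integers rather than strings
        return sorted(
            items,
            key=lambda item: [
                int(part) if part.isdigit() else part
                for part in re.split("([0-9]+)", str(item))
            ],
        )

    files = ["cells-10-19.parquet", "cells-2-9.parquet", "cells-1-1.parquet"]
    print(sorted(files))
    # ['cells-1-1.parquet', 'cells-10-19.parquet', 'cells-2-9.parquet']
    print(natural_sort(files))
    # ['cells-1-1.parquet', 'cells-2-9.parquet', 'cells-10-19.parquet']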
@@ -720,7 +687,6 @@ def _concat_source_group(
  def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
      joins: str,
-     sort_output: bool,
  ) -> str:
      """
      Prepare join SQL statement with actual locations of data based on the sources.
@@ -741,8 +707,6 @@ def _prepare_join_sql(
      """
      import pathlib

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
      # replace with real location of sources for join sql
      order_by_tables = []
      for key, val in sources.items():
@@ -754,25 +718,17 @@ def _prepare_join_sql(
              )
              order_by_tables.append(table_name)

-     # create order by statement with from all tables using cytotable metadata
-     order_by_sql = "ORDER BY " + ", ".join(
-         [
-             f"{table}.{meta_column}"
-             for table in order_by_tables
-             for meta_column in CYOTABLE_META_COLUMN_TYPES
-         ]
-     )
-
      # add the order by statements to the join
-     return joins + order_by_sql if sort_output else joins
+     return joins


  @python_app
- def _join_source_chunk(
+ def _join_source_pageset(
      dest_path: str,
      joins: str,
-     chunk_size: int,
-     offset: int,
+     page_key: str,
+     pageset: Tuple[int, int],
+     sort_output: bool,
      drop_null: bool,
  ) -> str:
      """
@@ -798,31 +754,20 @@ def _join_source_chunk(

      import pathlib

-     from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
      from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

-     # Attempt to read the data to parquet file
-     # using duckdb for extraction and pyarrow for
-     # writing data to a parquet file.
-     # read data with chunk size + offset
-     # and export to parquet
-     exclude_meta_cols = [
-         f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
-     ]
-
      with _duckdb_reader() as ddb_reader:
          result = ddb_reader.execute(
              f"""
-             WITH joined AS (
+             WITH joined AS (
                  {joins}
-                 LIMIT {chunk_size} OFFSET {offset}
-             )
-             SELECT
-                 /* exclude metadata columns from the results
-                 by using a lambda on column names based on exclude_meta_cols. */
-                 COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
-             FROM joined;
-             """
+             )
+             SELECT *
+             FROM joined
+             WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+             /* optional sorting per pageset */
+             {"ORDER BY " + page_key if sort_output else ""};
+             """
          ).arrow()

      # drop nulls if specified
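
With the metadata-column machinery gone, the join task now slices the joined result purely by the "join" page key. Rendering the statement above with hypothetical inputs shows the final SQL each task executes:

    joins = "SELECT * FROM cytoplasm LEFT JOIN cells USING (ImageNumber)"
    page_key = "Cytoplasm_Number_Object_Number"
    pageset = (1, 1000)
    sort_output = True

    print(
        f"""
        WITH joined AS (
            {joins}
        )
        SELECT *
        FROM joined
        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
        {"ORDER BY " + page_key if sort_output else ""};
        """
    )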
@@ -847,10 +792,8 @@ def _join_source_chunk(
          f"{str(pathlib.Path(dest_path).parent)}/"
          # use the dest_path stem in the name
          f"{str(pathlib.Path(dest_path).stem)}-"
-         # give the join chunk result a unique to arbitrarily
-         # differentiate from other chunk groups which are mapped
-         # and before they are brought together as one dataset
-         f"{str(uuid.uuid4().hex)}.parquet"
+         # add the pageset indication to the filename
+         f"{pageset[0]}-{pageset[1]}.parquet"
      )

      # write the result
@@ -867,6 +810,7 @@ def _concat_join_sources(
      sources: Dict[str, List[Dict[str, Any]]],
      dest_path: str,
      join_sources: List[str],
+     sort_output: bool = True,
  ) -> str:
      """
      Concatenate join sources from parquet-based chunks.
@@ -883,6 +827,8 @@ def _concat_join_sources(
          join_sources: List[str]:
              List of local filepath destination for join source chunks
              which will be concatenated.
+         sort_output: bool
+             Specifies whether to sort cytotable output or not.

      Returns:
          str
@@ -898,7 +844,7 @@ def _concat_join_sources(
          CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
          CYTOTABLE_DEFAULT_PARQUET_METADATA,
      )
-     from cytotable.utils import _write_parquet_table_with_metadata
+     from cytotable.utils import _natural_sort

      # remove the unjoined concatted compartments to prepare final dest_path usage
      # (we now have joined results)
@@ -918,7 +864,11 @@ def _concat_join_sources(
          CYTOTABLE_DEFAULT_PARQUET_METADATA
      )
      with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
-         for table_path in join_sources:
+         for table_path in (
+             join_sources
+             if not sort_output
+             else _natural_sort(list_to_sort=join_sources)
+         ):
              writer.write_table(
                  parquet.read_table(
                      table_path,
@@ -1042,6 +992,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      infer_common_schema: bool,
      drop_null: bool,
      sort_output: bool,
+     page_keys: Dict[str, str],
      data_type_cast_map: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1082,6 +1033,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              Whether to drop null results.
          sort_output: bool
              Specifies whether to sort cytotable output or not.
+         page_keys: Dict[str, str]
+             A dictionary which defines which column names are used for keyset pagination
+             in order to perform data extraction.
          data_type_cast_map: Dict[str, str]
              A dictionary mapping data type groups to specific types.
              Roughly includes Arrow data types language from:
@@ -1112,16 +1066,35 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
      # expand the destination path
      expanded_dest_path = _expand_path(path=dest_path)

-     # prepare offsets for chunked data export from source tables
-     offsets_prepared = {
+     # check that each source group name has a pagination key
+     for source_group_name in sources.keys():
+         matching_keys = [
+             key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+         ]
+         if not matching_keys:
+             raise CytoTableException(
+                 f"No matching key found in page_keys for source_group_name: {source_group_name}. "
+                 "Please include a pagination key based on a column name from the table."
+             )
+
+     # prepare pagesets for chunked data export from source tables
+     pagesets_prepared = {
          source_group_name: [
              dict(
                  source,
                  **{
-                     "offsets": _get_table_chunk_offsets(
+                     "page_key": (
+                         page_key := [
+                             value
+                             for key, value in page_keys.items()
+                             if key.lower() in source_group_name.lower()
+                         ][0]
+                     ),
+                     "pagesets": _get_table_keyset_pagination_sets(
                          source=source,
                          chunk_size=chunk_size,
-                     )
+                         page_key=page_key,
+                     ),
                  },
              )
              for source in source_group_vals
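
The page key for each source group is resolved by a case-insensitive substring match between the page_keys entries and the group name, so one "cytoplasm" entry covers names like "Per_Cytoplasm.sqlite". A small demonstration of the same lookup (names hypothetical):

    page_keys = {
        "image": "ImageNumber",
        "cells": "ObjectNumber",
        "cytoplasm": "ObjectNumber",
        "join": "Cytoplasm_Number_Object_Number",
    }
    source_group_name = "Per_Cytoplasm.sqlite"

    page_key = [
        value
        for key, value in page_keys.items()
        if key.lower() in source_group_name.lower()
    ][0]
    print(page_key)  # ObjectNumber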
@@ -1129,17 +1102,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
          for source_group_name, source_group_vals in sources.items()
      }

-     # if offsets is none and we haven't halted, remove the file as there
+     # if pagesets is none and we haven't halted, remove the file as there
      # were input formatting errors which will create challenges downstream
      invalid_files_dropped = {
          source_group_name: [
-             # ensure we have offsets
+             # ensure we have pagesets
              source
              for source in source_group_vals
-             if source["offsets"] is not None
+             if source["pagesets"] is not None
          ]
          for source_group_name, source_group_vals in evaluate_futures(
-             offsets_prepared
+             pagesets_prepared
          ).items()
          # ensure we have source_groups with at least one source table
          if len(source_group_vals) > 0
@@ -1172,12 +1145,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                  "table": [
                      # perform column renaming and create potential return result
                      _prepend_column_name(
-                         # perform chunked data export to parquet using offsets
-                         table_path=_source_chunk_to_parquet(
+                         # perform chunked data export to parquet using pagesets
+                         table_path=_source_pageset_to_parquet(
                              source_group_name=source_group_name,
                              source=source,
-                             chunk_size=chunk_size,
-                             offset=offset,
+                             pageset=pageset,
                              dest_path=expanded_dest_path,
                              sort_output=sort_output,
                          ),
@@ -1186,7 +1158,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                          metadata=metadata,
                          compartments=compartments,
                      )
-                     for offset in source["offsets"]
+                     for pageset in source["pagesets"]
                  ]
              },
          )
@@ -1227,6 +1199,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              source_group=source_group_vals[0]["sources"],
              dest_path=expanded_dest_path,
              common_schema=source_group_vals[0]["common_schema"],
+             sort_output=sort_output,
          )
          for source_group_name, source_group_vals in evaluate_futures(
              common_schema_determined
@@ -1240,28 +1213,34 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
          evaluated_results = evaluate_futures(results)

          prepared_joins_sql = _prepare_join_sql(
-             sources=evaluated_results, joins=joins, sort_output=sort_output
+             sources=evaluated_results, joins=joins
          ).result()

+         page_key_join = [
+             value for key, value in page_keys.items() if key.lower() == "join"
+         ][0]
+
          # map joined results based on the join groups gathered above
          # note: after mapping we end up with a list of strings (task returns str)
          join_sources_result = [
-             _join_source_chunk(
+             _join_source_pageset(
                  # gather the result of concatted sources prior to
                  # join group merging as each mapped task run will need
                  # full concat results
                  dest_path=expanded_dest_path,
                  joins=prepared_joins_sql,
-                 chunk_size=chunk_size,
-                 offset=offset,
+                 page_key=page_key_join,
+                 pageset=pageset,
+                 sort_output=sort_output,
                  drop_null=drop_null,
              )
              # create join group for querying the concatenated
              # data in order to perform memory-safe joining
              # per user chunk size specification.
-             for offset in _get_table_chunk_offsets(
+             for pageset in _get_table_keyset_pagination_sets(
                  sql_stmt=prepared_joins_sql,
                  chunk_size=chunk_size,
+                 page_key=page_key_join,
              ).result()
          ]

@@ -1272,6 +1251,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
              dest_path=expanded_dest_path,
              join_sources=[join.result() for join in join_sources_result],
              sources=evaluated_results,
+             sort_output=sort_output,
          )

      # wrap the final result as a future and return
@@ -1293,6 +1273,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
      infer_common_schema: bool = True,
      drop_null: bool = False,
      data_type_cast_map: Optional[Dict[str, str]] = None,
+     page_keys: Optional[Dict[str, str]] = None,
      sort_output: bool = True,
      preset: Optional[str] = "cellprofiler_csv",
      parsl_config: Optional[parsl.Config] = None,
@@ -1341,6 +1322,12 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
          A dictionary mapping data type groups to specific types.
          Roughly includes Arrow data types language from:
          https://arrow.apache.org/docs/python/api/datatypes.html
+     page_keys: Dict[str, str]:
+         The table and column names to be used for key pagination.
+         Uses the form: {"table_name":"column_name"}.
+         Expects columns to include numeric data (ints or floats).
+         Interacts with the `chunk_size` parameter to form
+         pages of up to `chunk_size` rows.
      sort_output: bool (Default value = True)
          Specifies whether to sort cytotable output or not.
      drop_null: bool (Default value = False)
@@ -1440,6 +1427,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
          if chunk_size is None
          else chunk_size
      )
+     page_keys = (
+         cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+         if page_keys is None
+         else page_keys
+     )
+
+     # Raise an exception for scenarios where one configures CytoTable to join
+     # but does not provide a pagination key for the joins.
+     if join and (page_keys is None or "join" not in page_keys.keys()):
+         raise CytoTableException(
+             (
+                 "When using join=True one must pass a 'join' pagination key "
+                 "in the page_keys parameter. The 'join' pagination key is a column "
+                 "name found within the joined results based on the SQL provided from "
+                 "the joins parameter. This special key is required as not all columns "
+                 "from the source tables might be included."
+             )
+         )

      # send sources to be written to parquet if selected
      if dest_datatype == "parquet":
@@ -1458,6 +1463,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
              drop_null=drop_null,
              data_type_cast_map=data_type_cast_map,
              sort_output=sort_output,
+             page_keys=cast(dict, page_keys),
              **kwargs,
          )

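Taken together, these changes surface as one new user-facing parameter on convert(). A sketch of a call with explicit pagination keys (paths are hypothetical; the mapping mirrors the CONFIG_PAGE_KEYS preset entries added below, including the special "join" key required when join=True):

    from cytotable import convert

    result = convert(
        source_path="./data/all_cellprofiler.sqlite",  # hypothetical path
        dest_path="./converted.parquet",
        dest_datatype="parquet",
        preset="cellprofiler_sqlite_pycytominer",
        chunk_size=1000,
        page_keys={
            "image": "ImageNumber",
            "cells": "Cells_Number_Object_Number",
            "nuclei": "Nuclei_Number_Object_Number",
            "cytoplasm": "Cytoplasm_Number_Object_Number",
            "join": "Cytoplasm_Number_Object_Number",
        },
    )
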
cytotable/presets.py CHANGED
@@ -22,6 +22,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -61,6 +71,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "Cells_Number_Object_Number",
+             "nuclei": "Nuclei_Number_Object_Number",
+             "cytoplasm": "Cytoplasm_Number_Object_Number",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -104,6 +124,16 @@ config = {
              "Parent_Cells",
              "Parent_Nuclei",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -155,6 +185,16 @@ config = {
              "Cells_Number_Object_Number",
              "Nuclei_Number_Object_Number",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "Cells_Number_Object_Number",
+             "nuclei": "Nuclei_Number_Object_Number",
+             "cytoplasm": "Cytoplasm_Number_Object_Number",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -203,6 +243,16 @@ config = {
              "Cells_ObjectNumber",
              "Nuclei_ObjectNumber",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "image": "ImageNumber",
+             "cells": "ObjectNumber",
+             "nuclei": "ObjectNumber",
+             "cytoplasm": "ObjectNumber",
+             "join": "Cytoplasm_Number_Object_Number",
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
@@ -248,6 +298,12 @@ config = {
              "Z",
              "T",
          ),
+         # pagination keys for use with this data
+         # of the rough format "table" -> "column".
+         # note: page keys are expected to be numeric (int, float)
+         "CONFIG_PAGE_KEYS": {
+             "test": '"OBJECT ID"',
+         },
          # chunk size to use for join operations to help with possible performance issues
          # note: this number is an estimate and may need changes contingent on data
          # and system used by this library.
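
One detail worth noting in the final preset above: a page key may carry its own double quotes (here '"OBJECT ID"') so that an identifier containing a space interpolates into the generated SQL as a quoted identifier. A hypothetical rendering:

    page_key = '"OBJECT ID"'
    pageset = (0, 99)
    print(
        f"SELECT {page_key} FROM test "
        f"WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}"
    )
    # SELECT "OBJECT ID" FROM test WHERE "OBJECT ID" BETWEEN 0 AND 99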
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, List, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast

  import duckdb
  import parsl
@@ -173,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
  def _sqlite_mixed_type_query_to_parquet(
      source_path: str,
      table_name: str,
-     chunk_size: int,
-     offset: int,
+     page_key: str,
+     pageset: Tuple[Union[int, float], Union[int, float]],
      sort_output: bool,
-     add_cytotable_meta: bool = False,
  ) -> str:
      """
      Performs SQLite table data extraction where one or many
@@ -188,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
              A str which is a path to a SQLite database file.
          table_name: str:
              The name of the table being queried.
-         chunk_size: int:
-             Row count to use for chunked output.
-         offset: int:
-             The offset for chunking the data from source.
+         page_key: str:
+             The column name to be used to identify pagination chunks.
+         pageset: Tuple[int, int]:
+             The range for values used for paginating data from source.
          sort_output: bool
              Specifies whether to sort cytotable output or not.
          add_cytotable_meta: bool, default=False:
@@ -205,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(

      import pyarrow as pa

-     from cytotable.constants import (
-         CYOTABLE_META_COLUMN_TYPES,
-         SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
-     )
+     from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
      from cytotable.exceptions import DatatypeException

      # open sqlite3 connection
@@ -268,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
          for col in column_info
      ]

-     if add_cytotable_meta:
-         query_parts += [
-             (
-                 f"CAST( '{f'{source_path}_table_{table_name}'}' "
-                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
-                 "AS cytotable_meta_source_path"
-             ),
-             (
-                 f"CAST( {offset} "
-                 f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
-                 "AS cytotable_meta_offset"
-             ),
-             (
-                 f"CAST( (ROW_NUMBER() OVER ()) AS "
-                 f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
-                 "AS cytotable_meta_rownum"
-             ),
-         ]
-
      # perform the select using the cases built above and the pagination range
-     sql_stmt = (
-         f"""
+     sql_stmt = f"""
          SELECT
          {', '.join(query_parts)}
          FROM {table_name}
-         ORDER BY {', '.join([col['column_name'] for col in column_info])}
-         LIMIT {chunk_size} OFFSET {offset};
+         WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+         {"ORDER BY " + page_key if sort_output else ""};
          """
-         if sort_output
-         else f"""
-         SELECT
-         {', '.join(query_parts)}
-         FROM {table_name}
-         LIMIT {chunk_size} OFFSET {offset};
-         """
-     )

      # execute the sql stmt
      cursor.execute(sql_stmt)
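
As with the DuckDB path, the mixed-type SQLite query now pages by BETWEEN and only orders when requested. Rendering the statement for both sort settings, with hypothetical query parts, shows the two shapes it can take:

    table_name = "Image"
    query_parts = ["CAST(ImageNumber AS INTEGER) AS ImageNumber"]
    page_key, pageset = "ImageNumber", (1, 50)

    for sort_output in (True, False):
        sql_stmt = f"""
            SELECT
            {', '.join(query_parts)}
            FROM {table_name}
            WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
            {"ORDER BY " + page_key if sort_output else ""};
            """
        print(sql_stmt)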
@@ -600,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> An
          if isinstance(sources, dict)
          else _unwrap_value(sources)
      )
+
+
+ def _generate_pagesets(
+     keys: List[Union[int, float]], chunk_size: int
+ ) -> List[Tuple[Union[int, float], Union[int, float]]]:
+     """
+     Generate a pageset (keyset pagination) from a list of keys.
+
+     Parameters:
+         keys List[Union[int, float]]:
+             List of keys to paginate.
+         chunk_size int:
+             Size of each chunk/page.
+
+     Returns:
+         List[Tuple[Union[int, float], Union[int, float]]]:
+             List of (start_key, end_key) tuples representing each page.
+     """
+
+     # Initialize an empty list to store the chunks/pages
+     chunks = []
+
+     # Start index for iteration through the keys
+     i = 0
+
+     while i < len(keys):
+         # Get the start key for the current chunk
+         start_key = keys[i]
+
+         # Calculate the end index for the current chunk
+         end_index = min(i + chunk_size, len(keys)) - 1
+
+         # Get the end key for the current chunk
+         end_key = keys[end_index]
+
+         # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+         while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+             end_index += 1
+
+         # Append the current chunk (start_key, end_key) to the list of chunks
+         chunks.append((start_key, end_key))
+
+         # Update the index to start from the next chunk
+         i = end_index + 1
+
+     # Return the list of chunks/pages
+     return chunks
+
+
+ def _natural_sort(list_to_sort):
+     """
+     Sorts the given iterable using natural sort adapted from approach
+     provided by the following link:
+     https://stackoverflow.com/a/4836734
+
+     Args:
+         list_to_sort: List:
+             The list to sort.
+
+     Returns:
+         List: The sorted list.
+     """
+     import re
+
+     return sorted(
+         list_to_sort,
+         # use a custom key to sort the list
+         key=lambda key: [
+             # use integer of c if it's a digit, otherwise str
+             int(c) if c.isdigit() else c
+             # Split the key into parts, separating numbers from alphabetic characters
+             for c in re.split("([0-9]+)", str(key))
+         ],
+     )
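
`_generate_pagesets` deserves a quick illustration, since the inner while loop is what keeps duplicate keys from straddling two pages (the BETWEEN queries are inclusive on both ends). Assuming the function as defined above, with made-up keys:

    keys = [1, 2, 3, 3, 3, 4, 5, 6]

    # chunk_size=2: the page that ends on key 3 absorbs every trailing
    # duplicate of 3, so no key value appears in two pagesets
    print(_generate_pagesets(keys, chunk_size=2))
    # [(1, 2), (3, 3), (4, 5), (6, 6)]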
cytotable-0.0.10.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.9
+ Version: 0.0.10
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
cytotable-0.0.10.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
+ cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+ cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+ cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+ cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
+ cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
+ cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ cytotable-0.0.10.dist-info/RECORD,,
cytotable-0.0.9.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
- cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
- cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
- cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
- cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
- cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
- cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.9.dist-info/RECORD,,