CytoTable 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
  """

  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
- __version__ = "0.0.9"
+ __version__ = "0.0.11"

  from .convert import convert
  from .exceptions import (
cytotable/constants.py CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
  ],
  }

- # metadata column names and types for internal use within CytoTable
- CYOTABLE_META_COLUMN_TYPES = {
- "cytotable_meta_source_path": "VARCHAR",
- "cytotable_meta_offset": "BIGINT",
- "cytotable_meta_rownum": "BIGINT",
- }
-
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
  "data-producer": "https://github.com/cytomining/CytoTable",
  "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.

  import itertools
  import logging
- import uuid
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast

  import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(

  Args:
  source: Dict[str, Any]
- Contains the source data to be chunked. Represents a single
+ Contains source data details. Represents a single
  file or table of some kind.
  sort_output:
  Specifies whether to sort cytotable output or not.
@@ -43,10 +42,7 @@ def _get_table_columns_and_types(
  list of dictionaries which each include column level information
  """

- import pathlib
-
  import duckdb
- from cloudpathlib import AnyPath

  from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet

@@ -89,7 +85,7 @@ def _get_table_columns_and_types(
  # with exception handling to read mixed-type data
  # using sqlite3 and special utility function
  try:
- # isolate using new connection to read data with chunk size + offset
+ # isolate using new connection to read data based on pageset
  # and export directly to parquet via duckdb (avoiding need to return data to python)
  # perform the query and create a list of dictionaries with the column data for table
  with _duckdb_reader() as ddb_reader:
@@ -109,13 +105,8 @@ def _get_table_columns_and_types(
  arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
  source_path=str(source["source_path"]),
  table_name=str(source["table_name"]),
- # chunk size is set to 5 as a limit similar
- # to above SQL within select_query variable
- chunk_size=5,
- # offset is set to 0 start at first row
- # result from table
- offset=0,
- add_cytotable_meta=False,
+ page_key=source["page_key"],
+ pageset=source["pagesets"][0],
  sort_output=sort_output,
  )
  with _duckdb_reader() as ddb_reader:
@@ -183,13 +174,114 @@ def _prep_cast_column_data_types(


  @python_app
- def _get_table_chunk_offsets(
+ def _set_tablenumber(
+ sources: Dict[str, List[Dict[str, Any]]],
+ add_tablenumber: Optional[bool] = None,
+ ) -> Dict[str, List[Dict[str, Any]]]:
+ """
+ Gathers a "TableNumber" from the image table (if CSV) or
+ SQLite file (if SQLite source) which is a unique identifier
+ intended to help differentiate between imagenumbers
+ to create distinct records for single-cell profiles
+ referenced across multiple source data exports.
+ For example, ImageNumber column values from CellProfiler
+ will repeat across exports, meaning we may lose distinction
+ when combining multiple export files together through CytoTable.
+
+ Note:
+ - If using CSV data sources, the image.csv table is used for checksum.
+ - If using SQLite data sources, the entire SQLite database is used for checksum.
+
+ Args:
+ sources: Dict[str, List[Dict[str, Any]]]
+ Contains metadata about data tables and related contents.
+ add_tablenumber: Optional[bool]
+ Whether to add a calculated tablenumber.
+ Note: when False, adds None as the tablenumber
+
+ Returns:
+ List[Dict[str, Any]]
+ New source group with added TableNumber details.
+ """
+
+ from cloudpathlib import AnyPath
+
+ from cytotable.utils import _gather_tablenumber_checksum
+
+ image_table_groups = {
+ # create a data structure with the common parent for each dataset
+ # and the calculated checksum from the image table.
+ # note: the source_path parent is used for non-SQLite files
+ # whereas the direct source path is used for SQLite files.
+ (
+ str(source["source_path"].parent)
+ if source["source_path"].suffix != "sqlite"
+ else source["source_path"]
+ ): source["source_path"]
+ for source_group_name, source_group_vals in sources.items()
+ # use the image tables references only for the basis of the
+ # these calculations.
+ if any(
+ value in str(AnyPath(source_group_name).stem).lower()
+ for value in ["image", "per_image"]
+ )
+ for source in source_group_vals
+ }
+
+ # determine if we need to add tablenumber data
+ if (
+ # case for detecting multiple image tables which need to be differentiated
+ add_tablenumber is None
+ and (len(image_table_groups) <= 1)
+ ) or (
+ # case for explicitly set no tablenumbers
+ add_tablenumber
+ is False
+ ):
+ return {
+ source_group_name: [
+ dict(
+ source,
+ **{
+ "tablenumber": None,
+ },
+ )
+ for source in source_group_vals
+ ]
+ for source_group_name, source_group_vals in sources.items()
+ }
+
+ # gather the image table from the source_group
+ tablenumber_table = {
+ # create a data structure with the common parent for each dataset
+ # and the calculated checksum from the image table
+ group: _gather_tablenumber_checksum(path)
+ for group, path in image_table_groups.items()
+ }
+
+ # return a modified sources data structure with the tablenumber added
+ return {
+ source_group_name: [
+ dict(
+ source,
+ **{"tablenumber": tablenumber_table[str(source["source_path"].parent)]},
+ )
+ for source in source_group_vals
+ if str(source["source_path"].parent) in list(tablenumber_table.keys())
+ ]
+ for source_group_name, source_group_vals in sources.items()
+ }
+
+
+ @python_app
+ def _get_table_keyset_pagination_sets(
  chunk_size: int,
+ page_key: str,
  source: Optional[Dict[str, Any]] = None,
  sql_stmt: Optional[str] = None,
- ) -> Union[List[int], None]:
+ ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
  """
- Get table data chunk offsets for later use in capturing segments
+ Get table data chunk keys for later use in capturing segments
  of values. This work also provides a chance to catch problematic
  input data which will be ignored with warnings.
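For illustration, a minimal sketch of what the tablenumber assignment described above adds to each source entry; the path and checksum value below are hypothetical, not taken from the package:

# Hypothetical sketch of how one source entry changes after _set_tablenumber;
# the path and checksum value are illustrative only.
source_before = {"source_path": "plate_a/Cells.csv", "table_name": "cells"}

# add_tablenumber=False (or only a single image table detected): None is recorded
source_without = dict(source_before, tablenumber=None)

# otherwise the value is a CRC32-style checksum of the dataset's image table
# (see _gather_tablenumber_checksum in cytotable/utils.py later in this diff)
source_with = dict(source_before, tablenumber=2628926218)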

@@ -199,21 +291,27 @@ def _get_table_chunk_offsets(
  file or table of some kind.
  chunk_size: int
  The size in rowcount of the chunks to create.
+ page_key: str
+ The column name to be used to identify pagination chunks.
+ Expected to be of numeric type (int, float) for ordering.
+ sql_stmt:
+ Optional sql statement to form the pagination set from.
+ Default behavior extracts pagination sets from the full
+ data source.

  Returns:
- List[int]
- List of integers which represent offsets to use for reading
- the data later on.
+ List[Any]
+ List of keys to use for reading the data later on.
  """

  import logging
- import pathlib
+ import sqlite3
+ from contextlib import closing

  import duckdb
- from cloudpathlib import AnyPath, CloudPath

  from cytotable.exceptions import NoInputDataException
- from cytotable.utils import _duckdb_reader
+ from cytotable.utils import _duckdb_reader, _generate_pagesets

  logger = logging.getLogger(__name__)

@@ -223,18 +321,29 @@ def _get_table_chunk_offsets(
  source_type = str(source_path.suffix).lower()

  try:
- # gather the total rowcount from csv or sqlite data input sources
  with _duckdb_reader() as ddb_reader:
- rowcount = int(
- ddb_reader.execute(
- # nosec
- f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
- if source_type == ".csv"
- else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
- ).fetchone()[0]
- )
+ if source_type == ".csv":
+ sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+ else:
+ sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+ page_keys = [
+ results[0] for results in ddb_reader.execute(sql_query).fetchall()
+ ]
+
+ # exception case for when we have mixed types
+ # (i.e. integer col with string and ints) in a sqlite column
+ except duckdb.TypeMismatchException:
+ with closing(sqlite3.connect(source_path)) as cx:
+ with cx:
+ page_keys = [
+ key[0]
+ for key in cx.execute(
+ f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+ ).fetchall()
+ if isinstance(key[0], (int, float))
+ ]

- # catch input errors which will result in skipped files
  except (
  duckdb.InvalidInputException,
  NoInputDataException,
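A minimal sketch of the keyset approach used above, assuming a local DuckDB install and a hypothetical CSV named Cells.csv with an ObjectNumber column; the page key column is read in sorted order and later split into pagesets:

# Sketch of gathering ordered page keys with DuckDB (file name is hypothetical).
import duckdb

from cytotable.utils import _generate_pagesets

page_key = "ObjectNumber"
ddb = duckdb.connect()
page_keys = [
    row[0]
    for row in ddb.execute(
        f"SELECT {page_key} FROM read_csv_auto('Cells.csv', header=TRUE, delim=',') "
        f"ORDER BY {page_key}"
    ).fetchall()
]
ddb.close()

# _generate_pagesets (defined in cytotable/utils.py later in this diff) turns
# the ordered keys into inclusive (start, end) ranges of roughly chunk_size rows.
pagesets = _generate_pagesets(page_keys, chunk_size=50000)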
@@ -245,34 +354,20 @@ def _get_table_chunk_offsets(

  return None

- # find chunk offsets from sql statement
  elif sql_stmt is not None:
- # gather the total rowcount from csv or sqlite data input sources
  with _duckdb_reader() as ddb_reader:
- rowcount = int(
- ddb_reader.execute(
- # nosec
- f"SELECT COUNT(*) FROM ({sql_stmt})"
- ).fetchone()[0]
- )
+ sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+ page_keys = ddb_reader.execute(sql_query).fetchall()
+ page_keys = [key[0] for key in page_keys]

- return list(
- range(
- 0,
- # gather rowcount from table and use as maximum for range
- rowcount,
- # step through using chunk size
- chunk_size,
- )
- )
+ return _generate_pagesets(page_keys, chunk_size)


  @python_app
- def _source_chunk_to_parquet(
+ def _source_pageset_to_parquet(
  source_group_name: str,
  source: Dict[str, Any],
- chunk_size: int,
- offset: int,
+ pageset: Tuple[Union[int, float], Union[int, float]],
  dest_path: str,
  sort_output: bool,
  ) -> str:
@@ -285,10 +380,8 @@ def _source_chunk_to_parquet(
  source: Dict[str, Any]
  Contains the source data to be chunked. Represents a single
  file or table of some kind along with collected information about table.
- chunk_size: int
- Row count to use for chunked output.
- offset: int
- The offset for chunking the data from source.
+ pageset: Tuple[int, int]
+ The pageset for chunking the data from source.
  dest_path: str
  Path to store the output data.
  sort_output: bool
@@ -303,9 +396,7 @@ def _source_chunk_to_parquet(

  import duckdb
  from cloudpathlib import AnyPath
- from pyarrow import parquet

- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
  from cytotable.utils import (
  _duckdb_reader,
  _sqlite_mixed_type_query_to_parquet,
@@ -319,26 +410,18 @@ def _source_chunk_to_parquet(
  )
  pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)

- source_path_str = (
- source["source_path"]
- if "table_name" not in source.keys()
- else f"{source['source_path']}_table_{source['table_name']}"
+ # build tablenumber segment addition (if necessary)
+ tablenumber_sql = (
+ # to become tablenumber in sql select later with bigint (8-byte integer)
+ # we cast here to bigint to avoid concat or join conflicts later due to
+ # misaligned automatic data typing.
+ f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, "
+ if source["tablenumber"] is not None
+ # don't introduce the column if we aren't supposed to add tablenumber
+ # as per parameter.
+ else ""
  )
- # build the column selection block of query

- # add cytotable metadata columns
- cytotable_metadata_cols = [
- (
- f"CAST( '{source_path_str}' "
- f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
- ' AS "cytotable_meta_source_path"'
- ),
- f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
- (
- f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
- ' AS "cytotable_meta_rownum"'
- ),
- ]
  # add source table columns
  casted_source_cols = [
  # here we cast the column to the specified type ensure the colname remains the same
@@ -346,10 +429,10 @@ def _source_chunk_to_parquet(
  for column in source["columns"]
  ]

- # create selection statement from lists above
- select_columns = ",".join(
+ # create selection statement from tablenumber_sql + lists above
+ select_columns = tablenumber_sql + ",".join(
  # if we should sort the output, add the metadata_cols
- cytotable_metadata_cols + casted_source_cols
+ casted_source_cols
  if sort_output
  else casted_source_cols
  )
@@ -364,7 +447,8 @@ def _source_chunk_to_parquet(
  base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"

- result_filepath = f"{result_filepath_base}-{offset}.parquet"
+ # form a filepath which indicates the pageset
+ result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"

  # Attempt to read the data to parquet file
  # using duckdb for extraction and pyarrow for
@@ -377,14 +461,9 @@ def _source_chunk_to_parquet(
  table=ddb_reader.execute(
  f"""
  {base_query}
- /* order by all columns for deterministic output */
- ORDER BY ALL
- LIMIT {chunk_size} OFFSET {offset}
- """
- if sort_output
- else f"""
- {base_query}
- LIMIT {chunk_size} OFFSET {offset}
+ WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+ /* optional ordering per pageset */
+ {"ORDER BY " + source['page_key'] if sort_output else ""};
  """
  ).arrow(),
  where=result_filepath,
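Roughly, the per-pageset extraction assembled above reduces to a query of the following shape; the table name, page key, and range below are hypothetical placeholders, not values produced by the package:

# Hypothetical shape of one pageset extraction query and its output filename.
page_key, pageset = "ObjectNumber", (1, 50)
base_query = "SELECT ... FROM sqlite_scan('plate_a.sqlite', 'Per_Cells')"
sql = f"""
{base_query}
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
ORDER BY {page_key};
"""
# the resulting chunk would be written to e.g. "plate_a.Per_Cells-1-50.parquet"
print(sql)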
@@ -406,10 +485,10 @@ def _source_chunk_to_parquet(
  table=_sqlite_mixed_type_query_to_parquet(
  source_path=str(source["source_path"]),
  table_name=str(source["table_name"]),
- chunk_size=chunk_size,
- offset=offset,
- add_cytotable_meta=True if sort_output else False,
+ page_key=source["page_key"],
+ pageset=pageset,
  sort_output=sort_output,
+ tablenumber=source["tablenumber"],
  ),
  where=result_filepath,
  )
@@ -458,10 +537,7 @@ def _prepend_column_name(

  import pyarrow.parquet as parquet

- from cytotable.constants import (
- CYOTABLE_META_COLUMN_TYPES,
- CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
- )
+ from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
  from cytotable.utils import _write_parquet_table_with_metadata

  logger = logging.getLogger(__name__)
@@ -472,7 +548,7 @@ def _prepend_column_name(
  if len(targets) == 0:
  logger.warning(
  msg=(
- "Skipping column name prepend operations"
+ "Skipping column name prepend operations "
  "because no compartments or metadata were provided."
  )
  )
@@ -509,10 +585,8 @@ def _prepend_column_name(
  # source_group_name_stem: 'Cells'
  # column_name: 'AreaShape_Area'
  # updated_column_name: 'Cells_AreaShape_Area'
- if (
- column_name not in identifying_columns
- and not column_name.startswith(source_group_name_stem.capitalize())
- and column_name not in CYOTABLE_META_COLUMN_TYPES
+ if column_name not in identifying_columns and not column_name.startswith(
+ source_group_name_stem.capitalize()
  ):
  updated_column_names.append(f"{source_group_name_stem}_{column_name}")
  # if-condition for prepending 'Metadata_' to column name
@@ -574,6 +648,7 @@ def _concat_source_group(
  source_group: List[Dict[str, Any]],
  dest_path: str,
  common_schema: Optional[List[Tuple[str, str]]] = None,
+ sort_output: bool = True,
  ) -> List[Dict[str, Any]]:
  """
  Concatenate group of source data together as single file.
@@ -620,6 +695,8 @@ def _concat_source_group(
  common_schema: List[Tuple[str, str]] (Default value = None)
  Common schema to use for concatenation amongst arrow tables
  which may have slightly different but compatible schema.
+ sort_output: bool
+ Specifies whether to sort cytotable output or not.

  Returns:
  List[Dict[str, Any]]
@@ -637,7 +714,7 @@ def _concat_source_group(
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
  )
  from cytotable.exceptions import SchemaException
- from cytotable.utils import _write_parquet_table_with_metadata
+ from cytotable.utils import _natural_sort

  # build a result placeholder
  concatted: List[Dict[str, Any]] = [
@@ -676,7 +753,10 @@ def _concat_source_group(
  # (all must be the same schema)
  with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
  for source in source_group:
- for table in [table for table in source["table"]]:
+ tables = [table for table in source["table"]]
+ if sort_output:
+ tables = _natural_sort(tables)
+ for table in tables:
  # if we haven't inferred the common schema
  # check that our file matches the expected schema, otherwise raise an error
  if common_schema is None and not writer_schema.equals(
@@ -720,7 +800,6 @@
  def _prepare_join_sql(
  sources: Dict[str, List[Dict[str, Any]]],
  joins: str,
- sort_output: bool,
  ) -> str:
  """
  Prepare join SQL statement with actual locations of data based on the sources.
@@ -741,8 +820,6 @@ def _prepare_join_sql(
  """
  import pathlib

- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
  # replace with real location of sources for join sql
  order_by_tables = []
  for key, val in sources.items():
@@ -754,25 +831,17 @@ def _prepare_join_sql(
  )
  order_by_tables.append(table_name)

- # create order by statement with from all tables using cytotable metadata
- order_by_sql = "ORDER BY " + ", ".join(
- [
- f"{table}.{meta_column}"
- for table in order_by_tables
- for meta_column in CYOTABLE_META_COLUMN_TYPES
- ]
- )
-
  # add the order by statements to the join
- return joins + order_by_sql if sort_output else joins
+ return joins


  @python_app
- def _join_source_chunk(
+ def _join_source_pageset(
  dest_path: str,
  joins: str,
- chunk_size: int,
- offset: int,
+ page_key: str,
+ pageset: Tuple[int, int],
+ sort_output: bool,
  drop_null: bool,
  ) -> str:
  """
@@ -798,31 +867,20 @@ def _join_source_chunk(

  import pathlib

- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
  from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata

- # Attempt to read the data to parquet file
- # using duckdb for extraction and pyarrow for
- # writing data to a parquet file.
- # read data with chunk size + offset
- # and export to parquet
- exclude_meta_cols = [
- f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
- ]
-
  with _duckdb_reader() as ddb_reader:
  result = ddb_reader.execute(
  f"""
- WITH joined AS (
+ WITH joined AS (
  {joins}
- LIMIT {chunk_size} OFFSET {offset}
- )
- SELECT
- /* exclude metadata columns from the results
- by using a lambda on column names based on exclude_meta_cols. */
- COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
- FROM joined;
- """
+ )
+ SELECT *
+ FROM joined
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+ /* optional sorting per pagset */
+ {"ORDER BY " + page_key if sort_output else ""};
+ """
  ).arrow()

  # drop nulls if specified
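The join side follows the same keyset pattern; a sketch of the query shape (the join SQL string and the 'join' page key below are placeholders, not the actual preset SQL):

# Hypothetical shape of one paginated join query built above.
joins = "SELECT * FROM read_parquet('cytoplasm.parquet') /* placeholder join SQL */"
page_key, pageset = "Cytoplasm_Number_Object_Number", (1, 1000)
sql = f"""
WITH joined AS (
    {joins}
)
SELECT *
FROM joined
WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
ORDER BY {page_key};
"""
print(sql)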
@@ -847,10 +905,8 @@ def _join_source_chunk(
  f"{str(pathlib.Path(dest_path).parent)}/"
  # use the dest_path stem in the name
  f"{str(pathlib.Path(dest_path).stem)}-"
- # give the join chunk result a unique to arbitrarily
- # differentiate from other chunk groups which are mapped
- # and before they are brought together as one dataset
- f"{str(uuid.uuid4().hex)}.parquet"
+ # add the pageset indication to the filename
+ f"{pageset[0]}-{pageset[1]}.parquet"
  )

  # write the result
@@ -867,6 +923,7 @@ def _concat_join_sources(
  sources: Dict[str, List[Dict[str, Any]]],
  dest_path: str,
  join_sources: List[str],
+ sort_output: bool = True,
  ) -> str:
  """
  Concatenate join sources from parquet-based chunks.
@@ -883,6 +940,8 @@ def _concat_join_sources(
  join_sources: List[str]:
  List of local filepath destination for join source chunks
  which will be concatenated.
+ sort_output: bool
+ Specifies whether to sort cytotable output or not.

  Returns:
  str
@@ -898,7 +957,7 @@ def _concat_join_sources(
  CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
  )
- from cytotable.utils import _write_parquet_table_with_metadata
+ from cytotable.utils import _natural_sort

  # remove the unjoined concatted compartments to prepare final dest_path usage
  # (we now have joined results)
@@ -918,7 +977,11 @@ def _concat_join_sources(
  CYTOTABLE_DEFAULT_PARQUET_METADATA
  )
  with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
- for table_path in join_sources:
+ for table_path in (
+ join_sources
+ if not sort_output
+ else _natural_sort(list_to_sort=join_sources)
+ ):
  writer.write_table(
  parquet.read_table(
  table_path,
@@ -1042,9 +1105,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  infer_common_schema: bool,
  drop_null: bool,
  sort_output: bool,
+ page_keys: Dict[str, str],
  data_type_cast_map: Optional[Dict[str, str]] = None,
+ add_tablenumber: Optional[bool] = None,
  **kwargs,
- ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+ ) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
  """
  Export data to parquet.

@@ -1082,6 +1147,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  Whether to drop null results.
  sort_output: bool
  Specifies whether to sort cytotable output or not.
+ page_keys: Dict[str, str]
+ A dictionary which defines which column names are used for keyset pagination
+ in order to perform data extraction.
  data_type_cast_map: Dict[str, str]
  A dictionary mapping data type groups to specific types.
  Roughly includes Arrow data types language from:
@@ -1112,16 +1180,35 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  # expand the destination path
  expanded_dest_path = _expand_path(path=dest_path)

- # prepare offsets for chunked data export from source tables
- offsets_prepared = {
+ # check that each source group name has a pagination key
+ for source_group_name in sources.keys():
+ matching_keys = [
+ key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+ ]
+ if not matching_keys:
+ raise CytoTableException(
+ f"No matching key found in page_keys for source_group_name: {source_group_name}."
+ "Please include a pagination key based on a column name from the table."
+ )
+
+ # prepare pagesets for chunked data export from source tables
+ pagesets_prepared = {
  source_group_name: [
  dict(
  source,
  **{
- "offsets": _get_table_chunk_offsets(
+ "page_key": (
+ page_key := [
+ value
+ for key, value in page_keys.items()
+ if key.lower() in source_group_name.lower()
+ ][0]
+ ),
+ "pagesets": _get_table_keyset_pagination_sets(
  source=source,
  chunk_size=chunk_size,
- )
+ page_key=page_key,
+ ),
  },
  )
  for source in source_group_vals
@@ -1129,17 +1216,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  for source_group_name, source_group_vals in sources.items()
  }

- # if offsets is none and we haven't halted, remove the file as there
+ # if pagesets is none and we haven't halted, remove the file as there
  # were input formatting errors which will create challenges downstream
  invalid_files_dropped = {
  source_group_name: [
- # ensure we have offsets
+ # ensure we have pagesets
  source
  for source in source_group_vals
- if source["offsets"] is not None
+ if source["pagesets"] is not None
  ]
  for source_group_name, source_group_vals in evaluate_futures(
- offsets_prepared
+ pagesets_prepared
  ).items()
  # ensure we have source_groups with at least one source table
  if len(source_group_vals) > 0
@@ -1164,6 +1251,12 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  for source_group_name, source_group_vals in invalid_files_dropped.items()
  }

+ # add tablenumber details, appending None if not add_tablenumber
+ tablenumber_prepared = _set_tablenumber(
+ sources=evaluate_futures(column_names_and_types_gathered),
+ add_tablenumber=add_tablenumber,
+ ).result()
+
  results = {
  source_group_name: [
  dict(
@@ -1172,12 +1265,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  "table": [
  # perform column renaming and create potential return result
  _prepend_column_name(
- # perform chunked data export to parquet using offsets
- table_path=_source_chunk_to_parquet(
+ # perform chunked data export to parquet using pagesets
+ table_path=_source_pageset_to_parquet(
  source_group_name=source_group_name,
  source=source,
- chunk_size=chunk_size,
- offset=offset,
+ pageset=pageset,
  dest_path=expanded_dest_path,
  sort_output=sort_output,
  ),
@@ -1186,14 +1278,14 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  metadata=metadata,
  compartments=compartments,
  )
- for offset in source["offsets"]
+ for pageset in source["pagesets"]
  ]
  },
  )
  for source in source_group_vals
  ]
  for source_group_name, source_group_vals in evaluate_futures(
- column_names_and_types_gathered
+ tablenumber_prepared
  ).items()
  }

@@ -1227,6 +1319,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  source_group=source_group_vals[0]["sources"],
  dest_path=expanded_dest_path,
  common_schema=source_group_vals[0]["common_schema"],
+ sort_output=sort_output,
  )
  for source_group_name, source_group_vals in evaluate_futures(
  common_schema_determined
@@ -1240,39 +1333,50 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
  evaluated_results = evaluate_futures(results)

  prepared_joins_sql = _prepare_join_sql(
- sources=evaluated_results, joins=joins, sort_output=sort_output
+ sources=evaluated_results, joins=joins
  ).result()

+ page_key_join = [
+ value for key, value in page_keys.items() if key.lower() == "join"
+ ][0]
+
  # map joined results based on the join groups gathered above
  # note: after mapping we end up with a list of strings (task returns str)
  join_sources_result = [
- _join_source_chunk(
+ _join_source_pageset(
  # gather the result of concatted sources prior to
  # join group merging as each mapped task run will need
  # full concat results
  dest_path=expanded_dest_path,
  joins=prepared_joins_sql,
- chunk_size=chunk_size,
- offset=offset,
+ page_key=page_key_join,
+ pageset=pageset,
+ sort_output=sort_output,
  drop_null=drop_null,
  )
  # create join group for querying the concatenated
  # data in order to perform memory-safe joining
  # per user chunk size specification.
- for offset in _get_table_chunk_offsets(
+ for pageset in _get_table_keyset_pagination_sets(
  sql_stmt=prepared_joins_sql,
  chunk_size=chunk_size,
+ page_key=page_key_join,
  ).result()
  ]

- # concat our join chunks together as one cohesive dataset
- # return results in common format which includes metadata
- # for lineage and debugging
- results = _concat_join_sources(
- dest_path=expanded_dest_path,
- join_sources=[join.result() for join in join_sources_result],
- sources=evaluated_results,
- )
+ if concat:
+ # concat our join chunks together as one cohesive dataset
+ # return results in common format which includes metadata
+ # for lineage and debugging
+ results = _concat_join_sources(
+ dest_path=expanded_dest_path,
+ join_sources=[join.result() for join in join_sources_result],
+ sources=evaluated_results,
+ sort_output=sort_output,
+ )
+ else:
+ # else we leave the joined chunks as-is and return them
+ return evaluate_futures(join_sources_result)

  # wrap the final result as a future and return
  return evaluate_futures(results)
@@ -1293,11 +1397,13 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  infer_common_schema: bool = True,
  drop_null: bool = False,
  data_type_cast_map: Optional[Dict[str, str]] = None,
+ add_tablenumber: Optional[bool] = None,
+ page_keys: Optional[Dict[str, str]] = None,
  sort_output: bool = True,
  preset: Optional[str] = "cellprofiler_csv",
  parsl_config: Optional[parsl.Config] = None,
  **kwargs,
- ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
+ ) -> Union[Dict[str, List[Dict[str, Any]]], List[Any], str]:
  """
  Convert file-based data from various sources to Pycytominer-compatible standards.

@@ -1341,6 +1447,17 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  A dictionary mapping data type groups to specific types.
  Roughly includes Arrow data types language from:
  https://arrow.apache.org/docs/python/api/datatypes.html
+ add_tablenumber: Optional[bool]
+ Whether to add a calculated tablenumber which helps differentiate
+ various repeated values (such as ObjectNumber) within source data.
+ Useful for processing multiple SQLite or CSV data sources together
+ to retain distinction from each dataset.
+ page_keys: str:
+ The table and column names to be used for key pagination.
+ Uses the form: {"table_name":"column_name"}.
+ Expects columns to include numeric data (ints or floats).
+ Interacts with the `chunk_size` parameter to form
+ pages of `chunk_size`.
  sort_output: bool (Default value = True)
  Specifies whether to sort cytotable output or not.
  drop_null: bool (Default value = False)
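A hedged usage sketch for the two parameters documented above; the file paths and chunk size are hypothetical, and page_keys may be omitted entirely when the preset's CONFIG_PAGE_KEYS defaults are sufficient:

# Hypothetical convert() call exercising add_tablenumber and page_keys.
import cytotable

result = cytotable.convert(
    source_path="./plate_a/",
    dest_path="./plate_a.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_csv",
    chunk_size=1000,
    add_tablenumber=True,
    page_keys={
        "image": "ImageNumber",
        "cells": "ObjectNumber",
        "nuclei": "ObjectNumber",
        "cytoplasm": "ObjectNumber",
        "join": "Cytoplasm_Number_Object_Number",
    },
)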
@@ -1440,6 +1557,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  if chunk_size is None
  else chunk_size
  )
+ page_keys = (
+ cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+ if page_keys is None
+ else page_keys
+ )
+
+ # Raise an exception for scenarios where one configures CytoTable to join
+ # but does not provide a pagination key for the joins.
+ if join and (page_keys is None or "join" not in page_keys.keys()):
+ raise CytoTableException(
+ (
+ "When using join=True one must pass a 'join' pagination key "
+ "in the page_keys parameter. The 'join' pagination key is a column "
+ "name found within the joined results based on the SQL provided from "
+ "the joins parameter. This special key is required as not all columns "
+ "from the source tables might not be included."
+ )
+ )

  # send sources to be written to parquet if selected
  if dest_datatype == "parquet":
@@ -1457,7 +1592,9 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
  infer_common_schema=infer_common_schema,
  drop_null=drop_null,
  data_type_cast_map=data_type_cast_map,
+ add_tablenumber=add_tablenumber,
  sort_output=sort_output,
+ page_keys=cast(dict, page_keys),
  **kwargs,
  )

cytotable/presets.py CHANGED
@@ -22,6 +22,16 @@ config = {
  "Parent_Cells",
  "Parent_Nuclei",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "image": "ImageNumber",
+ "cells": "ObjectNumber",
+ "nuclei": "ObjectNumber",
+ "cytoplasm": "ObjectNumber",
+ "join": "Cytoplasm_Number_Object_Number",
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
@@ -61,6 +71,16 @@ config = {
  "Parent_Cells",
  "Parent_Nuclei",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "image": "ImageNumber",
+ "cells": "Cells_Number_Object_Number",
+ "nuclei": "Nuclei_Number_Object_Number",
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
+ "join": "Cytoplasm_Number_Object_Number",
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
@@ -104,6 +124,16 @@ config = {
  "Parent_Cells",
  "Parent_Nuclei",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "image": "ImageNumber",
+ "cells": "ObjectNumber",
+ "nuclei": "ObjectNumber",
+ "cytoplasm": "ObjectNumber",
+ "join": "Cytoplasm_Number_Object_Number",
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
@@ -155,6 +185,16 @@ config = {
  "Cells_Number_Object_Number",
  "Nuclei_Number_Object_Number",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "image": "ImageNumber",
+ "cells": "Cells_Number_Object_Number",
+ "nuclei": "Nuclei_Number_Object_Number",
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
+ "join": "Cytoplasm_Number_Object_Number",
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
@@ -203,6 +243,16 @@ config = {
  "Cells_ObjectNumber",
  "Nuclei_ObjectNumber",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "image": "ImageNumber",
+ "cells": "ObjectNumber",
+ "nuclei": "ObjectNumber",
+ "cytoplasm": "ObjectNumber",
+ "join": "Cytoplasm_Number_Object_Number",
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
@@ -248,6 +298,12 @@ config = {
  "Z",
  "T",
  ),
+ # pagination keys for use with this data
+ # of the rough format "table" -> "column".
+ # note: page keys are expected to be numeric (int, float)
+ "CONFIG_PAGE_KEYS": {
+ "test": '"OBJECT ID"',
+ },
  # chunk size to use for join operations to help with possible performance issues
  # note: this number is an estimate and is may need changes contingent on data
  # and system used by this library.
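As shown in convert.py above, each CONFIG_PAGE_KEYS entry is matched to a source group (table or file) name by a case-insensitive substring test; a minimal sketch of that matching, with illustrative names:

# Sketch of how a page_keys entry is matched to a source group name.
page_keys = {"image": "ImageNumber", "cells": "ObjectNumber"}
source_group_name = "Per_Cells.sqlite"

matching = [
    column
    for table, column in page_keys.items()
    if table.lower() in source_group_name.lower()
]
print(matching)  # ['ObjectNumber']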
cytotable/utils.py CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
  import logging
  import os
  import pathlib
- from typing import Any, Dict, List, Optional, Union, cast
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast

  import duckdb
  import parsl
@@ -166,6 +166,12 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
  https://duckdb.org/docs/sql/configuration#configuration-reference
  */
  PRAGMA preserve_insertion_order=FALSE;
+
+ /*
+ Disable progress bar from displaying (defaults to TRUE)
+ See earlier documentation references above for more information.
+ */
+ SET enable_progress_bar=FALSE;
  """,
  )

@@ -173,10 +179,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
  def _sqlite_mixed_type_query_to_parquet(
  source_path: str,
  table_name: str,
- chunk_size: int,
- offset: int,
+ page_key: str,
+ pageset: Tuple[Union[int, float], Union[int, float]],
  sort_output: bool,
- add_cytotable_meta: bool = False,
+ tablenumber: Optional[int] = None,
  ) -> str:
  """
  Performs SQLite table data extraction where one or many
@@ -188,14 +194,17 @@ def _sqlite_mixed_type_query_to_parquet(
  A str which is a path to a SQLite database file.
  table_name: str:
  The name of the table being queried.
- chunk_size: int:
- Row count to use for chunked output.
- offset: int:
- The offset for chunking the data from source.
+ page_key: str:
+ The column name to be used to identify pagination chunks.
+ pageset: Tuple[int, int]:
+ The range for values used for paginating data from source.
  sort_output: bool
  Specifies whether to sort cytotable output or not.
  add_cytotable_meta: bool, default=False:
  Whether to add CytoTable metadata fields or not
+ tablenumber: Optional[int], default=None:
+ An optional table number to append to the results.
+ Defaults to None.

  Returns:
  pyarrow.Table:
@@ -205,10 +214,7 @@ def _sqlite_mixed_type_query_to_parquet(

  import pyarrow as pa

- from cytotable.constants import (
- CYOTABLE_META_COLUMN_TYPES,
- SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
- )
+ from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
  from cytotable.exceptions import DatatypeException

  # open sqlite3 connection
@@ -254,9 +260,19 @@ def _sqlite_mixed_type_query_to_parquet(
  # return the translated type for use in SQLite
  return translated_type[0]

+ # build tablenumber segment addition (if necessary)
+ tablenumber_sql = (
+ # to become tablenumber in sql select later with integer
+ f"CAST({tablenumber} AS INTEGER) as TableNumber, "
+ if tablenumber is not None
+ # if we don't have a tablenumber value, don't introduce the column
+ else ""
+ )
+
  # create cases for mixed-type handling in each column discovered above
- query_parts = [
- f"""
+ query_parts = tablenumber_sql + ", ".join(
+ [
+ f"""
  CASE
  /* when the storage class type doesn't match the column, return nulltype */
  WHEN typeof({col['column_name']}) !=
@@ -265,45 +281,18 @@ def _sqlite_mixed_type_query_to_parquet(
  ELSE {col['column_name']}
  END AS {col['column_name']}
  """
- for col in column_info
- ]
-
- if add_cytotable_meta:
- query_parts += [
- (
- f"CAST( '{f'{source_path}_table_{table_name}'}' "
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
- "AS cytotable_meta_source_path"
- ),
- (
- f"CAST( {offset} "
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
- "AS cytotable_meta_offset"
- ),
- (
- f"CAST( (ROW_NUMBER() OVER ()) AS "
- f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
- "AS cytotable_meta_rownum"
- ),
+ for col in column_info
  ]
+ )

  # perform the select using the cases built above and using chunksize + offset
- sql_stmt = (
- f"""
- SELECT
- {', '.join(query_parts)}
- FROM {table_name}
- ORDER BY {', '.join([col['column_name'] for col in column_info])}
- LIMIT {chunk_size} OFFSET {offset};
- """
- if sort_output
- else f"""
+ sql_stmt = f"""
  SELECT
- {', '.join(query_parts)}
+ {query_parts}
  FROM {table_name}
- LIMIT {chunk_size} OFFSET {offset};
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+ {"ORDER BY " + page_key if sort_output else ""};
  """
- )

  # execute the sql stmt
  cursor.execute(sql_stmt)
@@ -508,6 +497,47 @@ def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
  )


+ def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int:
+ """
+ Build and return a checksum for use as a unique identifier across datasets
+ referenced from cytominer-database:
+ https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129
+
+ Args:
+ pathname: str:
+ A path to a file with which to generate the checksum on.
+ buffer_size: int:
+ Buffer size to use for reading data.
+
+ Returns:
+ int
+ an integer representing the checksum of the pathname file.
+ """
+
+ import os
+ import zlib
+
+ # check whether the buffer size is larger than the file_size
+ file_size = os.path.getsize(pathname)
+ if file_size < buffer_size:
+ buffer_size = file_size
+
+ # open file
+ with open(str(pathname), "rb") as stream:
+ # begin result formation
+ result = zlib.crc32(bytes(0))
+ while True:
+ # read data from stream using buffer size
+ buffer = stream.read(buffer_size)
+ if not buffer:
+ # if we have no more data to use, break while loop
+ break
+ # use buffer read data to form checksum
+ result = zlib.crc32(buffer, result)
+
+ return result & 0xFFFFFFFF
+
+
  def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any:
  """
  Helper function to unwrap futures from values or return values
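A small usage sketch for the checksum function added above; the file path is hypothetical, and the chunked CRC32 it computes matches a single-pass zlib.crc32 over the same bytes:

# Usage sketch for _gather_tablenumber_checksum with a hypothetical file.
import zlib

from cytotable.utils import _gather_tablenumber_checksum

tablenumber = _gather_tablenumber_checksum("plate_a/Image.csv")

# for a file read in one pass, the same value comes from zlib directly
with open("plate_a/Image.csv", "rb") as stream:
    assert tablenumber == (zlib.crc32(stream.read()) & 0xFFFFFFFF)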
@@ -563,14 +593,16 @@ def _unwrap_source(
  return _unwrap_value(source)


- def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
+ def evaluate_futures(
+ sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
+ ) -> Any:
  """
  Evaluates any Parsl futures for use within other tasks.
  This enables a pattern of Parsl app usage as "tasks" and delayed
  future result evaluation for concurrency.

  Args:
- sources: Union[Dict[str, List[Dict[str, Any]]], str]
+ sources: Union[Dict[str, List[Dict[str, Any]]], List[Any], str]
  Sources are an internal data structure used by CytoTable for
  processing and organizing data results. They may include futures
  which require asynchronous processing through Parsl, so we
@@ -600,3 +632,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> An
  if isinstance(sources, dict)
  else _unwrap_value(sources)
  )
+
+
+ def _generate_pagesets(
+ keys: List[Union[int, float]], chunk_size: int
+ ) -> List[Tuple[Union[int, float], Union[int, float]]]:
+ """
+ Generate a pageset (keyset pagination) from a list of keys.
+
+ Parameters:
+ keys List[Union[int, float]]:
+ List of keys to paginate.
+ chunk_size int:
+ Size of each chunk/page.
+
+ Returns:
+ List[Tuple[Union[int, float], Union[int, float]]]:
+ List of (start_key, end_key) tuples representing each page.
+ """
+
+ # Initialize an empty list to store the chunks/pages
+ chunks = []
+
+ # Start index for iteration through the keys
+ i = 0
+
+ while i < len(keys):
+ # Get the start key for the current chunk
+ start_key = keys[i]
+
+ # Calculate the end index for the current chunk
+ end_index = min(i + chunk_size, len(keys)) - 1
+
+ # Get the end key for the current chunk
+ end_key = keys[end_index]
+
+ # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+ while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+ end_index += 1
+
+ # Append the current chunk (start_key, end_key) to the list of chunks
+ chunks.append((start_key, end_key))
+
+ # Update the index to start from the next chunk
+ i = end_index + 1
+
+ # Return the list of chunks/pages
+ return chunks
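For example, a quick check of the behavior described above, including how duplicate keys at a chunk boundary extend the page rather than splitting it (input values are illustrative):

# Example of _generate_pagesets output for a small key list.
from cytotable.utils import _generate_pagesets

keys = [1, 1, 2, 2, 3, 4, 5, 5, 6]
print(_generate_pagesets(keys, chunk_size=3))
# [(1, 2), (3, 5), (6, 6)]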
+
+
+ def _natural_sort(list_to_sort):
+ """
+ Sorts the given iterable using natural sort adapted from approach
+ provided by the following link:
+ https://stackoverflow.com/a/4836734
+
+ Args:
+ list_to_sort: List:
+ The list to sort.
+
+ Returns:
+ List: The sorted list.
+ """
+ import re
+
+ return sorted(
+ list_to_sort,
+ # use a custom key to sort the list
+ key=lambda key: [
+ # use integer of c if it's a digit, otherwise str
+ int(c) if c.isdigit() else c
+ # Split the key into parts, separating numbers from alphabetic characters
+ for c in re.split("([0-9]+)", str(key))
+ ],
+ )
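Since pageset chunk files are concatenated in naturally sorted order, a quick check with hypothetical filenames shows why this matters; a plain lexicographic sort would place "100" before "50":

# Natural sort keeps numbered pageset files in numeric order.
from cytotable.utils import _natural_sort

files = ["cells-100-149.parquet", "cells-1-49.parquet", "cells-50-99.parquet"]
print(_natural_sort(files))
# ['cells-1-49.parquet', 'cells-50-99.parquet', 'cells-100-149.parquet']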
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.9
+ Version: 0.0.11
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
@@ -0,0 +1,11 @@
+ cytotable/__init__.py,sha256=KSVr7xOOrpmQ_ybzcsZkblTAzPIYEq7_bm-Cjc874FM,316
+ cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+ cytotable/convert.py,sha256=5VHnw0eGdfXTbSfeEoPAPVa-dtobM6VHkIJwscLe68M,60651
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+ cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+ cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+ cytotable/utils.py,sha256=tywZg1Gr78ebLlOp8R7trkiV7jsQ4iiZt4B6qG6SrxY,22578
+ cytotable-0.0.11.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+ cytotable-0.0.11.dist-info/METADATA,sha256=sOvdWxld2Ryyjd5bluZt8Z78uElg1CyWG0UIRJn0F8E,3424
+ cytotable-0.0.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ cytotable-0.0.11.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: poetry-core 1.9.1
  Root-Is-Purelib: true
  Tag: py3-none-any
@@ -1,11 +0,0 @@
- cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
- cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
- cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
- cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
- cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
- cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
- cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
- cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
- cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- cytotable-0.0.9.dist-info/RECORD,,