CytoTable 0.0.8__tar.gz → 0.0.10__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: CytoTable
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
5
5
  Home-page: https://github.com/cytomining/CytoTable
6
6
  License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
- Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
17
+ Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
18
18
  Requires-Dist: duckdb (>=0.10.1)
19
19
  Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
20
20
  Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -3,7 +3,7 @@ __init__.py for cytotable
3
3
  """
4
4
 
5
5
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
6
- __version__ = "0.0.8"
6
+ __version__ = "0.0.10"
7
7
 
8
8
  from .convert import convert
9
9
  from .exceptions import (
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
68
68
  ],
69
69
  }
70
70
 
71
- # metadata column names and types for internal use within CytoTable
72
- CYOTABLE_META_COLUMN_TYPES = {
73
- "cytotable_meta_source_path": "VARCHAR",
74
- "cytotable_meta_offset": "BIGINT",
75
- "cytotable_meta_rownum": "BIGINT",
76
- }
77
-
78
71
  CYTOTABLE_DEFAULT_PARQUET_METADATA = {
79
72
  "data-producer": "https://github.com/cytomining/CytoTable",
80
73
  "data-producer-version": str(_get_cytotable_version()),
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.
4
4
 
5
5
  import itertools
6
6
  import logging
7
- import uuid
8
7
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
9
8
 
10
9
  import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
33
32
 
34
33
  Args:
35
34
  source: Dict[str, Any]
36
- Contains the source data to be chunked. Represents a single
35
+ Contains source data details. Represents a single
37
36
  file or table of some kind.
38
37
  sort_output:
39
38
  Specifies whether to sort cytotable output or not.
@@ -43,14 +42,12 @@ def _get_table_columns_and_types(
43
42
  list of dictionaries which each include column level information
44
43
  """
45
44
 
46
- import pathlib
47
-
48
45
  import duckdb
49
46
 
50
47
  from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
51
48
 
52
49
  source_path = source["source_path"]
53
- source_type = str(pathlib.Path(source_path).suffix).lower()
50
+ source_type = str(source_path.suffix).lower()
54
51
 
55
52
  # prepare the data source in the form of a duckdb query
56
53
  select_source = (
@@ -88,7 +85,7 @@ def _get_table_columns_and_types(
88
85
  # with exception handling to read mixed-type data
89
86
  # using sqlite3 and special utility function
90
87
  try:
91
- # isolate using new connection to read data with chunk size + offset
88
+ # isolate using new connection to read data based on pageset
92
89
  # and export directly to parquet via duckdb (avoiding need to return data to python)
93
90
  # perform the query and create a list of dictionaries with the column data for table
94
91
  with _duckdb_reader() as ddb_reader:
@@ -108,13 +105,8 @@ def _get_table_columns_and_types(
108
105
  arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
109
106
  source_path=str(source["source_path"]),
110
107
  table_name=str(source["table_name"]),
111
- # chunk size is set to 5 as a limit similar
112
- # to above SQL within select_query variable
113
- chunk_size=5,
114
- # offset is set to 0 start at first row
115
- # result from table
116
- offset=0,
117
- add_cytotable_meta=False,
108
+ page_key=source["page_key"],
109
+ pageset=source["pagesets"][0],
118
110
  sort_output=sort_output,
119
111
  )
120
112
  with _duckdb_reader() as ddb_reader:
@@ -182,13 +174,14 @@ def _prep_cast_column_data_types(
182
174
 
183
175
 
184
176
  @python_app
185
- def _get_table_chunk_offsets(
177
+ def _get_table_keyset_pagination_sets(
186
178
  chunk_size: int,
179
+ page_key: str,
187
180
  source: Optional[Dict[str, Any]] = None,
188
181
  sql_stmt: Optional[str] = None,
189
- ) -> Union[List[int], None]:
182
+ ) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
190
183
  """
191
- Get table data chunk offsets for later use in capturing segments
184
+ Get table data chunk keys for later use in capturing segments
192
185
  of values. This work also provides a chance to catch problematic
193
186
  input data which will be ignored with warnings.
194
187
 
@@ -198,51 +191,59 @@ def _get_table_chunk_offsets(
198
191
  file or table of some kind.
199
192
  chunk_size: int
200
193
  The size in rowcount of the chunks to create.
194
+ page_key: str
195
+ The column name to be used to identify pagination chunks.
196
+ Expected to be of numeric type (int, float) for ordering.
197
+ sql_stmt:
198
+ Optional sql statement to form the pagination set from.
199
+ Default behavior extracts pagination sets from the full
200
+ data source.
201
201
 
202
202
  Returns:
203
- List[int]
204
- List of integers which represent offsets to use for reading
205
- the data later on.
203
+ List[Any]
204
+ List of keys to use for reading the data later on.
206
205
  """
207
206
 
208
207
  import logging
209
- import pathlib
208
+ import sqlite3
209
+ from contextlib import closing
210
210
 
211
211
  import duckdb
212
- from cloudpathlib import AnyPath
213
212
 
214
213
  from cytotable.exceptions import NoInputDataException
215
- from cytotable.utils import _duckdb_reader
214
+ from cytotable.utils import _duckdb_reader, _generate_pagesets
216
215
 
217
216
  logger = logging.getLogger(__name__)
218
217
 
219
218
  if source is not None:
220
219
  table_name = source["table_name"] if "table_name" in source.keys() else None
221
220
  source_path = source["source_path"]
222
- source_type = str(pathlib.Path(source_path).suffix).lower()
221
+ source_type = str(source_path.suffix).lower()
223
222
 
224
223
  try:
225
- # for csv's, check that we have more than one row (a header and data values)
226
- if (
227
- source_type == ".csv"
228
- and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
229
- ):
230
- raise NoInputDataException(
231
- f"Data file has 0 rows of values. Error in file: {source_path}"
232
- )
233
-
234
- # gather the total rowcount from csv or sqlite data input sources
235
224
  with _duckdb_reader() as ddb_reader:
236
- rowcount = int(
237
- ddb_reader.execute(
238
- # nosec
239
- f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
240
- if source_type == ".csv"
241
- else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
242
- ).fetchone()[0]
243
- )
225
+ if source_type == ".csv":
226
+ sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
227
+ else:
228
+ sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
229
+
230
+ page_keys = [
231
+ results[0] for results in ddb_reader.execute(sql_query).fetchall()
232
+ ]
233
+
234
+ # exception case for when we have mixed types
235
+ # (i.e. integer col with string and ints) in a sqlite column
236
+ except duckdb.TypeMismatchException:
237
+ with closing(sqlite3.connect(source_path)) as cx:
238
+ with cx:
239
+ page_keys = [
240
+ key[0]
241
+ for key in cx.execute(
242
+ f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
243
+ ).fetchall()
244
+ if isinstance(key[0], (int, float))
245
+ ]
244
246
 
245
- # catch input errors which will result in skipped files
246
247
  except (
247
248
  duckdb.InvalidInputException,
248
249
  NoInputDataException,
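For orientation, a minimal editorial sketch of the page-key extraction pattern used above for a CSV source; the file path and column name are hypothetical placeholders, not values from this diff.

```python
# Hypothetical page-key extraction for a CSV source, mirroring the
# read_csv_auto query above (file path and column name are assumptions).
import duckdb

source_path = "per_cells.csv"   # assumed input file
page_key = "ObjectNumber"       # assumed numeric pagination column

ddb = duckdb.connect()
sql_query = (
    f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') "
    f"ORDER BY {page_key}"
)
# ordered key values later grouped into pagesets of chunk_size
page_keys = [row[0] for row in ddb.execute(sql_query).fetchall()]
ddb.close()
```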
@@ -253,34 +254,20 @@ def _get_table_chunk_offsets(
253
254
 
254
255
  return None
255
256
 
256
- # find chunk offsets from sql statement
257
257
  elif sql_stmt is not None:
258
- # gather the total rowcount from csv or sqlite data input sources
259
258
  with _duckdb_reader() as ddb_reader:
260
- rowcount = int(
261
- ddb_reader.execute(
262
- # nosec
263
- f"SELECT COUNT(*) FROM ({sql_stmt})"
264
- ).fetchone()[0]
265
- )
259
+ sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
260
+ page_keys = ddb_reader.execute(sql_query).fetchall()
261
+ page_keys = [key[0] for key in page_keys]
266
262
 
267
- return list(
268
- range(
269
- 0,
270
- # gather rowcount from table and use as maximum for range
271
- rowcount,
272
- # step through using chunk size
273
- chunk_size,
274
- )
275
- )
263
+ return _generate_pagesets(page_keys, chunk_size)
276
264
 
277
265
 
278
266
  @python_app
279
- def _source_chunk_to_parquet(
267
+ def _source_pageset_to_parquet(
280
268
  source_group_name: str,
281
269
  source: Dict[str, Any],
282
- chunk_size: int,
283
- offset: int,
270
+ pageset: Tuple[Union[int, float], Union[int, float]],
284
271
  dest_path: str,
285
272
  sort_output: bool,
286
273
  ) -> str:
@@ -293,10 +280,8 @@ def _source_chunk_to_parquet(
293
280
  source: Dict[str, Any]
294
281
  Contains the source data to be chunked. Represents a single
295
282
  file or table of some kind along with collected information about table.
296
- chunk_size: int
297
- Row count to use for chunked output.
298
- offset: int
299
- The offset for chunking the data from source.
283
+ pageset: Tuple[int, int]
284
+ The pageset for chunking the data from source.
300
285
  dest_path: str
301
286
  Path to store the output data.
302
287
  sort_output: bool
@@ -311,9 +296,7 @@ def _source_chunk_to_parquet(
311
296
 
312
297
  import duckdb
313
298
  from cloudpathlib import AnyPath
314
- from pyarrow import parquet
315
299
 
316
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
317
300
  from cytotable.utils import (
318
301
  _duckdb_reader,
319
302
  _sqlite_mixed_type_query_to_parquet,
@@ -322,31 +305,11 @@ def _source_chunk_to_parquet(
322
305
 
323
306
  # attempt to build dest_path
324
307
  source_dest_path = (
325
- f"{dest_path}/{str(pathlib.Path(source_group_name).stem).lower()}/"
326
- f"{str(pathlib.Path(source['source_path']).parent.name).lower()}"
308
+ f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
309
+ f"{str(source['source_path'].parent.name).lower()}"
327
310
  )
328
311
  pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
329
312
 
330
- source_path_str = (
331
- source["source_path"]
332
- if "table_name" not in source.keys()
333
- else f"{source['source_path']}_table_{source['table_name']}"
334
- )
335
- # build the column selection block of query
336
-
337
- # add cytotable metadata columns
338
- cytotable_metadata_cols = [
339
- (
340
- f"CAST( '{source_path_str}' "
341
- f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
342
- ' AS "cytotable_meta_source_path"'
343
- ),
344
- f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
345
- (
346
- f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
347
- ' AS "cytotable_meta_rownum"'
348
- ),
349
- ]
350
313
  # add source table columns
351
314
  casted_source_cols = [
352
315
  # here we cast the column to the specified type ensure the colname remains the same
@@ -357,22 +320,23 @@ def _source_chunk_to_parquet(
357
320
  # create selection statement from lists above
358
321
  select_columns = ",".join(
359
322
  # if we should sort the output, add the metadata_cols
360
- cytotable_metadata_cols + casted_source_cols
323
+ casted_source_cols
361
324
  if sort_output
362
325
  else casted_source_cols
363
326
  )
364
327
 
365
328
  # build output query and filepath base
366
329
  # (chunked output will append offset to keep output paths unique)
367
- if str(AnyPath(source["source_path"]).suffix).lower() == ".csv":
330
+ if str(source["source_path"].suffix).lower() == ".csv":
368
331
  base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
369
332
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
370
333
 
371
- elif str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite":
334
+ elif str(source["source_path"].suffix).lower() == ".sqlite":
372
335
  base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
373
336
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
374
337
 
375
- result_filepath = f"{result_filepath_base}-{offset}.parquet"
338
+ # form a filepath which indicates the pageset
339
+ result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"
376
340
 
377
341
  # Attempt to read the data to parquet file
378
342
  # using duckdb for extraction and pyarrow for
@@ -385,14 +349,9 @@ def _source_chunk_to_parquet(
385
349
  table=ddb_reader.execute(
386
350
  f"""
387
351
  {base_query}
388
- /* order by all columns for deterministic output */
389
- ORDER BY ALL
390
- LIMIT {chunk_size} OFFSET {offset}
391
- """
392
- if sort_output
393
- else f"""
394
- {base_query}
395
- LIMIT {chunk_size} OFFSET {offset}
352
+ WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
353
+ /* optional ordering per pageset */
354
+ {"ORDER BY " + source['page_key'] if sort_output else ""};
396
355
  """
397
356
  ).arrow(),
398
357
  where=result_filepath,
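As a side note, a small sketch of how the pageset filter above renders into a concrete query for a single page; the table name, key column, and range below are hypothetical placeholders.

```python
# Hypothetical rendering of the pageset-filtered query built above.
base_query = "SELECT * FROM sqlite_scan('example.sqlite', 'Per_Cells')"  # assumed source
page_key = "ObjectNumber"   # assumed pagination column
pageset = (1, 1000)         # assumed (start_key, end_key) range
sort_output = True

print(
    f"""
    {base_query}
    WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
    /* optional ordering per pageset */
    {"ORDER BY " + page_key if sort_output else ""};
    """
)
```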
@@ -405,7 +364,7 @@ def _source_chunk_to_parquet(
405
364
  # to handle the mixed types
406
365
  if (
407
366
  "Mismatch Type Error" in str(e)
408
- and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
367
+ and str(source["source_path"].suffix).lower() == ".sqlite"
409
368
  ):
410
369
  _write_parquet_table_with_metadata(
411
370
  # here we use sqlite instead of duckdb to extract
@@ -414,9 +373,8 @@ def _source_chunk_to_parquet(
414
373
  table=_sqlite_mixed_type_query_to_parquet(
415
374
  source_path=str(source["source_path"]),
416
375
  table_name=str(source["table_name"]),
417
- chunk_size=chunk_size,
418
- offset=offset,
419
- add_cytotable_meta=True if sort_output else False,
376
+ page_key=source["page_key"],
377
+ pageset=pageset,
420
378
  sort_output=sort_output,
421
379
  ),
422
380
  where=result_filepath,
@@ -466,10 +424,7 @@ def _prepend_column_name(
466
424
 
467
425
  import pyarrow.parquet as parquet
468
426
 
469
- from cytotable.constants import (
470
- CYOTABLE_META_COLUMN_TYPES,
471
- CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
472
- )
427
+ from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
473
428
  from cytotable.utils import _write_parquet_table_with_metadata
474
429
 
475
430
  logger = logging.getLogger(__name__)
@@ -480,7 +435,7 @@ def _prepend_column_name(
480
435
  if len(targets) == 0:
481
436
  logger.warning(
482
437
  msg=(
483
- "Skipping column name prepend operations"
438
+ "Skipping column name prepend operations "
484
439
  "because no compartments or metadata were provided."
485
440
  )
486
441
  )
@@ -517,10 +472,8 @@ def _prepend_column_name(
517
472
  # source_group_name_stem: 'Cells'
518
473
  # column_name: 'AreaShape_Area'
519
474
  # updated_column_name: 'Cells_AreaShape_Area'
520
- if (
521
- column_name not in identifying_columns
522
- and not column_name.startswith(source_group_name_stem.capitalize())
523
- and column_name not in CYOTABLE_META_COLUMN_TYPES
475
+ if column_name not in identifying_columns and not column_name.startswith(
476
+ source_group_name_stem.capitalize()
524
477
  ):
525
478
  updated_column_names.append(f"{source_group_name_stem}_{column_name}")
526
479
  # if-condition for prepending 'Metadata_' to column name
@@ -582,6 +535,7 @@ def _concat_source_group(
582
535
  source_group: List[Dict[str, Any]],
583
536
  dest_path: str,
584
537
  common_schema: Optional[List[Tuple[str, str]]] = None,
538
+ sort_output: bool = True,
585
539
  ) -> List[Dict[str, Any]]:
586
540
  """
587
541
  Concatenate group of source data together as single file.
@@ -628,6 +582,8 @@ def _concat_source_group(
628
582
  common_schema: List[Tuple[str, str]] (Default value = None)
629
583
  Common schema to use for concatenation amongst arrow tables
630
584
  which may have slightly different but compatible schema.
585
+ sort_output: bool
586
+ Specifies whether to sort cytotable output or not.
631
587
 
632
588
  Returns:
633
589
  List[Dict[str, Any]]
@@ -645,7 +601,7 @@ def _concat_source_group(
645
601
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
646
602
  )
647
603
  from cytotable.exceptions import SchemaException
648
- from cytotable.utils import _write_parquet_table_with_metadata
604
+ from cytotable.utils import _natural_sort
649
605
 
650
606
  # build a result placeholder
651
607
  concatted: List[Dict[str, Any]] = [
@@ -684,7 +640,10 @@ def _concat_source_group(
684
640
  # (all must be the same schema)
685
641
  with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
686
642
  for source in source_group:
687
- for table in [table for table in source["table"]]:
643
+ tables = [table for table in source["table"]]
644
+ if sort_output:
645
+ tables = _natural_sort(tables)
646
+ for table in tables:
688
647
  # if we haven't inferred the common schema
689
648
  # check that our file matches the expected schema, otherwise raise an error
690
649
  if common_schema is None and not writer_schema.equals(
@@ -728,7 +687,6 @@ def _concat_source_group(
728
687
  def _prepare_join_sql(
729
688
  sources: Dict[str, List[Dict[str, Any]]],
730
689
  joins: str,
731
- sort_output: bool,
732
690
  ) -> str:
733
691
  """
734
692
  Prepare join SQL statement with actual locations of data based on the sources.
@@ -749,8 +707,6 @@ def _prepare_join_sql(
749
707
  """
750
708
  import pathlib
751
709
 
752
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
753
-
754
710
  # replace with real location of sources for join sql
755
711
  order_by_tables = []
756
712
  for key, val in sources.items():
@@ -762,25 +718,17 @@ def _prepare_join_sql(
762
718
  )
763
719
  order_by_tables.append(table_name)
764
720
 
765
- # create order by statement with from all tables using cytotable metadata
766
- order_by_sql = "ORDER BY " + ", ".join(
767
- [
768
- f"{table}.{meta_column}"
769
- for table in order_by_tables
770
- for meta_column in CYOTABLE_META_COLUMN_TYPES
771
- ]
772
- )
773
-
774
721
  # add the order by statements to the join
775
- return joins + order_by_sql if sort_output else joins
722
+ return joins
776
723
 
777
724
 
778
725
  @python_app
779
- def _join_source_chunk(
726
+ def _join_source_pageset(
780
727
  dest_path: str,
781
728
  joins: str,
782
- chunk_size: int,
783
- offset: int,
729
+ page_key: str,
730
+ pageset: Tuple[int, int],
731
+ sort_output: bool,
784
732
  drop_null: bool,
785
733
  ) -> str:
786
734
  """
@@ -806,30 +754,20 @@ def _join_source_chunk(
806
754
 
807
755
  import pathlib
808
756
 
809
- from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
810
757
  from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
811
758
 
812
- # Attempt to read the data to parquet file
813
- # using duckdb for extraction and pyarrow for
814
- # writing data to a parquet file.
815
- # read data with chunk size + offset
816
- # and export to parquet
817
- exclude_meta_cols = [
818
- f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
819
- ]
820
759
  with _duckdb_reader() as ddb_reader:
821
760
  result = ddb_reader.execute(
822
761
  f"""
823
- WITH joined AS (
762
+ WITH joined AS (
824
763
  {joins}
825
- LIMIT {chunk_size} OFFSET {offset}
826
- )
827
- SELECT
828
- /* exclude metadata columns from the results
829
- by using a lambda on column names based on exclude_meta_cols. */
830
- COLUMNS (c -> ({" AND ".join(exclude_meta_cols)}))
831
- FROM joined;
832
- """
764
+ )
765
+ SELECT *
766
+ FROM joined
767
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
768
+ /* optional sorting per pageset */
769
+ {"ORDER BY " + page_key if sort_output else ""};
770
+ """
833
771
  ).arrow()
834
772
 
835
773
  # drop nulls if specified
@@ -854,10 +792,8 @@ def _join_source_chunk(
854
792
  f"{str(pathlib.Path(dest_path).parent)}/"
855
793
  # use the dest_path stem in the name
856
794
  f"{str(pathlib.Path(dest_path).stem)}-"
857
- # give the join chunk result a unique to arbitrarily
858
- # differentiate from other chunk groups which are mapped
859
- # and before they are brought together as one dataset
860
- f"{str(uuid.uuid4().hex)}.parquet"
795
+ # add the pageset indication to the filename
796
+ f"{pageset[0]}-{pageset[1]}.parquet"
861
797
  )
862
798
 
863
799
  # write the result
@@ -874,6 +810,7 @@ def _concat_join_sources(
874
810
  sources: Dict[str, List[Dict[str, Any]]],
875
811
  dest_path: str,
876
812
  join_sources: List[str],
813
+ sort_output: bool = True,
877
814
  ) -> str:
878
815
  """
879
816
  Concatenate join sources from parquet-based chunks.
@@ -890,6 +827,8 @@ def _concat_join_sources(
890
827
  join_sources: List[str]:
891
828
  List of local filepath destination for join source chunks
892
829
  which will be concatenated.
830
+ sort_output: bool
831
+ Specifies whether to sort cytotable output or not.
893
832
 
894
833
  Returns:
895
834
  str
@@ -905,7 +844,7 @@ def _concat_join_sources(
905
844
  CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
906
845
  CYTOTABLE_DEFAULT_PARQUET_METADATA,
907
846
  )
908
- from cytotable.utils import _write_parquet_table_with_metadata
847
+ from cytotable.utils import _natural_sort
909
848
 
910
849
  # remove the unjoined concatted compartments to prepare final dest_path usage
911
850
  # (we now have joined results)
@@ -925,7 +864,11 @@ def _concat_join_sources(
925
864
  CYTOTABLE_DEFAULT_PARQUET_METADATA
926
865
  )
927
866
  with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
928
- for table_path in join_sources:
867
+ for table_path in (
868
+ join_sources
869
+ if not sort_output
870
+ else _natural_sort(list_to_sort=join_sources)
871
+ ):
929
872
  writer.write_table(
930
873
  parquet.read_table(
931
874
  table_path,
@@ -1049,6 +992,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1049
992
  infer_common_schema: bool,
1050
993
  drop_null: bool,
1051
994
  sort_output: bool,
995
+ page_keys: Dict[str, str],
1052
996
  data_type_cast_map: Optional[Dict[str, str]] = None,
1053
997
  **kwargs,
1054
998
  ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1089,6 +1033,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1089
1033
  Whether to drop null results.
1090
1034
  sort_output: bool
1091
1035
  Specifies whether to sort cytotable output or not.
1036
+ page_keys: Dict[str, str]
1037
+ A dictionary which defines which column names are used for keyset pagination
1038
+ in order to perform data extraction.
1092
1039
  data_type_cast_map: Dict[str, str]
1093
1040
  A dictionary mapping data type groups to specific types.
1094
1041
  Roughly includes Arrow data types language from:
@@ -1114,21 +1061,40 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1114
1061
  else []
1115
1062
  ),
1116
1063
  **kwargs,
1117
- ).result()
1064
+ )
1118
1065
 
1119
1066
  # expand the destination path
1120
1067
  expanded_dest_path = _expand_path(path=dest_path)
1121
1068
 
1122
- # prepare offsets for chunked data export from source tables
1123
- offsets_prepared = {
1069
+ # check that each source group name has a pagination key
1070
+ for source_group_name in sources.keys():
1071
+ matching_keys = [
1072
+ key for key in page_keys.keys() if key.lower() in source_group_name.lower()
1073
+ ]
1074
+ if not matching_keys:
1075
+ raise CytoTableException(
1076
+ f"No matching key found in page_keys for source_group_name: {source_group_name}."
1077
+ "Please include a pagination key based on a column name from the table."
1078
+ )
1079
+
1080
+ # prepare pagesets for chunked data export from source tables
1081
+ pagesets_prepared = {
1124
1082
  source_group_name: [
1125
1083
  dict(
1126
1084
  source,
1127
1085
  **{
1128
- "offsets": _get_table_chunk_offsets(
1086
+ "page_key": (
1087
+ page_key := [
1088
+ value
1089
+ for key, value in page_keys.items()
1090
+ if key.lower() in source_group_name.lower()
1091
+ ][0]
1092
+ ),
1093
+ "pagesets": _get_table_keyset_pagination_sets(
1129
1094
  source=source,
1130
1095
  chunk_size=chunk_size,
1131
- )
1096
+ page_key=page_key,
1097
+ ),
1132
1098
  },
1133
1099
  )
1134
1100
  for source in source_group_vals
@@ -1136,17 +1102,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1136
1102
  for source_group_name, source_group_vals in sources.items()
1137
1103
  }
1138
1104
 
1139
- # if offsets is none and we haven't halted, remove the file as there
1105
+ # if pagesets is none and we haven't halted, remove the file as there
1140
1106
  # were input formatting errors which will create challenges downstream
1141
1107
  invalid_files_dropped = {
1142
1108
  source_group_name: [
1143
- # ensure we have offsets
1109
+ # ensure we have pagesets
1144
1110
  source
1145
1111
  for source in source_group_vals
1146
- if source["offsets"] is not None
1112
+ if source["pagesets"] is not None
1147
1113
  ]
1148
1114
  for source_group_name, source_group_vals in evaluate_futures(
1149
- offsets_prepared
1115
+ pagesets_prepared
1150
1116
  ).items()
1151
1117
  # ensure we have source_groups with at least one source table
1152
1118
  if len(source_group_vals) > 0
@@ -1179,12 +1145,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1179
1145
  "table": [
1180
1146
  # perform column renaming and create potential return result
1181
1147
  _prepend_column_name(
1182
- # perform chunked data export to parquet using offsets
1183
- table_path=_source_chunk_to_parquet(
1148
+ # perform chunked data export to parquet using pagesets
1149
+ table_path=_source_pageset_to_parquet(
1184
1150
  source_group_name=source_group_name,
1185
1151
  source=source,
1186
- chunk_size=chunk_size,
1187
- offset=offset,
1152
+ pageset=pageset,
1188
1153
  dest_path=expanded_dest_path,
1189
1154
  sort_output=sort_output,
1190
1155
  ),
@@ -1193,7 +1158,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1193
1158
  metadata=metadata,
1194
1159
  compartments=compartments,
1195
1160
  )
1196
- for offset in source["offsets"]
1161
+ for pageset in source["pagesets"]
1197
1162
  ]
1198
1163
  },
1199
1164
  )
@@ -1234,6 +1199,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1234
1199
  source_group=source_group_vals[0]["sources"],
1235
1200
  dest_path=expanded_dest_path,
1236
1201
  common_schema=source_group_vals[0]["common_schema"],
1202
+ sort_output=sort_output,
1237
1203
  )
1238
1204
  for source_group_name, source_group_vals in evaluate_futures(
1239
1205
  common_schema_determined
@@ -1247,28 +1213,34 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1247
1213
  evaluated_results = evaluate_futures(results)
1248
1214
 
1249
1215
  prepared_joins_sql = _prepare_join_sql(
1250
- sources=evaluated_results, joins=joins, sort_output=sort_output
1216
+ sources=evaluated_results, joins=joins
1251
1217
  ).result()
1252
1218
 
1219
+ page_key_join = [
1220
+ value for key, value in page_keys.items() if key.lower() == "join"
1221
+ ][0]
1222
+
1253
1223
  # map joined results based on the join groups gathered above
1254
1224
  # note: after mapping we end up with a list of strings (task returns str)
1255
1225
  join_sources_result = [
1256
- _join_source_chunk(
1226
+ _join_source_pageset(
1257
1227
  # gather the result of concatted sources prior to
1258
1228
  # join group merging as each mapped task run will need
1259
1229
  # full concat results
1260
1230
  dest_path=expanded_dest_path,
1261
1231
  joins=prepared_joins_sql,
1262
- chunk_size=chunk_size,
1263
- offset=offset,
1232
+ page_key=page_key_join,
1233
+ pageset=pageset,
1234
+ sort_output=sort_output,
1264
1235
  drop_null=drop_null,
1265
1236
  )
1266
1237
  # create join group for querying the concatenated
1267
1238
  # data in order to perform memory-safe joining
1268
1239
  # per user chunk size specification.
1269
- for offset in _get_table_chunk_offsets(
1240
+ for pageset in _get_table_keyset_pagination_sets(
1270
1241
  sql_stmt=prepared_joins_sql,
1271
1242
  chunk_size=chunk_size,
1243
+ page_key=page_key_join,
1272
1244
  ).result()
1273
1245
  ]
1274
1246
 
@@ -1279,6 +1251,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1279
1251
  dest_path=expanded_dest_path,
1280
1252
  join_sources=[join.result() for join in join_sources_result],
1281
1253
  sources=evaluated_results,
1254
+ sort_output=sort_output,
1282
1255
  )
1283
1256
 
1284
1257
  # wrap the final result as a future and return
@@ -1300,6 +1273,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1300
1273
  infer_common_schema: bool = True,
1301
1274
  drop_null: bool = False,
1302
1275
  data_type_cast_map: Optional[Dict[str, str]] = None,
1276
+ page_keys: Optional[Dict[str, str]] = None,
1303
1277
  sort_output: bool = True,
1304
1278
  preset: Optional[str] = "cellprofiler_csv",
1305
1279
  parsl_config: Optional[parsl.Config] = None,
@@ -1348,6 +1322,12 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1348
1322
  A dictionary mapping data type groups to specific types.
1349
1323
  Roughly includes Arrow data types language from:
1350
1324
  https://arrow.apache.org/docs/python/api/datatypes.html
1325
+ page_keys: Optional[Dict[str, str]] (Default value = None)
1326
+ The table and column names to be used for key pagination.
1327
+ Uses the form: {"table_name":"column_name"}.
1328
+ Expects columns to include numeric data (ints or floats).
1329
+ Interacts with the `chunk_size` parameter to form
1330
+ pages of `chunk_size`.
1351
1331
  sort_output: bool (Default value = True)
1352
1332
  Specifies whether to sort cytotable output or not.
1353
1333
  drop_null: bool (Default value = False)
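To make the new parameter concrete, a hedged usage sketch follows; the file paths are placeholders, and the page_keys values mirror the cellprofiler_sqlite_pycytominer preset shown later in this diff.

```python
# Minimal sketch of convert() with explicit pagination keys
# (paths are placeholders; key values mirror the preset config in this diff).
import cytotable

cytotable.convert(
    source_path="example.sqlite",
    dest_path="example.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    chunk_size=1000,
    page_keys={
        "image": "ImageNumber",
        "cells": "Cells_Number_Object_Number",
        "nuclei": "Nuclei_Number_Object_Number",
        "cytoplasm": "Cytoplasm_Number_Object_Number",
        "join": "Cytoplasm_Number_Object_Number",
    },
)
```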
@@ -1447,6 +1427,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1447
1427
  if chunk_size is None
1448
1428
  else chunk_size
1449
1429
  )
1430
+ page_keys = (
1431
+ cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
1432
+ if page_keys is None
1433
+ else page_keys
1434
+ )
1435
+
1436
+ # Raise an exception for scenarios where one configures CytoTable to join
1437
+ # but does not provide a pagination key for the joins.
1438
+ if join and (page_keys is None or "join" not in page_keys.keys()):
1439
+ raise CytoTableException(
1440
+ (
1441
+ "When using join=True one must pass a 'join' pagination key "
1442
+ "in the page_keys parameter. The 'join' pagination key is a column "
1443
+ "name found within the joined results based on the SQL provided from "
1444
+ "the joins parameter. This special key is required as not all columns "
1445
+ "from the source tables might not be included."
1446
+ )
1447
+ )
1450
1448
 
1451
1449
  # send sources to be written to parquet if selected
1452
1450
  if dest_datatype == "parquet":
@@ -1465,6 +1463,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
1465
1463
  drop_null=drop_null,
1466
1464
  data_type_cast_map=data_type_cast_map,
1467
1465
  sort_output=sort_output,
1466
+ page_keys=cast(dict, page_keys),
1468
1467
  **kwargs,
1469
1468
  )
1470
1469
 
@@ -22,6 +22,16 @@ config = {
22
22
  "Parent_Cells",
23
23
  "Parent_Nuclei",
24
24
  ),
25
+ # pagination keys for use with this data
26
+ # of the rough format "table" -> "column".
27
+ # note: page keys are expected to be numeric (int, float)
28
+ "CONFIG_PAGE_KEYS": {
29
+ "image": "ImageNumber",
30
+ "cells": "ObjectNumber",
31
+ "nuclei": "ObjectNumber",
32
+ "cytoplasm": "ObjectNumber",
33
+ "join": "Cytoplasm_Number_Object_Number",
34
+ },
25
35
  # chunk size to use for join operations to help with possible performance issues
26
36
  # note: this number is an estimate and is may need changes contingent on data
27
37
  # and system used by this library.
@@ -61,6 +71,16 @@ config = {
61
71
  "Parent_Cells",
62
72
  "Parent_Nuclei",
63
73
  ),
74
+ # pagination keys for use with this data
75
+ # of the rough format "table" -> "column".
76
+ # note: page keys are expected to be numeric (int, float)
77
+ "CONFIG_PAGE_KEYS": {
78
+ "image": "ImageNumber",
79
+ "cells": "Cells_Number_Object_Number",
80
+ "nuclei": "Nuclei_Number_Object_Number",
81
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
82
+ "join": "Cytoplasm_Number_Object_Number",
83
+ },
64
84
  # chunk size to use for join operations to help with possible performance issues
65
85
  # note: this number is an estimate and is may need changes contingent on data
66
86
  # and system used by this library.
@@ -85,6 +105,64 @@ config = {
85
105
  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
86
106
  """,
87
107
  },
108
+ "cellprofiler_sqlite_cpg0016_jump": {
109
+ # version specifications using related references
110
+ "CONFIG_SOURCE_VERSION": {
111
+ "cellprofiler": "v4.0.0",
112
+ },
113
+ # names of source table compartments (for ex. cells.csv, etc.)
114
+ "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
115
+ # names of source table metadata (for ex. image.csv, etc.)
116
+ "CONFIG_NAMES_METADATA": ("image",),
117
+ # column names in any compartment or metadata tables which contain
118
+ # unique names to avoid renaming
119
+ "CONFIG_IDENTIFYING_COLUMNS": (
120
+ "ImageNumber",
121
+ "ObjectNumber",
122
+ "Metadata_Well",
123
+ "Metadata_Plate",
124
+ "Parent_Cells",
125
+ "Parent_Nuclei",
126
+ ),
127
+ # pagination keys for use with this data
128
+ # of the rough format "table" -> "column".
129
+ # note: page keys are expected to be numeric (int, float)
130
+ "CONFIG_PAGE_KEYS": {
131
+ "image": "ImageNumber",
132
+ "cells": "ObjectNumber",
133
+ "nuclei": "ObjectNumber",
134
+ "cytoplasm": "ObjectNumber",
135
+ "join": "Cytoplasm_Number_Object_Number",
136
+ },
137
+ # chunk size to use for join operations to help with possible performance issues
138
+ # note: this number is an estimate and is may need changes contingent on data
139
+ # and system used by this library.
140
+ "CONFIG_CHUNK_SIZE": 1000,
141
+ # compartment and metadata joins performed using DuckDB SQL
142
+ # and modified at runtime as needed
143
+ "CONFIG_JOINS": """
144
+ SELECT
145
+ image.Image_TableNumber,
146
+ image.Metadata_ImageNumber,
147
+ image.Metadata_Plate,
148
+ image.Metadata_Well,
149
+ image.Image_Metadata_Site,
150
+ image.Image_Metadata_Row,
151
+ cytoplasm.* EXCLUDE (Metadata_ImageNumber),
152
+ cells.* EXCLUDE (Metadata_ImageNumber),
153
+ nuclei.* EXCLUDE (Metadata_ImageNumber)
154
+ FROM
155
+ read_parquet('cytoplasm.parquet') AS cytoplasm
156
+ LEFT JOIN read_parquet('cells.parquet') AS cells ON
157
+ cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
158
+ AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
159
+ LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
160
+ nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
161
+ AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
162
+ LEFT JOIN read_parquet('image.parquet') AS image ON
163
+ image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
164
+ """,
165
+ },
88
166
  "cellprofiler_sqlite_pycytominer": {
89
167
  # version specifications using related references
90
168
  "CONFIG_SOURCE_VERSION": {
@@ -107,6 +185,16 @@ config = {
107
185
  "Cells_Number_Object_Number",
108
186
  "Nuclei_Number_Object_Number",
109
187
  ),
188
+ # pagination keys for use with this data
189
+ # of the rough format "table" -> "column".
190
+ # note: page keys are expected to be numeric (int, float)
191
+ "CONFIG_PAGE_KEYS": {
192
+ "image": "ImageNumber",
193
+ "cells": "Cells_Number_Object_Number",
194
+ "nuclei": "Nuclei_Number_Object_Number",
195
+ "cytoplasm": "Cytoplasm_Number_Object_Number",
196
+ "join": "Cytoplasm_Number_Object_Number",
197
+ },
110
198
  # chunk size to use for join operations to help with possible performance issues
111
199
  # note: this number is an estimate and is may need changes contingent on data
112
200
  # and system used by this library.
@@ -155,6 +243,16 @@ config = {
155
243
  "Cells_ObjectNumber",
156
244
  "Nuclei_ObjectNumber",
157
245
  ),
246
+ # pagination keys for use with this data
247
+ # of the rough format "table" -> "column".
248
+ # note: page keys are expected to be numeric (int, float)
249
+ "CONFIG_PAGE_KEYS": {
250
+ "image": "ImageNumber",
251
+ "cells": "ObjectNumber",
252
+ "nuclei": "ObjectNumber",
253
+ "cytoplasm": "ObjectNumber",
254
+ "join": "Cytoplasm_Number_Object_Number",
255
+ },
158
256
  # chunk size to use for join operations to help with possible performance issues
159
257
  # note: this number is an estimate and is may need changes contingent on data
160
258
  # and system used by this library.
@@ -200,6 +298,12 @@ config = {
200
298
  "Z",
201
299
  "T",
202
300
  ),
301
+ # pagination keys for use with this data
302
+ # of the rough format "table" -> "column".
303
+ # note: page keys are expected to be numeric (int, float)
304
+ "CONFIG_PAGE_KEYS": {
305
+ "test": '"OBJECT ID"',
306
+ },
203
307
  # chunk size to use for join operations to help with possible performance issues
204
308
  # note: this number is an estimate and is may need changes contingent on data
205
309
  # and system used by this library.
@@ -7,13 +7,11 @@ import pathlib
7
7
  from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from cloudpathlib import AnyPath
10
- from parsl.app.app import join_app, python_app
11
10
 
11
+ from cytotable.exceptions import NoInputDataException
12
12
 
13
- @python_app
14
- def _build_path(
15
- path: Union[str, pathlib.Path, AnyPath], **kwargs
16
- ) -> Union[pathlib.Path, AnyPath]:
13
+
14
+ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
17
15
  """
18
16
  Build a path client or return local path.
19
17
 
@@ -43,10 +41,9 @@ def _build_path(
43
41
  return processed_path
44
42
 
45
43
 
46
- @python_app
47
44
  def _get_source_filepaths(
48
45
  path: Union[pathlib.Path, AnyPath],
49
- targets: List[str],
46
+ targets: Optional[List[str]] = None,
50
47
  source_datatype: Optional[str] = None,
51
48
  ) -> Dict[str, List[Dict[str, Any]]]:
52
49
  """
@@ -75,7 +72,7 @@ def _get_source_filepaths(
75
72
 
76
73
  if (targets is None or targets == []) and source_datatype is None:
77
74
  raise DatatypeException(
78
- f"A source_datatype must be specified when using undefined compartments and metadata names."
75
+ "A source_datatype must be specified when using undefined compartments and metadata names."
79
76
  )
80
77
 
81
78
  # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@ def _get_source_filepaths(
87
84
  for subpath in (
88
85
  (path,)
89
86
  # used if the source path is a single file
90
- if AnyPath(path).is_file()
87
+ if path.is_file()
91
88
  # iterates through a source directory
92
- else (x for x in AnyPath(path).glob("**/*") if AnyPath(x).is_file())
89
+ else (x for x in path.glob("**/*") if x.is_file())
93
90
  )
94
91
  # ensure the subpaths meet certain specifications
95
92
  if (
@@ -129,7 +126,8 @@ def _get_source_filepaths(
129
126
  .arrow()["table_name"]
130
127
  .to_pylist()
131
128
  # make sure the table names match with compartment + metadata names
132
- if any(target.lower() in table_name.lower() for target in targets)
129
+ if targets is not None
130
+ and any(target.lower() in table_name.lower() for target in targets)
133
131
  ]
134
132
  else:
135
133
  # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@ def _get_source_filepaths(
181
179
  return grouped_sources
182
180
 
183
181
 
184
- @python_app
185
182
  def _infer_source_datatype(
186
183
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
187
184
  ) -> str:
@@ -230,7 +227,6 @@ def _infer_source_datatype(
230
227
  return source_datatype
231
228
 
232
229
 
233
- @python_app
234
230
  def _filter_source_filepaths(
235
231
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
236
232
  ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
260
256
  if file["source_path"].stat().st_size > 0
261
257
  # ensure the datatype matches the source datatype
262
258
  and file["source_path"].suffix == f".{source_datatype}"
259
+ and _file_is_more_than_one_line(path=file["source_path"])
263
260
  ]
264
261
  for filegroup, files in sources.items()
265
262
  }
266
263
 
267
264
 
268
- @join_app
265
+ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
266
+ """
267
+ Check if the file has more than one line.
268
+
269
+ Args:
270
+ path (Union[pathlib.Path, AnyPath]):
271
+ The path to the file.
272
+
273
+ Returns:
274
+ bool:
275
+ True if the file has more than one line, False otherwise.
276
+
277
+ Raises:
278
+ NoInputDataException: If the file has zero lines.
279
+ """
280
+
281
+ # if we don't have a sqlite file
282
+ # (we can't check sqlite files for lines)
283
+ if path.suffix.lower() != ".sqlite":
284
+ with path.open("r") as f:
285
+ try:
286
+ # read two lines, if the second is empty return false
287
+ return bool(f.readline() and f.readline())
288
+
289
+ except StopIteration:
290
+ # If we encounter the end of the file, it has only one line
291
+ raise NoInputDataException(
292
+ f"Data file has 0 rows of values. Error in file: {path}"
293
+ )
294
+ else:
295
+ return True
296
+
297
+
269
298
  def _gather_sources(
270
299
  source_path: str,
271
300
  source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
295
324
  _infer_source_datatype,
296
325
  )
297
326
 
298
- source_path = _build_path(path=source_path, **kwargs)
327
+ built_path = _build_path(path=source_path, **kwargs)
299
328
 
300
329
  # gather filepaths which will be used as the basis for this work
301
330
  sources = _get_source_filepaths(
302
- path=source_path, targets=targets, source_datatype=source_datatype
331
+ path=built_path, targets=targets, source_datatype=source_datatype
303
332
  )
304
333
 
305
334
  # infer or validate the source datatype based on source filepaths
@@ -5,7 +5,7 @@ Utility functions for CytoTable
5
5
  import logging
6
6
  import os
7
7
  import pathlib
8
- from typing import Any, Dict, List, Optional, Union, cast
8
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
9
9
 
10
10
  import duckdb
11
11
  import parsl
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
149
149
  INSTALL sqlite_scanner;
150
150
  LOAD sqlite_scanner;
151
151
 
152
+ /* Install httpfs plugin to avoid error
153
+ https://github.com/duckdb/duckdb/issues/3243 */
154
+ INSTALL httpfs;
155
+
152
156
  /*
153
157
  Set threads available to duckdb
154
158
  See the following for more information:
@@ -169,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
169
173
  def _sqlite_mixed_type_query_to_parquet(
170
174
  source_path: str,
171
175
  table_name: str,
172
- chunk_size: int,
173
- offset: int,
176
+ page_key: str,
177
+ pageset: Tuple[Union[int, float], Union[int, float]],
174
178
  sort_output: bool,
175
- add_cytotable_meta: bool = False,
176
179
  ) -> str:
177
180
  """
178
181
  Performs SQLite table data extraction where one or many
@@ -184,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
184
187
  A str which is a path to a SQLite database file.
185
188
  table_name: str:
186
189
  The name of the table being queried.
187
- chunk_size: int:
188
- Row count to use for chunked output.
189
- offset: int:
190
- The offset for chunking the data from source.
190
+ page_key: str:
191
+ The column name to be used to identify pagination chunks.
192
+ pageset: Tuple[int, int]:
193
+ The range for values used for paginating data from source.
191
194
  sort_output: bool
192
195
  Specifies whether to sort cytotable output or not.
193
196
  add_cytotable_meta: bool, default=False:
@@ -201,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(
201
204
 
202
205
  import pyarrow as pa
203
206
 
204
- from cytotable.constants import (
205
- CYOTABLE_META_COLUMN_TYPES,
206
- SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
207
- )
207
+ from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
208
208
  from cytotable.exceptions import DatatypeException
209
209
 
210
210
  # open sqlite3 connection
@@ -264,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
264
264
  for col in column_info
265
265
  ]
266
266
 
267
- if add_cytotable_meta:
268
- query_parts += [
269
- (
270
- f"CAST( '{f'{source_path}_table_{table_name}'}' "
271
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
272
- "AS cytotable_meta_source_path"
273
- ),
274
- (
275
- f"CAST( {offset} "
276
- f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
277
- "AS cytotable_meta_offset"
278
- ),
279
- (
280
- f"CAST( (ROW_NUMBER() OVER ()) AS "
281
- f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
282
- "AS cytotable_meta_rownum"
283
- ),
284
- ]
285
-
286
267
  # perform the select using the cases built above and using chunksize + offset
287
- sql_stmt = (
288
- f"""
289
- SELECT
290
- {', '.join(query_parts)}
291
- FROM {table_name}
292
- ORDER BY {', '.join([col['column_name'] for col in column_info])}
293
- LIMIT {chunk_size} OFFSET {offset};
294
- """
295
- if sort_output
296
- else f"""
268
+ sql_stmt = f"""
297
269
  SELECT
298
270
  {', '.join(query_parts)}
299
271
  FROM {table_name}
300
- LIMIT {chunk_size} OFFSET {offset};
272
+ WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
273
+ {"ORDER BY " + page_key if sort_output else ""};
301
274
  """
302
- )
303
275
 
304
276
  # execute the sql stmt
305
277
  cursor.execute(sql_stmt)
@@ -322,7 +294,7 @@ def _sqlite_mixed_type_query_to_parquet(
322
294
  return pa.Table.from_pylist(results)
323
295
 
324
296
 
325
- def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
297
+ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
326
298
  """
327
299
  Takes a cloudpath and uses cache to convert to a local copy
328
300
  for use in scenarios where remote work is not possible (sqlite).
@@ -337,24 +309,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
337
309
  A local pathlib.Path to cached version of cloudpath file.
338
310
  """
339
311
 
340
- candidate_path = AnyPath(path)
341
-
342
312
  # check that the path is a file (caching won't work with a dir)
343
313
  # and check that the file is of sqlite type
344
314
  # (other file types will be handled remotely in cloud)
345
- if candidate_path.is_file() and candidate_path.suffix.lower() == ".sqlite":
315
+ if (
316
+ isinstance(path, CloudPath)
317
+ and path.is_file()
318
+ and path.suffix.lower() == ".sqlite"
319
+ ):
346
320
  try:
347
321
  # update the path to be the local filepath for reference in CytoTable ops
348
322
  # note: incurs a data read which will trigger caching of the file
349
- path = CloudPath(path).fspath
323
+ path = pathlib.Path(path.fspath)
350
324
  except InvalidPrefixError:
351
325
  # share information about not finding a cloud path
352
326
  logger.info(
353
327
  "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
354
328
  )
355
329
 
356
- # cast the result as a pathlib.Path
357
- return pathlib.Path(path)
330
+ return path
358
331
 
359
332
 
360
333
  def _arrow_type_cast_if_specified(
@@ -595,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> An
595
568
  if isinstance(sources, dict)
596
569
  else _unwrap_value(sources)
597
570
  )
571
+
572
+
573
+ def _generate_pagesets(
574
+ keys: List[Union[int, float]], chunk_size: int
575
+ ) -> List[Tuple[Union[int, float], Union[int, float]]]:
576
+ """
577
+ Generate a pageset (keyset pagination) from a list of keys.
578
+
579
+ Parameters:
580
+ keys List[Union[int, float]]:
581
+ List of keys to paginate.
582
+ chunk_size int:
583
+ Size of each chunk/page.
584
+
585
+ Returns:
586
+ List[Tuple[Union[int, float], Union[int, float]]]:
587
+ List of (start_key, end_key) tuples representing each page.
588
+ """
589
+
590
+ # Initialize an empty list to store the chunks/pages
591
+ chunks = []
592
+
593
+ # Start index for iteration through the keys
594
+ i = 0
595
+
596
+ while i < len(keys):
597
+ # Get the start key for the current chunk
598
+ start_key = keys[i]
599
+
600
+ # Calculate the end index for the current chunk
601
+ end_index = min(i + chunk_size, len(keys)) - 1
602
+
603
+ # Get the end key for the current chunk
604
+ end_key = keys[end_index]
605
+
606
+ # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
607
+ while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
608
+ end_index += 1
609
+
610
+ # Append the current chunk (start_key, end_key) to the list of chunks
611
+ chunks.append((start_key, end_key))
612
+
613
+ # Update the index to start from the next chunk
614
+ i = end_index + 1
615
+
616
+ # Return the list of chunks/pages
617
+ return chunks
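A quick illustration of the page boundaries this helper produces, using hypothetical key values:

```python
# Hypothetical, already-sorted key values; duplicate keys at a chunk
# boundary are absorbed into the same page so ranges never overlap.
from cytotable.utils import _generate_pagesets

keys = [1, 1, 2, 3, 4, 4, 4, 5]
print(_generate_pagesets(keys, chunk_size=3))
# [(1, 2), (3, 4), (5, 5)]
```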
618
+
619
+
620
+ def _natural_sort(list_to_sort):
621
+ """
622
+ Sorts the given iterable using natural sort adapted from approach
623
+ provided by the following link:
624
+ https://stackoverflow.com/a/4836734
625
+
626
+ Args:
627
+ list_to_sort: List:
628
+ The list to sort.
629
+
630
+ Returns:
631
+ List: The sorted list.
632
+ """
633
+ import re
634
+
635
+ return sorted(
636
+ list_to_sort,
637
+ # use a custom key to sort the list
638
+ key=lambda key: [
639
+ # use integer of c if it's a digit, otherwise str
640
+ int(c) if c.isdigit() else c
641
+ # Split the key into parts, separating numbers from alphabetic characters
642
+ for c in re.split("([0-9]+)", str(key))
643
+ ],
644
+ )
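And a short example of why natural sorting matters for the pageset-named parquet chunks concatenated earlier; the filenames are hypothetical:

```python
# Hypothetical chunk filenames; a plain lexicographic sort would place
# '10' before '2', while natural sort preserves numeric page order.
from cytotable.utils import _natural_sort

files = ["cells-10-19.parquet", "cells-2-9.parquet", "cells-100-109.parquet"]
print(_natural_sort(files))
# ['cells-2-9.parquet', 'cells-10-19.parquet', 'cells-100-109.parquet']
```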
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "CytoTable"
3
3
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
4
- version = "0.0.8"
4
+ version = "0.0.10"
5
5
  description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
6
6
  authors = ["Cytomining Community"]
7
7
  license = "BSD-3-Clause License"
@@ -25,7 +25,7 @@ build-backend = "poetry_dynamic_versioning.backend"
25
25
  [tool.poetry.dependencies]
26
26
  python = ">=3.8,<3.13"
27
27
  pyarrow = ">=13.0.0"
28
- cloudpathlib = {extras = ["all"], version = "^0.18.0"}
28
+ cloudpathlib = {extras = ["all", "s3"], version = "^0.18.0"}
29
29
  duckdb = ">=0.8.0,!=0.10.0,>=0.10.1"
30
30
  parsl = ">=2023.9.25"
31
31
  numpy = [
@@ -43,10 +43,10 @@ pytest-cov = "^4.1.0"
43
43
  Sphinx = "^6.0.0"
44
44
  myst-parser = "^2.0.0"
45
45
  sphinxcontrib-mermaid = "^0.9.0"
46
- moto = {extras = ["s3", "server"], version = "^4.0.0"}
47
46
  cytominer-database = "^0.3.4"
48
47
  pycytominer = "^1.1.0"
49
48
  dunamai = "^1.19.0"
49
+ botocore = "^1.34.133" # added to help avoid dependency reolution issues
50
50
 
51
51
  [tool.vulture]
52
52
  min_confidence = 80
@@ -69,3 +69,6 @@ filterwarnings = [
69
69
  "ignore::DeprecationWarning:pkg_resources",
70
70
  "ignore::DeprecationWarning:cytominer_database",
71
71
  ]
72
+ markers = [
73
+ "large_data_tests: tests which involve the use of large data.",
74
+ ]