CytoTable 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
- cytotable/__init__.py +1 -1
- cytotable/constants.py +0 -7
- cytotable/convert.py +167 -161
- cytotable/presets.py +56 -0
- cytotable/utils.py +85 -43
- {cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/METADATA +1 -1
- cytotable-0.0.10.dist-info/RECORD +11 -0
- cytotable-0.0.9.dist-info/RECORD +0 -11
- {cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/LICENSE +0 -0
- {cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/WHEEL +0 -0
cytotable/__init__.py
CHANGED
cytotable/constants.py
CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
     ],
 }
 
-# metadata column names and types for internal use within CytoTable
-CYOTABLE_META_COLUMN_TYPES = {
-    "cytotable_meta_source_path": "VARCHAR",
-    "cytotable_meta_offset": "BIGINT",
-    "cytotable_meta_rownum": "BIGINT",
-}
-
 CYTOTABLE_DEFAULT_PARQUET_METADATA = {
     "data-producer": "https://github.com/cytomining/CytoTable",
     "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py
CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.
 
 import itertools
 import logging
-import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
 
     Args:
         source: Dict[str, Any]
-            Contains the source data to be chunked. Represents a single
+            Contains source data details. Represents a single
             file or table of some kind.
         sort_output:
             Specifies whether to sort cytotable output or not.
@@ -43,10 +42,7 @@ def _get_table_columns_and_types(
            list of dictionaries which each include column level information
     """
 
-    import pathlib
-
     import duckdb
-    from cloudpathlib import AnyPath
 
     from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
 
@@ -89,7 +85,7 @@ def _get_table_columns_and_types(
     # with exception handling to read mixed-type data
     # using sqlite3 and special utility function
     try:
-        # isolate using new connection to read data
+        # isolate using new connection to read data based on pageset
         # and export directly to parquet via duckdb (avoiding need to return data to python)
         # perform the query and create a list of dictionaries with the column data for table
         with _duckdb_reader() as ddb_reader:
@@ -109,13 +105,8 @@ def _get_table_columns_and_types(
             arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
                 source_path=str(source["source_path"]),
                 table_name=str(source["table_name"]),
-
-
-                chunk_size=5,
-                # offset is set to 0 start at first row
-                # result from table
-                offset=0,
-                add_cytotable_meta=False,
+                page_key=source["page_key"],
+                pageset=source["pagesets"][0],
                 sort_output=sort_output,
             )
             with _duckdb_reader() as ddb_reader:
@@ -183,13 +174,14 @@ def _prep_cast_column_data_types(
 
 
 @python_app
-def _get_table_chunk_offsets(
+def _get_table_keyset_pagination_sets(
     chunk_size: int,
+    page_key: str,
     source: Optional[Dict[str, Any]] = None,
     sql_stmt: Optional[str] = None,
-) -> Union[List[int], None]:
+) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
     """
-    Get table data chunk
+    Get table data chunk keys for later use in capturing segments
     of values. This work also provides a chance to catch problematic
     input data which will be ignored with warnings.
 
@@ -199,21 +191,27 @@ def _get_table_chunk_offsets(
             file or table of some kind.
         chunk_size: int
             The size in rowcount of the chunks to create.
+        page_key: str
+            The column name to be used to identify pagination chunks.
+            Expected to be of numeric type (int, float) for ordering.
+        sql_stmt:
+            Optional sql statement to form the pagination set from.
+            Default behavior extracts pagination sets from the full
+            data source.
 
     Returns:
-        List[int]
-            List of
-            the data later on.
+        List[Any]
+            List of keys to use for reading the data later on.
     """
 
     import logging
-    import
+    import sqlite3
+    from contextlib import closing
 
     import duckdb
-    from cloudpathlib import AnyPath, CloudPath
 
     from cytotable.exceptions import NoInputDataException
-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _generate_pagesets
 
     logger = logging.getLogger(__name__)
 
@@ -223,18 +221,29 @@ def _get_table_chunk_offsets(
         source_type = str(source_path.suffix).lower()
 
         try:
-            # gather the total rowcount from csv or sqlite data input sources
             with _duckdb_reader() as ddb_reader:
-
-
-
-
-
-
-                ).
-
+                if source_type == ".csv":
+                    sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+                else:
+                    sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+                page_keys = [
+                    results[0] for results in ddb_reader.execute(sql_query).fetchall()
+                ]
+
+        # exception case for when we have mixed types
+        # (i.e. integer col with string and ints) in a sqlite column
+        except duckdb.TypeMismatchException:
+            with closing(sqlite3.connect(source_path)) as cx:
+                with cx:
+                    page_keys = [
+                        key[0]
+                        for key in cx.execute(
+                            f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+                        ).fetchall()
+                        if isinstance(key[0], (int, float))
+                    ]
 
-        # catch input errors which will result in skipped files
         except (
             duckdb.InvalidInputException,
            NoInputDataException,
@@ -245,34 +254,20 @@ def _get_table_chunk_offsets(
 
             return None
 
-        # find chunk offsets from sql statement
     elif sql_stmt is not None:
-        # gather the total rowcount from csv or sqlite data input sources
         with _duckdb_reader() as ddb_reader:
-
-
-
-                    f"SELECT COUNT(*) FROM ({sql_stmt})"
-                ).fetchone()[0]
-            )
+            sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+            page_keys = ddb_reader.execute(sql_query).fetchall()
+            page_keys = [key[0] for key in page_keys]
 
-        return list(
-            range(
-                0,
-                # gather rowcount from table and use as maximum for range
-                rowcount,
-                # step through using chunk size
-                chunk_size,
-            )
-        )
+        return _generate_pagesets(page_keys, chunk_size)
 
 
 @python_app
-def _source_chunk_to_parquet(
+def _source_pageset_to_parquet(
     source_group_name: str,
     source: Dict[str, Any],
-    chunk_size: int,
-    offset: int,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     dest_path: str,
     sort_output: bool,
 ) -> str:
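The hunks above replace COUNT(*) rowcounts and OFFSET-stepped ranges with keyset pagination: the page-key column is read in sorted order, then folded into (start, end) key ranges. A minimal standalone sketch of that flow follows; it is illustrative only, and the "cells.csv" file with a numeric "ObjectNumber" column is an assumption, not part of this diff.

import duckdb

def pagesets(keys, chunk_size):
    # mirror _generate_pagesets: walk sorted keys, emit (start, end) ranges,
    # extending a range while the next key duplicates its end value
    pages, i = [], 0
    while i < len(keys):
        end = min(i + chunk_size, len(keys)) - 1
        while end + 1 < len(keys) and keys[end + 1] == keys[end]:
            end += 1
        pages.append((keys[i], keys[end]))
        i = end + 1
    return pages

ddb = duckdb.connect()
keys = [
    row[0]
    for row in ddb.execute(
        "SELECT ObjectNumber FROM read_csv_auto('cells.csv') ORDER BY ObjectNumber"
    ).fetchall()
]
ddb.close()

print(pagesets(keys, chunk_size=1000))  # e.g. [(1, 1000), (1001, 2000), ...]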
@@ -285,10 +280,8 @@ def _source_chunk_to_parquet(
         source: Dict[str, Any]
             Contains the source data to be chunked. Represents a single
             file or table of some kind along with collected information about table.
-        chunk_size: int
-            The size in rowcount of the chunks to create.
-        offset: int
-            The offset for chunking the data from source.
+        pageset: Tuple[int, int]
+            The pageset for chunking the data from source.
         dest_path: str
             Path to store the output data.
         sort_output: bool
@@ -303,9 +296,7 @@ def _source_chunk_to_parquet(
 
     import duckdb
     from cloudpathlib import AnyPath
-    from pyarrow import parquet
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import (
         _duckdb_reader,
         _sqlite_mixed_type_query_to_parquet,
@@ -319,26 +310,6 @@ def _source_chunk_to_parquet(
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
-    source_path_str = (
-        source["source_path"]
-        if "table_name" not in source.keys()
-        else f"{source['source_path']}_table_{source['table_name']}"
-    )
-    # build the column selection block of query
-
-    # add cytotable metadata columns
-    cytotable_metadata_cols = [
-        (
-            f"CAST( '{source_path_str}' "
-            f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
-            ' AS "cytotable_meta_source_path"'
-        ),
-        f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
-        (
-            f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
-            ' AS "cytotable_meta_rownum"'
-        ),
-    ]
     # add source table columns
     casted_source_cols = [
         # here we cast the column to the specified type ensure the colname remains the same
@@ -349,7 +320,7 @@ def _source_chunk_to_parquet(
     # create selection statement from lists above
     select_columns = ",".join(
         # if we should sort the output, add the metadata_cols
-        cytotable_metadata_cols + casted_source_cols
+        casted_source_cols
         if sort_output
         else casted_source_cols
     )
@@ -364,7 +335,8 @@ def _source_chunk_to_parquet(
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
-
+    # form a filepath which indicates the pageset
+    result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"
 
     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -377,14 +349,9 @@ def _source_chunk_to_parquet(
                 table=ddb_reader.execute(
                     f"""
                     {base_query}
-
-
-
-                    """
-                    if sort_output
-                    else f"""
-                    {base_query}
-                    LIMIT {chunk_size} OFFSET {offset}
+                    WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+                    /* optional ordering per pageset */
+                    {"ORDER BY " + source['page_key'] if sort_output else ""};
                     """
                 ).arrow(),
                 where=result_filepath,
@@ -406,9 +373,8 @@ def _source_chunk_to_parquet(
                 table=_sqlite_mixed_type_query_to_parquet(
                     source_path=str(source["source_path"]),
                     table_name=str(source["table_name"]),
-                    chunk_size=chunk_size,
-                    offset=offset,
-                    add_cytotable_meta=True if sort_output else False,
+                    page_key=source["page_key"],
+                    pageset=pageset,
                     sort_output=sort_output,
                 ),
                 where=result_filepath,
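Each pageset then drives a BETWEEN-bounded query that lands in its own parquet file named by its key range. A hedged sketch of that export pattern outside CytoTable follows; the file path, column name, and function name are placeholders rather than CytoTable's internal API.

import duckdb
from pyarrow import parquet

def export_pageset(csv_path, page_key, pageset, dest):
    # read one key range with duckdb and hand the Arrow table to pyarrow
    ddb = duckdb.connect()
    tbl = ddb.execute(
        f"""
        SELECT * FROM read_csv_auto('{csv_path}')
        WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
        ORDER BY {page_key}
        """
    ).arrow()
    ddb.close()
    # one file per page, named by its key range as in the diff above
    parquet.write_table(table=tbl, where=dest)

export_pageset("cells.csv", "ObjectNumber", (1, 1000), "cells-1-1000.parquet")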
@@ -458,10 +424,7 @@ def _prepend_column_name(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-    )
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     from cytotable.utils import _write_parquet_table_with_metadata
 
     logger = logging.getLogger(__name__)
@@ -472,7 +435,7 @@ def _prepend_column_name(
     if len(targets) == 0:
         logger.warning(
             msg=(
-                "Skipping column name prepend operations"
+                "Skipping column name prepend operations "
                 "because no compartments or metadata were provided."
             )
         )
@@ -509,10 +472,8 @@ def _prepend_column_name(
         # source_group_name_stem: 'Cells'
         # column_name: 'AreaShape_Area'
         # updated_column_name: 'Cells_AreaShape_Area'
-        if (
-            column_name not in identifying_columns
-            and not column_name.startswith(source_group_name_stem.capitalize())
-            and column_name not in CYOTABLE_META_COLUMN_TYPES
+        if column_name not in identifying_columns and not column_name.startswith(
+            source_group_name_stem.capitalize()
         ):
             updated_column_names.append(f"{source_group_name_stem}_{column_name}")
         # if-condition for prepending 'Metadata_' to column name
@@ -574,6 +535,7 @@ def _concat_source_group(
     source_group: List[Dict[str, Any]],
     dest_path: str,
     common_schema: Optional[List[Tuple[str, str]]] = None,
+    sort_output: bool = True,
 ) -> List[Dict[str, Any]]:
     """
     Concatenate group of source data together as single file.
@@ -620,6 +582,8 @@ def _concat_source_group(
         common_schema: List[Tuple[str, str]] (Default value = None)
             Common schema to use for concatenation amongst arrow tables
             which may have slightly different but compatible schema.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
 
     Returns:
         List[Dict[str, Any]]
@@ -637,7 +601,7 @@ def _concat_source_group(
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -676,7 +640,10 @@ def _concat_source_group(
         # (all must be the same schema)
         with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
             for source in source_group:
-
+                tables = [table for table in source["table"]]
+                if sort_output:
+                    tables = _natural_sort(tables)
+                for table in tables:
                     # if we haven't inferred the common schema
                     # check that our file matches the expected schema, otherwise raise an error
                     if common_schema is None and not writer_schema.equals(
@@ -720,7 +687,6 @@ def _concat_source_group(
 def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
     joins: str,
-    sort_output: bool,
 ) -> str:
     """
     Prepare join SQL statement with actual locations of data based on the sources.
@@ -741,8 +707,6 @@ def _prepare_join_sql(
     """
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
     # replace with real location of sources for join sql
     order_by_tables = []
     for key, val in sources.items():
@@ -754,25 +718,17 @@ def _prepare_join_sql(
         )
         order_by_tables.append(table_name)
 
-    # create order by statement with from all tables using cytotable metadata
-    order_by_sql = "ORDER BY " + ", ".join(
-        [
-            f"{table}.{meta_column}"
-            for table in order_by_tables
-            for meta_column in CYOTABLE_META_COLUMN_TYPES
-        ]
-    )
-
     # add the order by statements to the join
-    return joins + order_by_sql if sort_output else joins
+    return joins
 
 
 @python_app
-def _join_source_chunk(
+def _join_source_pageset(
     dest_path: str,
     joins: str,
-    chunk_size: int,
-    offset: int,
+    page_key: str,
+    pageset: Tuple[int, int],
+    sort_output: bool,
     drop_null: bool,
 ) -> str:
     """
@@ -798,31 +754,20 @@ def _join_source_chunk(
 
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
-    # Attempt to read the data to parquet file
-    # using duckdb for extraction and pyarrow for
-    # writing data to a parquet file.
-    # read data with chunk size + offset
-    # and export to parquet
-    exclude_meta_cols = [
-        f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
-    ]
-
     with _duckdb_reader() as ddb_reader:
         result = ddb_reader.execute(
             f"""
-
+            WITH joined AS (
                 {joins}
-
-
-
-
-
-
-
-            """
+            )
+            SELECT *
+            FROM joined
+            WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+            /* optional sorting per pagset */
+            {"ORDER BY " + page_key if sort_output else ""};
+            """
         ).arrow()
 
     # drop nulls if specified
@@ -847,10 +792,8 @@ def _join_source_chunk(
         f"{str(pathlib.Path(dest_path).parent)}/"
         # use the dest_path stem in the name
        f"{str(pathlib.Path(dest_path).stem)}-"
-        #
-
-        # and before they are brought together as one dataset
-        f"{str(uuid.uuid4().hex)}.parquet"
+        # add the pageset indication to the filename
+        f"{pageset[0]}-{pageset[1]}.parquet"
     )
 
     # write the result
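Joins are paged the same way: the prepared join SQL is wrapped in a CTE and filtered by the 'join' page key, one pageset per output file. The sketch below is illustrative only; the parquet paths and the join statement are invented stand-ins for the joins SQL that CytoTable's presets provide.

import duckdb

joins = """
SELECT *
FROM read_parquet('image.parquet') AS image
JOIN read_parquet('cytoplasm.parquet') AS cytoplasm USING (ImageNumber)
"""

page_key, pageset = "Cytoplasm_Number_Object_Number", (1, 1000)
ddb = duckdb.connect()
result = ddb.execute(
    f"""
    WITH joined AS (
        {joins}
    )
    SELECT *
    FROM joined
    WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
    ORDER BY {page_key};
    """
).arrow()
ddb.close()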
@@ -867,6 +810,7 @@ def _concat_join_sources(
     sources: Dict[str, List[Dict[str, Any]]],
     dest_path: str,
     join_sources: List[str],
+    sort_output: bool = True,
 ) -> str:
     """
     Concatenate join sources from parquet-based chunks.
@@ -883,6 +827,8 @@ def _concat_join_sources(
         join_sources: List[str]:
             List of local filepath destination for join source chunks
             which will be concatenated.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
 
     Returns:
         str
@@ -898,7 +844,7 @@ def _concat_join_sources(
         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -918,7 +864,11 @@ def _concat_join_sources(
         CYTOTABLE_DEFAULT_PARQUET_METADATA
     )
     with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
-        for table_path in join_sources:
+        for table_path in (
+            join_sources
+            if not sort_output
+            else _natural_sort(list_to_sort=join_sources)
+        ):
             writer.write_table(
                 parquet.read_table(
                     table_path,
@@ -1042,6 +992,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
     infer_common_schema: bool,
     drop_null: bool,
     sort_output: bool,
+    page_keys: Dict[str, str],
     data_type_cast_map: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1082,6 +1033,9 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
             Whether to drop null results.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
+        page_keys: Dict[str, str]
+            A dictionary which defines which column names are used for keyset pagination
+            in order to perform data extraction.
         data_type_cast_map: Dict[str, str]
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
@@ -1112,16 +1066,35 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
     # expand the destination path
     expanded_dest_path = _expand_path(path=dest_path)
 
-    #
-
+    # check that each source group name has a pagination key
+    for source_group_name in sources.keys():
+        matching_keys = [
+            key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+        ]
+        if not matching_keys:
+            raise CytoTableException(
+                f"No matching key found in page_keys for source_group_name: {source_group_name}."
+                "Please include a pagination key based on a column name from the table."
+            )
+
+    # prepare pagesets for chunked data export from source tables
+    pagesets_prepared = {
         source_group_name: [
             dict(
                 source,
                 **{
-                    "
+                    "page_key": (
+                        page_key := [
+                            value
+                            for key, value in page_keys.items()
+                            if key.lower() in source_group_name.lower()
+                        ][0]
+                    ),
+                    "pagesets": _get_table_keyset_pagination_sets(
                         source=source,
                         chunk_size=chunk_size,
-
+                        page_key=page_key,
+                    ),
                 },
             )
             for source in source_group_vals
@@ -1129,17 +1102,17 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in sources.items()
     }
 
-    # if
+    # if pagesets is none and we haven't halted, remove the file as there
     # were input formatting errors which will create challenges downstream
     invalid_files_dropped = {
         source_group_name: [
-            # ensure we have
+            # ensure we have pagesets
             source
             for source in source_group_vals
-            if source["
+            if source["pagesets"] is not None
         ]
         for source_group_name, source_group_vals in evaluate_futures(
-
+            pagesets_prepared
         ).items()
         # ensure we have source_groups with at least one source table
         if len(source_group_vals) > 0
@@ -1172,12 +1145,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                     "table": [
                         # perform column renaming and create potential return result
                         _prepend_column_name(
-                            # perform chunked data export to parquet using
-                            table_path=_source_chunk_to_parquet(
+                            # perform chunked data export to parquet using pagesets
+                            table_path=_source_pageset_to_parquet(
                                 source_group_name=source_group_name,
                                 source=source,
-                                chunk_size=chunk_size,
-                                offset=offset,
+                                pageset=pageset,
                                 dest_path=expanded_dest_path,
                                 sort_output=sort_output,
                             ),
@@ -1186,7 +1158,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                             metadata=metadata,
                             compartments=compartments,
                         )
-                        for offset in source["offsets"]
+                        for pageset in source["pagesets"]
                     ]
                 },
             )
@@ -1227,6 +1199,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
                 source_group=source_group_vals[0]["sources"],
                 dest_path=expanded_dest_path,
                 common_schema=source_group_vals[0]["common_schema"],
+                sort_output=sort_output,
             )
             for source_group_name, source_group_vals in evaluate_futures(
                 common_schema_determined
@@ -1240,28 +1213,34 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
         evaluated_results = evaluate_futures(results)
 
         prepared_joins_sql = _prepare_join_sql(
-            sources=evaluated_results, joins=joins, sort_output=sort_output
+            sources=evaluated_results, joins=joins
        ).result()
 
+        page_key_join = [
+            value for key, value in page_keys.items() if key.lower() == "join"
+        ][0]
+
         # map joined results based on the join groups gathered above
         # note: after mapping we end up with a list of strings (task returns str)
         join_sources_result = [
-            _join_source_chunk(
+            _join_source_pageset(
                 # gather the result of concatted sources prior to
                 # join group merging as each mapped task run will need
                 # full concat results
                 dest_path=expanded_dest_path,
                 joins=prepared_joins_sql,
-                chunk_size=chunk_size,
-                offset=offset,
+                page_key=page_key_join,
+                pageset=pageset,
+                sort_output=sort_output,
                 drop_null=drop_null,
             )
             # create join group for querying the concatenated
             # data in order to perform memory-safe joining
             # per user chunk size specification.
-            for offset in _get_table_chunk_offsets(
+            for pageset in _get_table_keyset_pagination_sets(
                 sql_stmt=prepared_joins_sql,
                 chunk_size=chunk_size,
+                page_key=page_key_join,
             ).result()
         ]
 
@@ -1272,6 +1251,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
             dest_path=expanded_dest_path,
             join_sources=[join.result() for join in join_sources_result],
             sources=evaluated_results,
+            sort_output=sort_output,
         )
 
         # wrap the final result as a future and return
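Note the matching rule introduced above: each source group receives the first page_keys entry whose key is a case-insensitive substring of the group name, and a missing match raises an exception. A small standalone restatement of that logic (the ValueError and the sample file name are illustrative):

page_keys = {
    "image": "ImageNumber",
    "cells": "ObjectNumber",
    "join": "Cytoplasm_Number_Object_Number",
}

def page_key_for(source_group_name: str) -> str:
    # case-insensitive substring match, first hit wins
    matches = [
        value
        for key, value in page_keys.items()
        if key.lower() in source_group_name.lower()
    ]
    if not matches:
        raise ValueError(f"No matching key found in page_keys for {source_group_name}.")
    return matches[0]

print(page_key_for("Per_Cells.sqlite"))  # ObjectNumber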
@@ -1293,6 +1273,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    page_keys: Optional[Dict[str, str]] = None,
     sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
@@ -1341,6 +1322,12 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
             https://arrow.apache.org/docs/python/api/datatypes.html
+        page_keys: str:
+            The table and column names to be used for key pagination.
+            Uses the form: {"table_name":"column_name"}.
+            Expects columns to include numeric data (ints or floats).
+            Interacts with the `chunk_size` parameter to form
+            pages of `chunk_size`.
         sort_output: bool (Default value = True)
             Specifies whether to sort cytotable output or not.
         drop_null: bool (Default value = False)
@@ -1440,6 +1427,24 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
         if chunk_size is None
         else chunk_size
     )
+    page_keys = (
+        cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+        if page_keys is None
+        else page_keys
+    )
+
+    # Raise an exception for scenarios where one configures CytoTable to join
+    # but does not provide a pagination key for the joins.
+    if join and (page_keys is None or "join" not in page_keys.keys()):
+        raise CytoTableException(
+            (
+                "When using join=True one must pass a 'join' pagination key "
+                "in the page_keys parameter. The 'join' pagination key is a column "
+                "name found within the joined results based on the SQL provided from "
+                "the joins parameter. This special key is required as not all columns "
+                "from the source tables might not be included."
+            )
+        )
 
     # send sources to be written to parquet if selected
     if dest_datatype == "parquet":
@@ -1458,6 +1463,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals
             drop_null=drop_null,
             data_type_cast_map=data_type_cast_map,
             sort_output=sort_output,
+            page_keys=cast(dict, page_keys),
             **kwargs,
         )
 
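Taken together, convert() now accepts page_keys and validates that join=True comes with a 'join' key. A minimal usage sketch follows; the paths and preset choice are illustrative, and the page_keys argument may be omitted entirely to fall back on the preset's CONFIG_PAGE_KEYS.

import cytotable

result = cytotable.convert(
    source_path="analysis.sqlite",      # placeholder input
    dest_path="analysis.parquet",       # placeholder output
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    chunk_size=10000,
    # override the preset's CONFIG_PAGE_KEYS; a "join" key is
    # required whenever join=True
    page_keys={
        "image": "ImageNumber",
        "cells": "ObjectNumber",
        "nuclei": "ObjectNumber",
        "cytoplasm": "ObjectNumber",
        "join": "Cytoplasm_Number_Object_Number",
    },
)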
cytotable/presets.py
CHANGED
@@ -22,6 +22,16 @@ config = {
             "Parent_Cells",
             "Parent_Nuclei",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -61,6 +71,16 @@ config = {
             "Parent_Cells",
             "Parent_Nuclei",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "Cells_Number_Object_Number",
+            "nuclei": "Nuclei_Number_Object_Number",
+            "cytoplasm": "Cytoplasm_Number_Object_Number",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -104,6 +124,16 @@ config = {
             "Parent_Cells",
             "Parent_Nuclei",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -155,6 +185,16 @@ config = {
             "Cells_Number_Object_Number",
             "Nuclei_Number_Object_Number",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "Cells_Number_Object_Number",
+            "nuclei": "Nuclei_Number_Object_Number",
+            "cytoplasm": "Cytoplasm_Number_Object_Number",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -203,6 +243,16 @@ config = {
             "Cells_ObjectNumber",
             "Nuclei_ObjectNumber",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -248,6 +298,12 @@ config = {
             "Z",
             "T",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "test": '"OBJECT ID"',
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
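When page_keys is not passed to convert(), the chosen preset's CONFIG_PAGE_KEYS becomes the default, mirroring the convert() hunk in convert.py above. A small sketch, assuming the default "cellprofiler_csv" preset corresponds to one of the blocks above (all of which use the same 'join' key):

from cytotable.presets import config

preset = "cellprofiler_csv"
page_keys = None  # i.e. the caller did not pass page_keys to convert()

# fall back to the preset's pagination keys when none are given
page_keys = (
    config[preset]["CONFIG_PAGE_KEYS"] if page_keys is None else page_keys
)
print(page_keys["join"])  # Cytoplasm_Number_Object_Number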
cytotable/utils.py
CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
 import logging
 import os
 import pathlib
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import duckdb
 import parsl
@@ -173,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
 def _sqlite_mixed_type_query_to_parquet(
     source_path: str,
     table_name: str,
-    chunk_size: int,
-    offset: int,
+    page_key: str,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     sort_output: bool,
-    add_cytotable_meta: bool = False,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -188,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
             A str which is a path to a SQLite database file.
         table_name: str:
             The name of the table being queried.
-
-
-
-            The
+        page_key: str:
+            The column name to be used to identify pagination chunks.
+        pageset: Tuple[int, int]:
+            The range for values used for paginating data from source.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
         add_cytotable_meta: bool, default=False:
@@ -205,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
-    )
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
 
     # open sqlite3 connection
@@ -268,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
                 for col in column_info
             ]
 
-            if add_cytotable_meta:
-                query_parts += [
-                    (
-                        f"CAST( '{f'{source_path}_table_{table_name}'}' "
-                        f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
-                        "AS cytotable_meta_source_path"
-                    ),
-                    (
-                        f"CAST( {offset} "
-                        f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
-                        "AS cytotable_meta_offset"
-                    ),
-                    (
-                        f"CAST( (ROW_NUMBER() OVER ()) AS "
-                        f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
-                        "AS cytotable_meta_rownum"
-                    ),
-                ]
-
             # perform the select using the cases built above and using chunksize + offset
-            sql_stmt = (
-                f"""
+            sql_stmt = f"""
                 SELECT
                     {', '.join(query_parts)}
                 FROM {table_name}
-
-
+                WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+                {"ORDER BY " + page_key if sort_output else ""};
                 """
-                if sort_output
-                else f"""
-                SELECT
-                    {', '.join(query_parts)}
-                FROM {table_name}
-                LIMIT {chunk_size} OFFSET {offset};
-                """
-            )
 
             # execute the sql stmt
             cursor.execute(sql_stmt)
@@ -600,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any
         if isinstance(sources, dict)
         else _unwrap_value(sources)
     )
+
+
+def _generate_pagesets(
+    keys: List[Union[int, float]], chunk_size: int
+) -> List[Tuple[Union[int, float], Union[int, float]]]:
+    """
+    Generate a pageset (keyset pagination) from a list of keys.
+
+    Parameters:
+        keys List[Union[int, float]]:
+            List of keys to paginate.
+        chunk_size int:
+            Size of each chunk/page.
+
+    Returns:
+        List[Tuple[Union[int, float], Union[int, float]]]:
+            List of (start_key, end_key) tuples representing each page.
+    """
+
+    # Initialize an empty list to store the chunks/pages
+    chunks = []
+
+    # Start index for iteration through the keys
+    i = 0
+
+    while i < len(keys):
+        # Get the start key for the current chunk
+        start_key = keys[i]
+
+        # Calculate the end index for the current chunk
+        end_index = min(i + chunk_size, len(keys)) - 1
+
+        # Get the end key for the current chunk
+        end_key = keys[end_index]
+
+        # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+        while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+            end_index += 1
+
+        # Append the current chunk (start_key, end_key) to the list of chunks
+        chunks.append((start_key, end_key))
+
+        # Update the index to start from the next chunk
+        i = end_index + 1
+
+    # Return the list of chunks/pages
+    return chunks
+
+
+def _natural_sort(list_to_sort):
+    """
+    Sorts the given iterable using natural sort adapted from approach
+    provided by the following link:
+    https://stackoverflow.com/a/4836734
+
+    Args:
+        list_to_sort: List:
+            The list to sort.
+
+    Returns:
+        List: The sorted list.
+    """
+    import re
+
+    return sorted(
+        list_to_sort,
+        # use a custom key to sort the list
+        key=lambda key: [
+            # use integer of c if it's a digit, otherwise str
+            int(c) if c.isdigit() else c
+            # Split the key into parts, separating numbers from alphabetic characters
+            for c in re.split("([0-9]+)", str(key))
+        ],
+    )
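The behavior of the two helpers follows directly from the implementations above; a quick check runnable against cytotable.utils in 0.0.10 (these are private helpers, so the import is for illustration only):

from cytotable.utils import _generate_pagesets, _natural_sort

# duplicate key values are folded into a single page so ranges never overlap
print(_generate_pagesets([1, 2, 3, 3, 3, 4], chunk_size=2))
# [(1, 2), (3, 3), (4, 4)]

# numeric segments sort by value, so pageset-named chunk files order
# as written rather than lexically
print(_natural_sort(["t-10-19.parquet", "t-2-9.parquet", "t-1-1.parquet"]))
# ['t-1-1.parquet', 't-2-9.parquet', 't-10-19.parquet']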
{cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.9
+Version: 0.0.10
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
cytotable-0.0.10.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
+cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
+cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
+cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cytotable-0.0.10.dist-info/RECORD,,
cytotable-0.0.9.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
-cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
-cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
-cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
-cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
-cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
-cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.9.dist-info/RECORD,,
{cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/LICENSE
File without changes
{cytotable-0.0.9.dist-info → cytotable-0.0.10.dist-info}/WHEEL
File without changes