CytoTable 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cytotable/__init__.py +1 -1
- cytotable/constants.py +0 -7
- cytotable/convert.py +175 -176
- cytotable/presets.py +104 -0
- cytotable/sources.py +45 -16
- cytotable/utils.py +97 -50
- {cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/METADATA +2 -2
- cytotable-0.0.10.dist-info/RECORD +11 -0
- cytotable-0.0.8.dist-info/RECORD +0 -11
- {cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/LICENSE +0 -0
- {cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/WHEEL +0 -0
cytotable/__init__.py
CHANGED
cytotable/constants.py
CHANGED
@@ -68,13 +68,6 @@ SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
     ],
 }
 
-# metadata column names and types for internal use within CytoTable
-CYOTABLE_META_COLUMN_TYPES = {
-    "cytotable_meta_source_path": "VARCHAR",
-    "cytotable_meta_offset": "BIGINT",
-    "cytotable_meta_rownum": "BIGINT",
-}
-
 CYTOTABLE_DEFAULT_PARQUET_METADATA = {
     "data-producer": "https://github.com/cytomining/CytoTable",
     "data-producer-version": str(_get_cytotable_version()),
cytotable/convert.py
CHANGED
@@ -4,7 +4,6 @@ CytoTable: convert - transforming data for use with pyctyominer.
 
 import itertools
 import logging
-import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import parsl
@@ -33,7 +32,7 @@ def _get_table_columns_and_types(
 
     Args:
         source: Dict[str, Any]
-            Contains
+            Contains source data details. Represents a single
             file or table of some kind.
         sort_output:
            Specifies whether to sort cytotable output or not.
@@ -43,14 +42,12 @@ def _get_table_columns_and_types(
         list of dictionaries which each include column level information
     """
 
-    import pathlib
-
     import duckdb
 
     from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
 
     source_path = source["source_path"]
-    source_type = str(
+    source_type = str(source_path.suffix).lower()
 
     # prepare the data source in the form of a duckdb query
     select_source = (
@@ -88,7 +85,7 @@ def _get_table_columns_and_types(
     # with exception handling to read mixed-type data
     # using sqlite3 and special utility function
     try:
-        # isolate using new connection to read data
+        # isolate using new connection to read data based on pageset
         # and export directly to parquet via duckdb (avoiding need to return data to python)
         # perform the query and create a list of dictionaries with the column data for table
         with _duckdb_reader() as ddb_reader:
@@ -108,13 +105,8 @@ def _get_table_columns_and_types(
             arrow_data_tbl = _sqlite_mixed_type_query_to_parquet(
                 source_path=str(source["source_path"]),
                 table_name=str(source["table_name"]),
-
-
-                chunk_size=5,
-                # offset is set to 0 start at first row
-                # result from table
-                offset=0,
-                add_cytotable_meta=False,
+                page_key=source["page_key"],
+                pageset=source["pagesets"][0],
                 sort_output=sort_output,
             )
             with _duckdb_reader() as ddb_reader:
@@ -182,13 +174,14 @@ def _prep_cast_column_data_types(
 
 
 @python_app
-def _get_table_chunk_offsets(
+def _get_table_keyset_pagination_sets(
     chunk_size: int,
+    page_key: str,
     source: Optional[Dict[str, Any]] = None,
     sql_stmt: Optional[str] = None,
-) -> Union[List[int], None]:
+) -> Union[List[Tuple[Union[int, float], Union[int, float]]], None]:
     """
-    Get table data chunk
+    Get table data chunk keys for later use in capturing segments
     of values. This work also provides a chance to catch problematic
     input data which will be ignored with warnings.
 
@@ -198,51 +191,59 @@ def _get_table_chunk_offsets(
             file or table of some kind.
         chunk_size: int
             The size in rowcount of the chunks to create.
+        page_key: str
+            The column name to be used to identify pagination chunks.
+            Expected to be of numeric type (int, float) for ordering.
+        sql_stmt:
+            Optional sql statement to form the pagination set from.
+            Default behavior extracts pagination sets from the full
+            data source.
 
     Returns:
-        List[
-        List of
-        the data later on.
+        List[Any]
+            List of keys to use for reading the data later on.
     """
 
     import logging
-    import
+    import sqlite3
+    from contextlib import closing
 
     import duckdb
-    from cloudpathlib import AnyPath
 
     from cytotable.exceptions import NoInputDataException
-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _generate_pagesets
 
     logger = logging.getLogger(__name__)
 
     if source is not None:
         table_name = source["table_name"] if "table_name" in source.keys() else None
         source_path = source["source_path"]
-        source_type = str(
+        source_type = str(source_path.suffix).lower()
 
         try:
-            # for csv's, check that we have more than one row (a header and data values)
-            if (
-                source_type == ".csv"
-                and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
-            ):
-                raise NoInputDataException(
-                    f"Data file has 0 rows of values. Error in file: {source_path}"
-                )
-
-            # gather the total rowcount from csv or sqlite data input sources
             with _duckdb_reader() as ddb_reader:
-
-
-
-
-
-
-                ).
-
+                if source_type == ".csv":
+                    sql_query = f"SELECT {page_key} FROM read_csv_auto('{source_path}', header=TRUE, delim=',') ORDER BY {page_key}"
+                else:
+                    sql_query = f"SELECT {page_key} FROM sqlite_scan('{source_path}', '{table_name}') ORDER BY {page_key}"
+
+                page_keys = [
+                    results[0] for results in ddb_reader.execute(sql_query).fetchall()
+                ]
+
+        # exception case for when we have mixed types
+        # (i.e. integer col with string and ints) in a sqlite column
+        except duckdb.TypeMismatchException:
+            with closing(sqlite3.connect(source_path)) as cx:
+                with cx:
+                    page_keys = [
+                        key[0]
+                        for key in cx.execute(
+                            f"SELECT {page_key} FROM {table_name} ORDER BY {page_key};"
+                        ).fetchall()
+                        if isinstance(key[0], (int, float))
+                    ]
 
-        # catch input errors which will result in skipped files
        except (
            duckdb.InvalidInputException,
            NoInputDataException,
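For orientation, the pattern introduced above (gather the pagination column in sorted order, then hand the values to _generate_pagesets) can be exercised on its own. A minimal sketch under assumed inputs; the CSV path and column name below are hypothetical stand-ins, not values from this diff:

import duckdb

from cytotable.utils import _generate_pagesets

# hypothetical inputs: a CSV source and a numeric column to paginate on
source_path = "cells.csv"
page_key = "ObjectNumber"

# gather the pagination key values in sorted order, mirroring the
# read_csv_auto(...) ORDER BY query used by the task above
page_keys = [
    row[0]
    for row in duckdb.execute(
        f"SELECT {page_key} FROM read_csv_auto('{source_path}', "
        f"header=TRUE, delim=',') ORDER BY {page_key}"
    ).fetchall()
]

# turn the ordered keys into inclusive (start, end) ranges of roughly chunk_size rows
pagesets = _generate_pagesets(page_keys, chunk_size=1000)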
@@ -253,34 +254,20 @@ def _get_table_chunk_offsets(
 
         return None
 
-    # find chunk offsets from sql statement
     elif sql_stmt is not None:
-        # gather the total rowcount from csv or sqlite data input sources
         with _duckdb_reader() as ddb_reader:
-
-
-
-                    f"SELECT COUNT(*) FROM ({sql_stmt})"
-                ).fetchone()[0]
-            )
+            sql_query = f"SELECT {page_key} FROM ({sql_stmt}) ORDER BY {page_key}"
+            page_keys = ddb_reader.execute(sql_query).fetchall()
+            page_keys = [key[0] for key in page_keys]
 
-        return
-            range(
-                0,
-                # gather rowcount from table and use as maximum for range
-                rowcount,
-                # step through using chunk size
-                chunk_size,
-            )
-        )
+    return _generate_pagesets(page_keys, chunk_size)
 
 
 @python_app
-def _source_chunk_to_parquet(
+def _source_pageset_to_parquet(
     source_group_name: str,
     source: Dict[str, Any],
-
-    offset: int,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     dest_path: str,
     sort_output: bool,
 ) -> str:
@@ -293,10 +280,8 @@ def _source_chunk_to_parquet(
         source: Dict[str, Any]
             Contains the source data to be chunked. Represents a single
             file or table of some kind along with collected information about table.
-
-
-        offset: int
-            The offset for chunking the data from source.
+        pageset: Tuple[int, int]
+            The pageset for chunking the data from source.
         dest_path: str
             Path to store the output data.
         sort_output: bool
@@ -311,9 +296,7 @@ def _source_chunk_to_parquet(
 
     import duckdb
     from cloudpathlib import AnyPath
-    from pyarrow import parquet
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import (
         _duckdb_reader,
         _sqlite_mixed_type_query_to_parquet,
@@ -322,31 +305,11 @@ def _source_chunk_to_parquet(
 
     # attempt to build dest_path
     source_dest_path = (
-        f"{dest_path}/{str(
-        f"{str(
+        f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
+        f"{str(source['source_path'].parent.name).lower()}"
     )
     pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
 
-    source_path_str = (
-        source["source_path"]
-        if "table_name" not in source.keys()
-        else f"{source['source_path']}_table_{source['table_name']}"
-    )
-    # build the column selection block of query
-
-    # add cytotable metadata columns
-    cytotable_metadata_cols = [
-        (
-            f"CAST( '{source_path_str}' "
-            f"AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path']})"
-            ' AS "cytotable_meta_source_path"'
-        ),
-        f"CAST( {offset} AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset']}) AS \"cytotable_meta_offset\"",
-        (
-            f"CAST( (row_number() OVER ()) AS {CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum']})"
-            ' AS "cytotable_meta_rownum"'
-        ),
-    ]
     # add source table columns
     casted_source_cols = [
         # here we cast the column to the specified type ensure the colname remains the same
@@ -357,22 +320,23 @@ def _source_chunk_to_parquet(
     # create selection statement from lists above
     select_columns = ",".join(
         # if we should sort the output, add the metadata_cols
-
+        casted_source_cols
         if sort_output
         else casted_source_cols
     )
 
     # build output query and filepath base
     # (chunked output will append offset to keep output paths unique)
-    if str(
+    if str(source["source_path"].suffix).lower() == ".csv":
         base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
 
-    elif str(
+    elif str(source["source_path"].suffix).lower() == ".sqlite":
         base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
         result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
 
-
+    # form a filepath which indicates the pageset
+    result_filepath = f"{result_filepath_base}-{pageset[0]}-{pageset[1]}.parquet"
 
     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -385,14 +349,9 @@ def _source_chunk_to_parquet(
             table=ddb_reader.execute(
                 f"""
                 {base_query}
-
-
-
-                """
-                if sort_output
-                else f"""
-                {base_query}
-                LIMIT {chunk_size} OFFSET {offset}
+                WHERE {source['page_key']} BETWEEN {pageset[0]} AND {pageset[1]}
+                /* optional ordering per pageset */
+                {"ORDER BY " + source['page_key'] if sort_output else ""};
                 """
             ).arrow(),
             where=result_filepath,
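The pageset-bounded export in this hunk follows a DuckDB-to-Arrow-to-Parquet pattern that can be sketched standalone. The file name, pagination column, and bounds below are illustrative assumptions rather than CytoTable internals:

import duckdb
from pyarrow import parquet

# hypothetical pagination column and inclusive pageset bounds
page_key = "ObjectNumber"
pageset = (1, 1000)

# select only rows whose key falls within the pageset, optionally ordered,
# and fetch the result as an Arrow table
table = duckdb.execute(
    f"""
    SELECT *
    FROM read_csv_auto('cells.csv', header=TRUE, delim=',')
    WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
    ORDER BY {page_key}
    """
).arrow()

# write the page to a parquet file whose name encodes the pageset bounds
parquet.write_table(table, f"cells-{pageset[0]}-{pageset[1]}.parquet")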
@@ -405,7 +364,7 @@ def _source_chunk_to_parquet(
         # to handle the mixed types
         if (
             "Mismatch Type Error" in str(e)
-            and str(
+            and str(source["source_path"].suffix).lower() == ".sqlite"
         ):
             _write_parquet_table_with_metadata(
                 # here we use sqlite instead of duckdb to extract
@@ -414,9 +373,8 @@ def _source_chunk_to_parquet(
                 table=_sqlite_mixed_type_query_to_parquet(
                     source_path=str(source["source_path"]),
                     table_name=str(source["table_name"]),
-
-
-                    add_cytotable_meta=True if sort_output else False,
+                    page_key=source["page_key"],
+                    pageset=pageset,
                     sort_output=sort_output,
                 ),
                 where=result_filepath,
@@ -466,10 +424,7 @@ def _prepend_column_name(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-    )
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     from cytotable.utils import _write_parquet_table_with_metadata
 
     logger = logging.getLogger(__name__)
@@ -480,7 +435,7 @@ def _prepend_column_name(
     if len(targets) == 0:
         logger.warning(
             msg=(
-                "Skipping column name prepend operations"
+                "Skipping column name prepend operations "
                 "because no compartments or metadata were provided."
             )
         )
@@ -517,10 +472,8 @@ def _prepend_column_name(
             # source_group_name_stem: 'Cells'
             # column_name: 'AreaShape_Area'
             # updated_column_name: 'Cells_AreaShape_Area'
-            if (
-
-                and not column_name.startswith(source_group_name_stem.capitalize())
-                and column_name not in CYOTABLE_META_COLUMN_TYPES
+            if column_name not in identifying_columns and not column_name.startswith(
+                source_group_name_stem.capitalize()
             ):
                 updated_column_names.append(f"{source_group_name_stem}_{column_name}")
             # if-condition for prepending 'Metadata_' to column name
@@ -582,6 +535,7 @@ def _concat_source_group(
     source_group: List[Dict[str, Any]],
     dest_path: str,
     common_schema: Optional[List[Tuple[str, str]]] = None,
+    sort_output: bool = True,
 ) -> List[Dict[str, Any]]:
     """
     Concatenate group of source data together as single file.
@@ -628,6 +582,8 @@ def _concat_source_group(
         common_schema: List[Tuple[str, str]] (Default value = None)
             Common schema to use for concatenation amongst arrow tables
             which may have slightly different but compatible schema.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
 
     Returns:
         List[Dict[str, Any]]
@@ -645,7 +601,7 @@ def _concat_source_group(
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -684,7 +640,10 @@ def _concat_source_group(
     # (all must be the same schema)
     with parquet.ParquetWriter(str(destination_path), writer_schema) as writer:
         for source in source_group:
-
+            tables = [table for table in source["table"]]
+            if sort_output:
+                tables = _natural_sort(tables)
+            for table in tables:
                 # if we haven't inferred the common schema
                 # check that our file matches the expected schema, otherwise raise an error
                 if common_schema is None and not writer_schema.equals(
@@ -728,7 +687,6 @@ def _concat_source_group(
 def _prepare_join_sql(
     sources: Dict[str, List[Dict[str, Any]]],
     joins: str,
-    sort_output: bool,
 ) -> str:
     """
     Prepare join SQL statement with actual locations of data based on the sources.
@@ -749,8 +707,6 @@ def _prepare_join_sql(
     """
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
-
     # replace with real location of sources for join sql
     order_by_tables = []
     for key, val in sources.items():
@@ -762,25 +718,17 @@ def _prepare_join_sql(
         )
         order_by_tables.append(table_name)
 
-    # create order by statement with from all tables using cytotable metadata
-    order_by_sql = "ORDER BY " + ", ".join(
-        [
-            f"{table}.{meta_column}"
-            for table in order_by_tables
-            for meta_column in CYOTABLE_META_COLUMN_TYPES
-        ]
-    )
-
     # add the order by statements to the join
-    return joins
+    return joins
 
 
 @python_app
-def _join_source_chunk(
+def _join_source_pageset(
     dest_path: str,
     joins: str,
-
-
+    page_key: str,
+    pageset: Tuple[int, int],
+    sort_output: bool,
     drop_null: bool,
 ) -> str:
     """
@@ -806,30 +754,20 @@ def _join_source_chunk(
 
     import pathlib
 
-    from cytotable.constants import CYOTABLE_META_COLUMN_TYPES
     from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
-    # Attempt to read the data to parquet file
-    # using duckdb for extraction and pyarrow for
-    # writing data to a parquet file.
-    # read data with chunk size + offset
-    # and export to parquet
-    exclude_meta_cols = [
-        f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
-    ]
     with _duckdb_reader() as ddb_reader:
         result = ddb_reader.execute(
             f"""
-
+            WITH joined AS (
                 {joins}
-
-
-
-
-
-
-
-            """
+            )
+            SELECT *
+            FROM joined
+            WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+            /* optional sorting per pagset */
+            {"ORDER BY " + page_key if sort_output else ""};
+            """
         ).arrow()
 
     # drop nulls if specified
@@ -854,10 +792,8 @@ def _join_source_chunk(
         f"{str(pathlib.Path(dest_path).parent)}/"
         # use the dest_path stem in the name
         f"{str(pathlib.Path(dest_path).stem)}-"
-        #
-
-        # and before they are brought together as one dataset
-        f"{str(uuid.uuid4().hex)}.parquet"
+        # add the pageset indication to the filename
+        f"{pageset[0]}-{pageset[1]}.parquet"
     )
 
     # write the result
@@ -874,6 +810,7 @@ def _concat_join_sources(
     sources: Dict[str, List[Dict[str, Any]]],
     dest_path: str,
     join_sources: List[str],
+    sort_output: bool = True,
 ) -> str:
     """
     Concatenate join sources from parquet-based chunks.
@@ -890,6 +827,8 @@ def _concat_join_sources(
         join_sources: List[str]:
             List of local filepath destination for join source chunks
             which will be concatenated.
+        sort_output: bool
+            Specifies whether to sort cytotable output or not.
 
     Returns:
         str
@@ -905,7 +844,7 @@ def _concat_join_sources(
         CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
         CYTOTABLE_DEFAULT_PARQUET_METADATA,
     )
-    from cytotable.utils import
+    from cytotable.utils import _natural_sort
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -925,7 +864,11 @@ def _concat_join_sources(
         CYTOTABLE_DEFAULT_PARQUET_METADATA
     )
     with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
-        for table_path in
+        for table_path in (
+            join_sources
+            if not sort_output
+            else _natural_sort(list_to_sort=join_sources)
+        ):
             writer.write_table(
                 parquet.read_table(
                     table_path,
@@ -1049,6 +992,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
     infer_common_schema: bool,
     drop_null: bool,
     sort_output: bool,
+    page_keys: Dict[str, str],
     data_type_cast_map: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Union[Dict[str, List[Dict[str, Any]]], str]:
@@ -1089,6 +1033,9 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             Whether to drop null results.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
+        page_keys: Dict[str, str]
+            A dictionary which defines which column names are used for keyset pagination
+            in order to perform data extraction.
         data_type_cast_map: Dict[str, str]
             A dictionary mapping data type groups to specific types.
             Roughly includes Arrow data types language from:
@@ -1114,21 +1061,40 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 else []
             ),
             **kwargs,
-    )
+    )
 
     # expand the destination path
     expanded_dest_path = _expand_path(path=dest_path)
 
-    #
-
+    # check that each source group name has a pagination key
+    for source_group_name in sources.keys():
+        matching_keys = [
+            key for key in page_keys.keys() if key.lower() in source_group_name.lower()
+        ]
+        if not matching_keys:
+            raise CytoTableException(
+                f"No matching key found in page_keys for source_group_name: {source_group_name}."
+                "Please include a pagination key based on a column name from the table."
+            )
+
+    # prepare pagesets for chunked data export from source tables
+    pagesets_prepared = {
         source_group_name: [
             dict(
                 source,
                 **{
-                    "
+                    "page_key": (
+                        page_key := [
+                            value
+                            for key, value in page_keys.items()
+                            if key.lower() in source_group_name.lower()
+                        ][0]
+                    ),
+                    "pagesets": _get_table_keyset_pagination_sets(
                        source=source,
                        chunk_size=chunk_size,
-
+                        page_key=page_key,
+                    ),
                },
            )
            for source in source_group_vals
@@ -1136,17 +1102,17 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         for source_group_name, source_group_vals in sources.items()
     }
 
-    # if
+    # if pagesets is none and we haven't halted, remove the file as there
     # were input formatting errors which will create challenges downstream
     invalid_files_dropped = {
         source_group_name: [
-            # ensure we have
+            # ensure we have pagesets
             source
             for source in source_group_vals
-            if source["
+            if source["pagesets"] is not None
         ]
         for source_group_name, source_group_vals in evaluate_futures(
-
+            pagesets_prepared
         ).items()
         # ensure we have source_groups with at least one source table
         if len(source_group_vals) > 0
@@ -1179,12 +1145,11 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                 "table": [
                     # perform column renaming and create potential return result
                     _prepend_column_name(
-                        # perform chunked data export to parquet using
-                        table_path=
+                        # perform chunked data export to parquet using pagesets
+                        table_path=_source_pageset_to_parquet(
                             source_group_name=source_group_name,
                             source=source,
-
-                            offset=offset,
+                            pageset=pageset,
                             dest_path=expanded_dest_path,
                             sort_output=sort_output,
                         ),
@@ -1193,7 +1158,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                         metadata=metadata,
                         compartments=compartments,
                     )
-                    for
+                    for pageset in source["pagesets"]
                 ]
             },
         )
@@ -1234,6 +1199,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             source_group=source_group_vals[0]["sources"],
             dest_path=expanded_dest_path,
             common_schema=source_group_vals[0]["common_schema"],
+            sort_output=sort_output,
         )
         for source_group_name, source_group_vals in evaluate_futures(
             common_schema_determined
@@ -1247,28 +1213,34 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
         evaluated_results = evaluate_futures(results)
 
         prepared_joins_sql = _prepare_join_sql(
-            sources=evaluated_results, joins=joins
+            sources=evaluated_results, joins=joins
         ).result()
 
+        page_key_join = [
+            value for key, value in page_keys.items() if key.lower() == "join"
+        ][0]
+
         # map joined results based on the join groups gathered above
         # note: after mapping we end up with a list of strings (task returns str)
         join_sources_result = [
-
+            _join_source_pageset(
                 # gather the result of concatted sources prior to
                 # join group merging as each mapped task run will need
                 # full concat results
                 dest_path=expanded_dest_path,
                 joins=prepared_joins_sql,
-
-
+                page_key=page_key_join,
+                pageset=pageset,
+                sort_output=sort_output,
                 drop_null=drop_null,
             )
             # create join group for querying the concatenated
            # data in order to perform memory-safe joining
            # per user chunk size specification.
-            for
+            for pageset in _get_table_keyset_pagination_sets(
                sql_stmt=prepared_joins_sql,
                chunk_size=chunk_size,
+                page_key=page_key_join,
            ).result()
        ]
 
@@ -1279,6 +1251,7 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
             dest_path=expanded_dest_path,
             join_sources=[join.result() for join in join_sources_result],
             sources=evaluated_results,
+            sort_output=sort_output,
         )
 
     # wrap the final result as a future and return
@@ -1300,6 +1273,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
     infer_common_schema: bool = True,
     drop_null: bool = False,
     data_type_cast_map: Optional[Dict[str, str]] = None,
+    page_keys: Optional[Dict[str, str]] = None,
     sort_output: bool = True,
     preset: Optional[str] = "cellprofiler_csv",
     parsl_config: Optional[parsl.Config] = None,
@@ -1348,6 +1322,12 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         A dictionary mapping data type groups to specific types.
         Roughly includes Arrow data types language from:
         https://arrow.apache.org/docs/python/api/datatypes.html
+    page_keys: str:
+        The table and column names to be used for key pagination.
+        Uses the form: {"table_name":"column_name"}.
+        Expects columns to include numeric data (ints or floats).
+        Interacts with the `chunk_size` parameter to form
+        pages of `chunk_size`.
     sort_output: bool (Default value = True)
         Specifies whether to sort cytotable output or not.
     drop_null: bool (Default value = False)
@@ -1447,6 +1427,24 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
         if chunk_size is None
         else chunk_size
     )
+    page_keys = (
+        cast(dict, config[preset]["CONFIG_PAGE_KEYS"])
+        if page_keys is None
+        else page_keys
+    )
+
+    # Raise an exception for scenarios where one configures CytoTable to join
+    # but does not provide a pagination key for the joins.
+    if join and (page_keys is None or "join" not in page_keys.keys()):
+        raise CytoTableException(
+            (
+                "When using join=True one must pass a 'join' pagination key "
+                "in the page_keys parameter. The 'join' pagination key is a column "
+                "name found within the joined results based on the SQL provided from "
+                "the joins parameter. This special key is required as not all columns "
+                "from the source tables might not be included."
+            )
+        )
 
     # send sources to be written to parquet if selected
     if dest_datatype == "parquet":
@@ -1465,6 +1463,7 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
             drop_null=drop_null,
             data_type_cast_map=data_type_cast_map,
             sort_output=sort_output,
+            page_keys=cast(dict, page_keys),
             **kwargs,
         )
 
cytotable/presets.py
CHANGED
@@ -22,6 +22,16 @@ config = {
             "Parent_Cells",
             "Parent_Nuclei",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -61,6 +71,16 @@ config = {
             "Parent_Cells",
             "Parent_Nuclei",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "Cells_Number_Object_Number",
+            "nuclei": "Nuclei_Number_Object_Number",
+            "cytoplasm": "Cytoplasm_Number_Object_Number",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -85,6 +105,64 @@ config = {
             AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
         """,
     },
+    "cellprofiler_sqlite_cpg0016_jump": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "cellprofiler": "v4.0.0",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": ("image",),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "ImageNumber",
+            "ObjectNumber",
+            "Metadata_Well",
+            "Metadata_Plate",
+            "Parent_Cells",
+            "Parent_Nuclei",
+        ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and is may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": """
+            SELECT
+                image.Image_TableNumber,
+                image.Metadata_ImageNumber,
+                image.Metadata_Plate,
+                image.Metadata_Well,
+                image.Image_Metadata_Site,
+                image.Image_Metadata_Row,
+                cytoplasm.* EXCLUDE (Metadata_ImageNumber),
+                cells.* EXCLUDE (Metadata_ImageNumber),
+                nuclei.* EXCLUDE (Metadata_ImageNumber)
+            FROM
+                read_parquet('cytoplasm.parquet') AS cytoplasm
+                LEFT JOIN read_parquet('cells.parquet') AS cells ON
+                    cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                    AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
+                LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
+                    nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+                    AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
+                LEFT JOIN read_parquet('image.parquet') AS image ON
+                    image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
+        """,
+    },
     "cellprofiler_sqlite_pycytominer": {
         # version specifications using related references
         "CONFIG_SOURCE_VERSION": {
@@ -107,6 +185,16 @@ config = {
             "Cells_Number_Object_Number",
             "Nuclei_Number_Object_Number",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "Cells_Number_Object_Number",
+            "nuclei": "Nuclei_Number_Object_Number",
+            "cytoplasm": "Cytoplasm_Number_Object_Number",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -155,6 +243,16 @@ config = {
             "Cells_ObjectNumber",
             "Nuclei_ObjectNumber",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "image": "ImageNumber",
+            "cells": "ObjectNumber",
+            "nuclei": "ObjectNumber",
+            "cytoplasm": "ObjectNumber",
+            "join": "Cytoplasm_Number_Object_Number",
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
@@ -200,6 +298,12 @@ config = {
             "Z",
             "T",
         ),
+        # pagination keys for use with this data
+        # of the rough format "table" -> "column".
+        # note: page keys are expected to be numeric (int, float)
+        "CONFIG_PAGE_KEYS": {
+            "test": '"OBJECT ID"',
+        },
         # chunk size to use for join operations to help with possible performance issues
         # note: this number is an estimate and is may need changes contingent on data
         # and system used by this library.
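Each preset above now carries a CONFIG_PAGE_KEYS mapping, which convert() falls back to when no page_keys argument is given. A hedged usage sketch follows; the source and destination paths are hypothetical, and the explicit page_keys simply mirror the preset values shown in this diff:

from cytotable import convert
from cytotable.presets import config

# inspect the pagination keys bundled with a preset
print(config["cellprofiler_sqlite_pycytominer"]["CONFIG_PAGE_KEYS"])

# hypothetical paths; page_keys may be omitted to use the preset's defaults,
# or passed explicitly as below (a "join" key is required when join=True)
result = convert(
    source_path="./example.sqlite",
    dest_path="./example.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
    page_keys={
        "image": "ImageNumber",
        "cells": "Cells_Number_Object_Number",
        "nuclei": "Nuclei_Number_Object_Number",
        "cytoplasm": "Cytoplasm_Number_Object_Number",
        "join": "Cytoplasm_Number_Object_Number",
    },
    chunk_size=1000,
)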
cytotable/sources.py
CHANGED
@@ -7,13 +7,11 @@ import pathlib
 from typing import Any, Dict, List, Optional, Union
 
 from cloudpathlib import AnyPath
-from parsl.app.app import join_app, python_app
 
+from cytotable.exceptions import NoInputDataException
 
-
-def _build_path(
-    path: Union[str, pathlib.Path, AnyPath], **kwargs
-) -> Union[pathlib.Path, AnyPath]:
+
+def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
     """
     Build a path client or return local path.
 
@@ -43,10 +41,9 @@ def _build_path(
     return processed_path
 
 
-@python_app
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
-    targets: List[str],
+    targets: Optional[List[str]] = None,
     source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
@@ -75,7 +72,7 @@ def _get_source_filepaths(
 
     if (targets is None or targets == []) and source_datatype is None:
         raise DatatypeException(
-
+            "A source_datatype must be specified when using undefined compartments and metadata names."
         )
 
     # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@ def _get_source_filepaths(
         for subpath in (
             (path,)
             # used if the source path is a single file
-            if
+            if path.is_file()
             # iterates through a source directory
-            else (x for x in
+            else (x for x in path.glob("**/*") if x.is_file())
         )
         # ensure the subpaths meet certain specifications
         if (
@@ -129,7 +126,8 @@ def _get_source_filepaths(
                 .arrow()["table_name"]
                 .to_pylist()
                 # make sure the table names match with compartment + metadata names
-                if
+                if targets is not None
+                and any(target.lower() in table_name.lower() for target in targets)
             ]
         else:
             # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@ def _get_source_filepaths(
     return grouped_sources
 
 
-@python_app
 def _infer_source_datatype(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
 ) -> str:
@@ -230,7 +227,6 @@ def _infer_source_datatype(
     return source_datatype
 
 
-@python_app
 def _filter_source_filepaths(
     sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
 ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
             if file["source_path"].stat().st_size > 0
             # ensure the datatype matches the source datatype
             and file["source_path"].suffix == f".{source_datatype}"
+            and _file_is_more_than_one_line(path=file["source_path"])
         ]
         for filegroup, files in sources.items()
     }
 
 
-
+def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
+    """
+    Check if the file has more than one line.
+
+    Args:
+        path (Union[pathlib.Path, AnyPath]):
+            The path to the file.
+
+    Returns:
+        bool:
+            True if the file has more than one line, False otherwise.
+
+    Raises:
+        NoInputDataException: If the file has zero lines.
+    """
+
+    # if we don't have a sqlite file
+    # (we can't check sqlite files for lines)
+    if path.suffix.lower() != ".sqlite":
+        with path.open("r") as f:
+            try:
+                # read two lines, if the second is empty return false
+                return bool(f.readline() and f.readline())
+
+            except StopIteration:
+                # If we encounter the end of the file, it has only one line
+                raise NoInputDataException(
+                    f"Data file has 0 rows of values. Error in file: {path}"
+                )
+    else:
+        return True
+
+
 def _gather_sources(
     source_path: str,
     source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
         _infer_source_datatype,
     )
 
-
+    built_path = _build_path(path=source_path, **kwargs)
 
     # gather filepaths which will be used as the basis for this work
     sources = _get_source_filepaths(
-        path=
+        path=built_path, targets=targets, source_datatype=source_datatype
     )
 
     # infer or validate the source datatype based on source filepaths
cytotable/utils.py
CHANGED
@@ -5,7 +5,7 @@ Utility functions for CytoTable
 import logging
 import os
 import pathlib
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import duckdb
 import parsl
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
             INSTALL sqlite_scanner;
             LOAD sqlite_scanner;
 
+            /* Install httpfs plugin to avoid error
+            https://github.com/duckdb/duckdb/issues/3243 */
+            INSTALL httpfs;
+
             /*
             Set threads available to duckdb
             See the following for more information:
@@ -169,10 +173,9 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
 def _sqlite_mixed_type_query_to_parquet(
     source_path: str,
     table_name: str,
-
-
+    page_key: str,
+    pageset: Tuple[Union[int, float], Union[int, float]],
     sort_output: bool,
-    add_cytotable_meta: bool = False,
 ) -> str:
     """
     Performs SQLite table data extraction where one or many
@@ -184,10 +187,10 @@ def _sqlite_mixed_type_query_to_parquet(
             A str which is a path to a SQLite database file.
         table_name: str:
             The name of the table being queried.
-
-
-
-            The
+        page_key: str:
+            The column name to be used to identify pagination chunks.
+        pageset: Tuple[int, int]:
+            The range for values used for paginating data from source.
         sort_output: bool
             Specifies whether to sort cytotable output or not.
         add_cytotable_meta: bool, default=False:
@@ -201,10 +204,7 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
-    from cytotable.constants import (
-        CYOTABLE_META_COLUMN_TYPES,
-        SQLITE_AFFINITY_DATA_TYPE_SYNONYMS,
-    )
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
 
     # open sqlite3 connection
@@ -264,42 +264,14 @@ def _sqlite_mixed_type_query_to_parquet(
             for col in column_info
         ]
 
-        if add_cytotable_meta:
-            query_parts += [
-                (
-                    f"CAST( '{f'{source_path}_table_{table_name}'}' "
-                    f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_source_path'].lower())}) "
-                    "AS cytotable_meta_source_path"
-                ),
-                (
-                    f"CAST( {offset} "
-                    f"AS {_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_offset'].lower())}) "
-                    "AS cytotable_meta_offset"
-                ),
-                (
-                    f"CAST( (ROW_NUMBER() OVER ()) AS "
-                    f"{_sqlite_affinity_data_type_lookup(CYOTABLE_META_COLUMN_TYPES['cytotable_meta_rownum'].lower())}) "
-                    "AS cytotable_meta_rownum"
-                ),
-            ]
-
         # perform the select using the cases built above and using chunksize + offset
-        sql_stmt = (
-            f"""
-            SELECT
-                {', '.join(query_parts)}
-            FROM {table_name}
-            ORDER BY {', '.join([col['column_name'] for col in column_info])}
-            LIMIT {chunk_size} OFFSET {offset};
-            """
-            if sort_output
-            else f"""
+        sql_stmt = f"""
             SELECT
                 {', '.join(query_parts)}
             FROM {table_name}
-
+            WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]}
+            {"ORDER BY " + page_key if sort_output else ""};
             """
-        )
 
         # execute the sql stmt
         cursor.execute(sql_stmt)
@@ -322,7 +294,7 @@ def _sqlite_mixed_type_query_to_parquet(
     return pa.Table.from_pylist(results)
 
 
-def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
+def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
     """
     Takes a cloudpath and uses cache to convert to a local copy
     for use in scenarios where remote work is not possible (sqlite).
@@ -337,24 +309,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
         A local pathlib.Path to cached version of cloudpath file.
     """
 
-    candidate_path = AnyPath(path)
-
     # check that the path is a file (caching won't work with a dir)
     # and check that the file is of sqlite type
     # (other file types will be handled remotely in cloud)
-    if
+    if (
+        isinstance(path, CloudPath)
+        and path.is_file()
+        and path.suffix.lower() == ".sqlite"
+    ):
         try:
             # update the path to be the local filepath for reference in CytoTable ops
             # note: incurs a data read which will trigger caching of the file
-            path =
+            path = pathlib.Path(path.fspath)
         except InvalidPrefixError:
             # share information about not finding a cloud path
             logger.info(
                 "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
            )
 
-
-    return pathlib.Path(path)
+    return path
@@ -595,3 +568,77 @@ def evaluate_futures(sources: Union[Dict[str, List[Dict[str, Any]]], str]) -> Any:
         if isinstance(sources, dict)
         else _unwrap_value(sources)
     )
+
+
+def _generate_pagesets(
+    keys: List[Union[int, float]], chunk_size: int
+) -> List[Tuple[Union[int, float], Union[int, float]]]:
+    """
+    Generate a pageset (keyset pagination) from a list of keys.
+
+    Parameters:
+        keys List[Union[int, float]]:
+            List of keys to paginate.
+        chunk_size int:
+            Size of each chunk/page.
+
+    Returns:
+        List[Tuple[Union[int, float], Union[int, float]]]:
+            List of (start_key, end_key) tuples representing each page.
+    """
+
+    # Initialize an empty list to store the chunks/pages
+    chunks = []
+
+    # Start index for iteration through the keys
+    i = 0
+
+    while i < len(keys):
+        # Get the start key for the current chunk
+        start_key = keys[i]
+
+        # Calculate the end index for the current chunk
+        end_index = min(i + chunk_size, len(keys)) - 1
+
+        # Get the end key for the current chunk
+        end_key = keys[end_index]
+
+        # Ensure non-overlapping by incrementing the start of the next range if there are duplicates
+        while end_index + 1 < len(keys) and keys[end_index + 1] == end_key:
+            end_index += 1
+
+        # Append the current chunk (start_key, end_key) to the list of chunks
+        chunks.append((start_key, end_key))
+
+        # Update the index to start from the next chunk
+        i = end_index + 1
+
+    # Return the list of chunks/pages
+    return chunks
+
+
+def _natural_sort(list_to_sort):
+    """
+    Sorts the given iterable using natural sort adapted from approach
+    provided by the following link:
+    https://stackoverflow.com/a/4836734
+
+    Args:
+        list_to_sort: List:
+            The list to sort.
+
+    Returns:
+        List: The sorted list.
+    """
+    import re
+
+    return sorted(
+        list_to_sort,
+        # use a custom key to sort the list
+        key=lambda key: [
+            # use integer of c if it's a digit, otherwise str
+            int(c) if c.isdigit() else c
+            # Split the key into parts, separating numbers from alphabetic characters
+            for c in re.split("([0-9]+)", str(key))
+        ],
+    )
{cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.8
+Version: 0.0.10
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
+Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
 Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
cytotable-0.0.10.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+cytotable/__init__.py,sha256=0rX3g1Ay8RtEW8cYuPbiMzyitFqAJPQz-xLJhxMMD3I,316
+cytotable/constants.py,sha256=w_AUm_fKKXeZjnZxbHf-dxq7NN7BkvCWbkGK24sfzLw,1872
+cytotable/convert.py,sha256=p0ghH03pi7VCPCaNyNFkb19yizlx1oLSAwr3xJUfBWI,55499
+cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
+cytotable/presets.py,sha256=CpUrVSCfsV9CDvNfkNj-rAOguA68lb2-w7g-XMcHezU,14806
+cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
+cytotable/utils.py,sha256=ohmEIo-fB8T5mJoQh1u6NFGRk3MnYba-yMqqq2DJezg,20432
+cytotable-0.0.10.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
+cytotable-0.0.10.dist-info/METADATA,sha256=ll6vl8oT2ERyNRQNaUwdczg3ybe2vQLYCPM7rCXBhjo,3424
+cytotable-0.0.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+cytotable-0.0.10.dist-info/RECORD,,
cytotable-0.0.8.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
-cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
-cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
-cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
-cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
-cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
-cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
-cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
-cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
-cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-cytotable-0.0.8.dist-info/RECORD,,
{cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/LICENSE: File without changes
{cytotable-0.0.8.dist-info → cytotable-0.0.10.dist-info}/WHEEL: File without changes