CytoTable 0.0.2.tar.gz → 0.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: CytoTable
- Version: 0.0.2
+ Version: 0.0.3
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
  Home-page: https://github.com/cytomining/CytoTable
  License: BSD-3-Clause License
@@ -14,8 +14,8 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
- Requires-Dist: duckdb (>=0.8.0,<0.9.0)
- Requires-Dist: parsl (>=2023.9.18)
+ Requires-Dist: duckdb (>=0.8.0)
+ Requires-Dist: parsl (>=2023.9.25)
  Requires-Dist: pyarrow (>=13.0.0,<14.0.0)
  Project-URL: Documentation, https://cytomining.github.io/CytoTable/
  Project-URL: Repository, https://github.com/cytomining/CytoTable
@@ -25,7 +25,7 @@ Description-Content-Type: text/markdown

  # CytoTable

- ![dataflow](docs/source/_static/dataflow.svg)
+ ![dataflow](https://raw.githubusercontent.com/cytomining/cytotable/main/docs/source/_static/dataflow.svg?raw=true)
  _Diagram showing data flow relative to this project._

  ## Summary
@@ -36,9 +36,13 @@ The Parquet files will have a unified and documented data model, including refer

  ## Installation

- Install CytoTable with the following command:
+ Install CytoTable from [PyPI](https://pypi.org/) or from source:

  ```shell
+ # install from pypi
+ pip install cytotable
+
+ # install directly from source
  pip install git+https://github.com/cytomining/CytoTable.git
  ```

--- a/cytotable/convert.py
+++ b/cytotable/convert.py
@@ -175,8 +175,9 @@ def _prep_cast_column_data_types(

  @python_app
  def _get_table_chunk_offsets(
-     source: Dict[str, Any],
      chunk_size: int,
+     source: Optional[Dict[str, Any]] = None,
+     sql_stmt: Optional[str] = None,
  ) -> Union[List[int], None]:
      """
      Get table data chunk offsets for later use in capturing segments
@@ -207,39 +208,54 @@ def _get_table_chunk_offsets(

      logger = logging.getLogger(__name__)

-     table_name = source["table_name"] if "table_name" in source.keys() else None
-     source_path = source["source_path"]
-     source_type = str(pathlib.Path(source_path).suffix).lower()
+     if source is not None:
+         table_name = source["table_name"] if "table_name" in source.keys() else None
+         source_path = source["source_path"]
+         source_type = str(pathlib.Path(source_path).suffix).lower()

-     try:
-         # for csv's, check that we have more than one row (a header and data values)
-         if (
-             source_type == ".csv"
-             and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
-         ):
-             raise NoInputDataException(
-                 f"Data file has 0 rows of values. Error in file: {source_path}"
+         try:
+             # for csv's, check that we have more than one row (a header and data values)
+             if (
+                 source_type == ".csv"
+                 and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
+             ):
+                 raise NoInputDataException(
+                     f"Data file has 0 rows of values. Error in file: {source_path}"
+                 )
+
+             # gather the total rowcount from csv or sqlite data input sources
+             with _duckdb_reader() as ddb_reader:
+                 rowcount = int(
+                     ddb_reader.execute(
+                         # nosec
+                         f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
+                         if source_type == ".csv"
+                         else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+                     ).fetchone()[0]
+                 )
+
+         # catch input errors which will result in skipped files
+         except (
+             duckdb.InvalidInputException,
+             NoInputDataException,
+         ) as invalid_input_exc:
+             logger.warning(
+                 msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
              )

+             return None
+
+     # find chunk offsets from sql statement
+     elif sql_stmt is not None:
          # gather the total rowcount from csv or sqlite data input sources
          with _duckdb_reader() as ddb_reader:
              rowcount = int(
                  ddb_reader.execute(
                      # nosec
-                     f"SELECT COUNT(*) from read_csv_auto('{source_path}', header=TRUE, delim=',')"
-                     if source_type == ".csv"
-                     else f"SELECT COUNT(*) from sqlite_scan('{source_path}', '{table_name}')"
+                     f"SELECT COUNT(*) FROM ({sql_stmt})"
                  ).fetchone()[0]
              )

-     # catch input errors which will result in skipped files
-     except (duckdb.InvalidInputException, NoInputDataException) as invalid_input_exc:
-         logger.warning(
-             msg=f"Skipping file due to input file errors: {str(invalid_input_exc)}"
-         )
-
-         return None
-
      return list(
          range(
              0,
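Outside of Parsl, the new `sql_stmt` branch reduces to counting the rows the prepared join SQL would produce and turning that count into `LIMIT`/`OFFSET` starting points. A minimal sketch of that idea (the helper name is illustrative, and a bare `duckdb.connect()` stands in for CytoTable's `_duckdb_reader()` context manager):

```python
from typing import List

import duckdb


def chunk_offsets_for_sql(sql_stmt: str, chunk_size: int) -> List[int]:
    """Count rows produced by sql_stmt and return chunked offsets."""
    ddb = duckdb.connect()
    try:
        # wrap the statement so COUNT(*) sees its full result set
        rowcount = int(
            ddb.execute(f"SELECT COUNT(*) FROM ({sql_stmt})").fetchone()[0]
        )
    finally:
        ddb.close()

    # each offset later pairs with a LIMIT/OFFSET read of the same statement
    return list(range(0, rowcount, chunk_size))
```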
@@ -258,7 +274,6 @@ def _source_chunk_to_parquet(
      chunk_size: int,
      offset: int,
      dest_path: str,
-     data_type_cast_map: Optional[Dict[str, str]] = None,
  ) -> str:
      """
      Export source data to chunked parquet file using chunk size and offsets.
@@ -632,75 +647,51 @@ def _concat_source_group(
      return concatted


- @python_app
- def _get_join_chunks(
+ @python_app()
+ def _prepare_join_sql(
      sources: Dict[str, List[Dict[str, Any]]],
-     metadata: Union[List[str], Tuple[str, ...]],
-     chunk_columns: Union[List[str], Tuple[str, ...]],
-     chunk_size: int,
- ) -> List[List[Dict[str, Any]]]:
+     joins: str,
+ ) -> str:
      """
-     Build groups of join keys for later join operations
+     Prepare join SQL statement with actual locations of data based on the sources.

      Args:
-         sources: Dict[List[Dict[str, Any]]]:
+         sources: Dict[str, List[Dict[str, Any]]]:
              Grouped datasets of files which will be used by other functions.
-         metadata: Union[List[str], Tuple[str, ...]]:
-             List of source data names which are used as metadata.
-         chunk_columns: Union[List[str], Tuple[str, ...]]:
-             Column names which appear in all compartments to use when performing join.
-         chunk_size: int:
-             Size of join chunks which is used to limit data size during join ops.
+             Includes the metadata concerning location of actual data.
+         joins: str:
+             DuckDB-compatible SQL which will be used to perform the join
+             operations using the join_group keys as a reference.

      Returns:
-         List[List[Dict[str, Any]]]]:
-             A list of lists with at most chunk size length that contain join keys.
+         str:
+             String representing the SQL to be used in later join work.
      """
-
      import pathlib

-     import pyarrow.parquet as parquet
-
-     from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+     # replace with real location of sources for join sql
+     for key, val in sources.items():
+         if pathlib.Path(key).stem.lower() in joins.lower():
+             joins = joins.replace(
+                 f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
+                 str([str(table) for table in val[0]["table"]]),
+             )

-     # fetch the compartment concat result as the basis for join groups
-     for key, source in sources.items():
-         if any(name.lower() in pathlib.Path(key).stem.lower() for name in metadata):
-             first_result = source
-             break
-
-     # gather the workflow result for basis if it's not yet returned
-     basis = first_result
-
-     # read only the table's chunk_columns
-     join_column_rows = parquet.read_table(
-         source=basis[0]["table"],
-         columns=list(chunk_columns),
-         memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
-     ).to_pylist()
-
-     # build and return the chunked join column rows
-     return [
-         join_column_rows[i : i + chunk_size]
-         for i in range(0, len(join_column_rows), chunk_size)
-     ]
+     return joins


  @python_app
  def _join_source_chunk(
-     sources: Dict[str, List[Dict[str, Any]]],
      dest_path: str,
      joins: str,
-     join_group: List[Dict[str, Any]],
+     chunk_size: int,
+     offset: int,
      drop_null: bool,
  ) -> str:
      """
      Join sources based on join group keys (group of specific join column values)

      Args:
-         sources: Dict[str, List[Dict[str, Any]]]:
-             Grouped datasets of files which will be used by other functions.
-             Includes the metadata concerning location of actual data.
          dest_path: str:
              Destination path to write file-based content.
          joins: str:
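The preset join SQL references placeholder files such as `'cytoplasm.parquet'`; the new `_prepare_join_sql` app swaps each placeholder for the list of parquet chunks actually written for that compartment, a list form that DuckDB's `read_parquet` accepts directly. A rough standalone sketch, with made-up source keys and paths:

```python
import pathlib
from typing import Any, Dict, List


def prepare_join_sql(sources: Dict[str, List[Dict[str, Any]]], joins: str) -> str:
    """Replace per-compartment parquet placeholders with the real chunk paths."""
    for key, val in sources.items():
        stem = pathlib.Path(key).stem.lower()
        if stem in joins.lower():
            joins = joins.replace(
                f"'{stem}.parquet'",
                str([str(table) for table in val[0]["table"]]),
            )
    return joins


# read_parquet('cytoplasm.parquet') becomes
# read_parquet(['/tmp/out/cytoplasm-0.parquet', '/tmp/out/cytoplasm-1.parquet'])
sql = prepare_join_sql(
    sources={
        "cytoplasm.csv": [
            {"table": ["/tmp/out/cytoplasm-0.parquet", "/tmp/out/cytoplasm-1.parquet"]}
        ]
    },
    joins="SELECT * FROM read_parquet('cytoplasm.parquet') AS cytoplasm",
)
```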
@@ -724,52 +715,18 @@ def _join_source_chunk(

      from cytotable.utils import _duckdb_reader

-     # replace with real location of sources for join sql
-     for key, val in sources.items():
-         if pathlib.Path(key).stem.lower() in joins.lower():
-             joins = joins.replace(
-                 f"'{str(pathlib.Path(key).stem.lower())}.parquet'",
-                 str([str(table) for table in val[0]["table"]]),
-             )
-
-     # update the join groups to include unique values per table
-     updated_join_group = []
-     for key in sources.keys():
-         updated_join_group.extend(
-             [
-                 {
-                     f"{str(pathlib.Path(key).stem)}.{join_key}": val
-                     for join_key, val in chunk.items()
-                 }
-                 for chunk in join_group
-             ]
-         )
-
-     # form where clause for sql joins to filter the results
-     joins += (
-         "WHERE ("
-         + ") OR (".join(
-             [
-                 " AND ".join(
-                     [
-                         # create groups of join column filters where values always
-                         # are expected to equal those within the join_group together
-                         f"{join_column} = {join_column_value}"
-                         if not isinstance(join_column_value, str)
-                         # account for string values
-                         else (f"{join_column} = " f"'{join_column_value}'")
-                         for join_column, join_column_value in chunk.items()
-                     ]
-                 )
-                 for chunk in updated_join_group
-             ]
-         )
-         + ")"
-     )
-
+     # Attempt to read the data to parquet file
+     # using duckdb for extraction and pyarrow for
+     # writing data to a parquet file.
+     # read data with chunk size + offset
+     # and export to parquet
      with _duckdb_reader() as ddb_reader:
-         # perform compartment joins using duckdb over parquet files
-         result = ddb_reader.execute(joins).arrow()
+         result = ddb_reader.execute(
+             f"""
+             {joins}
+             LIMIT {chunk_size} OFFSET {offset}
+             """
+         ).arrow()

      # drop nulls if specified
      if drop_null:
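With the join-group `WHERE`-clause machinery removed, each mapped task now reads a single window of the prepared join SQL and writes it out. A simplified, non-Parsl sketch of that step (a bare `duckdb.connect()` again stands in for `_duckdb_reader()`, and destination handling is reduced to one output file):

```python
import duckdb
import pyarrow.parquet as parquet


def join_chunk_to_parquet(joins: str, chunk_size: int, offset: int, dest_path: str) -> str:
    """Read one LIMIT/OFFSET window of the join SQL and write it as parquet."""
    ddb = duckdb.connect()
    try:
        # execute the prepared join SQL for one chunk and fetch an Arrow table
        result = ddb.execute(
            f"""
            {joins}
            LIMIT {chunk_size} OFFSET {offset}
            """
        ).arrow()
    finally:
        ddb.close()

    # persist the chunk; CytoTable additionally drops nulls when requested
    parquet.write_table(table=result, where=dest_path)
    return dest_path
```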
@@ -1012,7 +969,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      concat: bool,
      join: bool,
      joins: Optional[str],
-     chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
      chunk_size: Optional[int],
      infer_common_schema: bool,
      drop_null: bool,
@@ -1048,8 +1004,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
              Whether to join the compartment data together into one dataset.
          joins: str:
              DuckDB-compatible SQL which will be used to perform the join operations.
-         chunk_columns: Optional[Union[List[str], Tuple[str, ...]]],
-             Column names which appear in all compartments to use when performing join.
          chunk_size: Optional[int],
              Size of join chunks which is used to limit data size during join ops.
          infer_common_schema: bool: (Default value = True)
@@ -1074,7 +1028,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      from cytotable.convert import (
          _concat_join_sources,
          _concat_source_group,
-         _get_join_chunks,
          _get_table_chunk_offsets,
          _infer_source_group_common_schema,
          _join_source_chunk,
@@ -1161,7 +1114,6 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                                  chunk_size=chunk_size,
                                  offset=offset,
                                  dest_path=expanded_dest_path,
-                                 data_type_cast_map=data_type_cast_map,
                              ),
                              source_group_name=source_group_name,
                              identifying_columns=identifying_columns,
@@ -1210,6 +1162,8 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
      # conditional section for merging
      # note: join implies a concat, but concat does not imply a join
      if join:
+         prepared_joins_sql = _prepare_join_sql(sources=results, joins=joins).result()
+
          # map joined results based on the join groups gathered above
          # note: after mapping we end up with a list of strings (task returns str)
          join_sources_result = [
@@ -1217,21 +1171,18 @@ def _to_parquet(  # pylint: disable=too-many-arguments, too-many-locals
                  # gather the result of concatted sources prior to
                  # join group merging as each mapped task run will need
                  # full concat results
-                 sources=results,
                  dest_path=expanded_dest_path,
-                 joins=joins,
-                 # get merging chunks by join columns
-                 join_group=join_group,
+                 joins=prepared_joins_sql,
+                 chunk_size=chunk_size,
+                 offset=offset,
                  drop_null=drop_null,
              ).result()
              # create join group for querying the concatenated
              # data in order to perform memory-safe joining
              # per user chunk size specification.
-             for join_group in _get_join_chunks(
-                 sources=results,
-                 chunk_columns=chunk_columns,
+             for offset in _get_table_chunk_offsets(
+                 sql_stmt=prepared_joins_sql,
                  chunk_size=chunk_size,
-                 metadata=metadata,
              ).result()
          ]

@@ -1259,7 +1210,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
      concat: bool = True,
      join: bool = True,
      joins: Optional[str] = None,
-     chunk_columns: Optional[Union[List[str], Tuple[str, ...]]] = None,
      chunk_size: Optional[int] = None,
      infer_common_schema: bool = True,
      drop_null: bool = False,
@@ -1303,9 +1253,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
              Whether to join the compartment data together into one dataset
          joins: str: (Default value = None):
              DuckDB-compatible SQL which will be used to perform the join operations.
-         chunk_columns: Optional[Union[List[str], Tuple[str, ...]]]
-             (Default value = None)
-             Column names which appear in all compartments to use when performing join
          chunk_size: Optional[int] (Default value = None)
              Size of join chunks which is used to limit data size during join ops
          infer_common_schema: bool: (Default value = True)
@@ -1402,11 +1349,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
              else identifying_columns
          )
          joins = cast(str, config[preset]["CONFIG_JOINS"]) if joins is None else joins
-         chunk_columns = (
-             cast(list, config[preset]["CONFIG_CHUNK_COLUMNS"])
-             if chunk_columns is None
-             else chunk_columns
-         )
          chunk_size = (
              cast(int, config[preset]["CONFIG_CHUNK_SIZE"])
              if chunk_size is None
@@ -1425,7 +1367,6 @@ def convert(  # pylint: disable=too-many-arguments,too-many-locals
          concat=concat,
          join=join,
          joins=joins,
-         chunk_columns=chunk_columns,
          chunk_size=chunk_size,
          infer_common_schema=infer_common_schema,
          drop_null=drop_null,
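For callers, the only signature change visible in this diff is that `chunk_columns` disappears from `convert()`; join chunking is now driven by `chunk_size` alone over the prepared join SQL. A hedged example call against 0.0.3 (the paths and preset value are placeholders):

```python
import cytotable

result = cytotable.convert(
    source_path="./examples/data/all_cellprofiler.sqlite",  # placeholder input
    dest_path="./all_cellprofiler.parquet",  # placeholder output
    dest_datatype="parquet",
    preset="cellprofiler_sqlite",
    # chunk_columns=... was removed in 0.0.3; chunk_size alone controls join chunking
    chunk_size=1000,
)
```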
--- a/cytotable/presets.py
+++ b/cytotable/presets.py
@@ -26,8 +26,6 @@ config = {
          # note: this number is an estimate and is may need changes contingent on data
          # and system used by this library.
          "CONFIG_CHUNK_SIZE": 1000,
-         # chunking columns to use along with chunk size for join operations
-         "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
@@ -73,8 +71,6 @@ config = {
          # note: this number is an estimate and is may need changes contingent on data
          # and system used by this library.
          "CONFIG_CHUNK_SIZE": 1000,
-         # chunking columns to use along with chunk size for join operations
-         "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
@@ -126,8 +122,6 @@ config = {
          # note: this number is an estimate and is may need changes contingent on data
          # and system used by this library.
          "CONFIG_CHUNK_SIZE": 1000,
-         # chunking columns to use along with chunk size for join operations
-         "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
@@ -181,8 +175,6 @@ config = {
          # note: this number is an estimate and is may need changes contingent on data
          # and system used by this library.
          "CONFIG_CHUNK_SIZE": 1000,
-         # chunking columns to use along with chunk size for join operations
-         "CONFIG_CHUNK_COLUMNS": ("Metadata_ImageNumber",),
          # compartment and metadata joins performed using DuckDB SQL
          # and modified at runtime as needed
          "CONFIG_JOINS": """
--- a/cytotable/utils.py
+++ b/cytotable/utils.py
@@ -14,7 +14,7 @@ from cloudpathlib import AnyPath, CloudPath
  from cloudpathlib.exceptions import InvalidPrefixError
  from parsl.app.app import AppBase
  from parsl.config import Config
- from parsl.errors import ConfigurationError
+ from parsl.errors import NoDataFlowKernelError
  from parsl.executors import HighThroughputExecutor

  logger = logging.getLogger(__name__)
@@ -108,15 +108,10 @@ def _parsl_loaded() -> bool:
      try:
          # try to reference Parsl dataflowkernel
          parsl.dfk()
-     except ConfigurationError as pce:
-         # if we detect a Parsl ConfigurationError that states we need to load config
+     except NoDataFlowKernelError:
+         # if we detect a Parsl NoDataFlowKernelError
          # return false to indicate parsl config has not yet been loaded.
-         if pce.args[0] == "Must first load config":
-             return False
-
-         # otherwise we raise other ConfigurationError's
-         else:
-             raise
+         return False

      # otherwise we indicate parsl config has already been loaded
      return True
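Recent Parsl releases raise a dedicated `NoDataFlowKernelError` when `parsl.dfk()` is called before any configuration is loaded, which is why the minimum Parsl pin moves to `2023.9.25` and the string match on `ConfigurationError` goes away. A small sketch of the simplified check (assuming a Parsl version new enough to provide `NoDataFlowKernelError`):

```python
import parsl
from parsl.errors import NoDataFlowKernelError


def parsl_loaded() -> bool:
    """Return True if a Parsl DataFlowKernel (i.e. a config) is already loaded."""
    try:
        # referencing the DataFlowKernel raises if no config has been loaded yet
        parsl.dfk()
    except NoDataFlowKernelError:
        return False
    return True
```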
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "CytoTable"
- version = "0.0.2"
+ version = "0.0.3"
  description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
  authors = ["Cytomining Community"]
  license = "BSD-3-Clause License"
@@ -14,8 +14,8 @@ keywords = ["python", "cellprofiler","single-cell-analysis", "way-lab"]
  python = ">=3.8,<3.13"
  pyarrow = "^13.0.0"
  cloudpathlib = {extras = ["all"], version = "^0.15.0"}
- duckdb = "^0.8.0"
- parsl = ">=2023.9.18"
+ duckdb = ">=0.8.0"
+ parsl = ">=2023.9.25"

  [tool.poetry.dev-dependencies]
  pytest = "^7.4.0"
--- a/readme.md
+++ b/readme.md
@@ -2,7 +2,7 @@

  # CytoTable

- ![dataflow](docs/source/_static/dataflow.svg)
+ ![dataflow](https://raw.githubusercontent.com/cytomining/cytotable/main/docs/source/_static/dataflow.svg?raw=true)
  _Diagram showing data flow relative to this project._

  ## Summary
@@ -13,9 +13,13 @@ The Parquet files will have a unified and documented data model, including refer

  ## Installation

- Install CytoTable with the following command:
+ Install CytoTable from [PyPI](https://pypi.org/) or from source:

  ```shell
+ # install from pypi
+ pip install cytotable
+
+ # install directly from source
  pip install git+https://github.com/cytomining/CytoTable.git
  ```
