CytoTable 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cytotable/__init__.py CHANGED
@@ -3,7 +3,7 @@ __init__.py for cytotable
3
3
  """
4
4
 
5
5
  # note: version data is maintained by poetry-dynamic-versioning (do not edit)
6
- __version__ = "0.0.8"
6
+ __version__ = "0.0.9"
7
7
 
8
8
  from .convert import convert
9
9
  from .exceptions import (
cytotable/convert.py CHANGED
@@ -46,11 +46,12 @@ def _get_table_columns_and_types(
46
46
  import pathlib
47
47
 
48
48
  import duckdb
49
+ from cloudpathlib import AnyPath
49
50
 
50
51
  from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
51
52
 
52
53
  source_path = source["source_path"]
53
- source_type = str(pathlib.Path(source_path).suffix).lower()
54
+ source_type = str(source_path.suffix).lower()
54
55
 
55
56
  # prepare the data source in the form of a duckdb query
56
57
  select_source = (
@@ -209,7 +210,7 @@ def _get_table_chunk_offsets(
209
210
  import pathlib
210
211
 
211
212
  import duckdb
212
- from cloudpathlib import AnyPath
213
+ from cloudpathlib import AnyPath, CloudPath
213
214
 
214
215
  from cytotable.exceptions import NoInputDataException
215
216
  from cytotable.utils import _duckdb_reader
@@ -219,18 +220,9 @@ def _get_table_chunk_offsets(
219
220
  if source is not None:
220
221
  table_name = source["table_name"] if "table_name" in source.keys() else None
221
222
  source_path = source["source_path"]
222
- source_type = str(pathlib.Path(source_path).suffix).lower()
223
+ source_type = str(source_path.suffix).lower()
223
224
 
224
225
  try:
225
- # for csv's, check that we have more than one row (a header and data values)
226
- if (
227
- source_type == ".csv"
228
- and sum(1 for _ in AnyPath(source_path).open("r")) <= 1
229
- ):
230
- raise NoInputDataException(
231
- f"Data file has 0 rows of values. Error in file: {source_path}"
232
- )
233
-
234
226
  # gather the total rowcount from csv or sqlite data input sources
235
227
  with _duckdb_reader() as ddb_reader:
236
228
  rowcount = int(
@@ -322,8 +314,8 @@ def _source_chunk_to_parquet(
322
314
 
323
315
  # attempt to build dest_path
324
316
  source_dest_path = (
325
- f"{dest_path}/{str(pathlib.Path(source_group_name).stem).lower()}/"
326
- f"{str(pathlib.Path(source['source_path']).parent.name).lower()}"
317
+ f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
318
+ f"{str(source['source_path'].parent.name).lower()}"
327
319
  )
328
320
  pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
329
321
 
@@ -364,11 +356,11 @@ def _source_chunk_to_parquet(
364
356
 
365
357
  # build output query and filepath base
366
358
  # (chunked output will append offset to keep output paths unique)
367
- if str(AnyPath(source["source_path"]).suffix).lower() == ".csv":
359
+ if str(source["source_path"].suffix).lower() == ".csv":
368
360
  base_query = f"SELECT {select_columns} FROM read_csv_auto('{str(source['source_path'])}', header=TRUE, delim=',')"
369
361
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}"
370
362
 
371
- elif str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite":
363
+ elif str(source["source_path"].suffix).lower() == ".sqlite":
372
364
  base_query = f"SELECT {select_columns} FROM sqlite_scan('{str(source['source_path'])}', '{str(source['table_name'])}')"
373
365
  result_filepath_base = f"{source_dest_path}/{str(source['source_path'].stem)}.{source['table_name']}"
374
366
 
@@ -405,7 +397,7 @@ def _source_chunk_to_parquet(
405
397
  # to handle the mixed types
406
398
  if (
407
399
  "Mismatch Type Error" in str(e)
408
- and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
400
+ and str(source["source_path"].suffix).lower() == ".sqlite"
409
401
  ):
410
402
  _write_parquet_table_with_metadata(
411
403
  # here we use sqlite instead of duckdb to extract
@@ -817,6 +809,7 @@ def _join_source_chunk(
817
809
  exclude_meta_cols = [
818
810
  f"c NOT LIKE '{col}%'" for col in list(CYOTABLE_META_COLUMN_TYPES.keys())
819
811
  ]
812
+
820
813
  with _duckdb_reader() as ddb_reader:
821
814
  result = ddb_reader.execute(
822
815
  f"""
@@ -1114,7 +1107,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals
1114
1107
  else []
1115
1108
  ),
1116
1109
  **kwargs,
1117
- ).result()
1110
+ )
1118
1111
 
1119
1112
  # expand the destination path
1120
1113
  expanded_dest_path = _expand_path(path=dest_path)
cytotable/presets.py CHANGED
@@ -85,6 +85,54 @@ config = {
85
85
  AND per_nuclei.Nuclei_Number_Object_Number = per_cytoplasm.Cytoplasm_Parent_Nuclei
86
86
  """,
87
87
  },
88
+ "cellprofiler_sqlite_cpg0016_jump": {
89
+ # version specifications using related references
90
+ "CONFIG_SOURCE_VERSION": {
91
+ "cellprofiler": "v4.0.0",
92
+ },
93
+ # names of source table compartments (for ex. cells.csv, etc.)
94
+ "CONFIG_NAMES_COMPARTMENTS": ("cells", "nuclei", "cytoplasm"),
95
+ # names of source table metadata (for ex. image.csv, etc.)
96
+ "CONFIG_NAMES_METADATA": ("image",),
97
+ # column names in any compartment or metadata tables which contain
98
+ # unique names to avoid renaming
99
+ "CONFIG_IDENTIFYING_COLUMNS": (
100
+ "ImageNumber",
101
+ "ObjectNumber",
102
+ "Metadata_Well",
103
+ "Metadata_Plate",
104
+ "Parent_Cells",
105
+ "Parent_Nuclei",
106
+ ),
107
+ # chunk size to use for join operations to help with possible performance issues
108
 + # note: this number is an estimate and may need changes contingent on data
109
+ # and system used by this library.
110
+ "CONFIG_CHUNK_SIZE": 1000,
111
+ # compartment and metadata joins performed using DuckDB SQL
112
+ # and modified at runtime as needed
113
+ "CONFIG_JOINS": """
114
+ SELECT
115
+ image.Image_TableNumber,
116
+ image.Metadata_ImageNumber,
117
+ image.Metadata_Plate,
118
+ image.Metadata_Well,
119
+ image.Image_Metadata_Site,
120
+ image.Image_Metadata_Row,
121
+ cytoplasm.* EXCLUDE (Metadata_ImageNumber),
122
+ cells.* EXCLUDE (Metadata_ImageNumber),
123
+ nuclei.* EXCLUDE (Metadata_ImageNumber)
124
+ FROM
125
+ read_parquet('cytoplasm.parquet') AS cytoplasm
126
+ LEFT JOIN read_parquet('cells.parquet') AS cells ON
127
+ cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
128
+ AND cells.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Cells
129
+ LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON
130
+ nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
131
+ AND nuclei.Metadata_ObjectNumber = cytoplasm.Cytoplasm_Parent_Nuclei
132
+ LEFT JOIN read_parquet('image.parquet') AS image ON
133
+ image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber
134
+ """,
135
+ },
88
136
  "cellprofiler_sqlite_pycytominer": {
89
137
  # version specifications using related references
90
138
  "CONFIG_SOURCE_VERSION": {
cytotable/sources.py CHANGED
@@ -7,13 +7,11 @@ import pathlib
7
7
  from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from cloudpathlib import AnyPath
10
- from parsl.app.app import join_app, python_app
11
10
 
11
+ from cytotable.exceptions import NoInputDataException
12
12
 
13
- @python_app
14
- def _build_path(
15
- path: Union[str, pathlib.Path, AnyPath], **kwargs
16
- ) -> Union[pathlib.Path, AnyPath]:
13
+
14
+ def _build_path(path: str, **kwargs) -> Union[pathlib.Path, AnyPath]:
17
15
  """
18
16
  Build a path client or return local path.
19
17
 
@@ -43,10 +41,9 @@ def _build_path(
43
41
  return processed_path
44
42
 
45
43
 
46
- @python_app
47
44
  def _get_source_filepaths(
48
45
  path: Union[pathlib.Path, AnyPath],
49
- targets: List[str],
46
+ targets: Optional[List[str]] = None,
50
47
  source_datatype: Optional[str] = None,
51
48
  ) -> Dict[str, List[Dict[str, Any]]]:
52
49
  """
@@ -75,7 +72,7 @@ def _get_source_filepaths(
75
72
 
76
73
  if (targets is None or targets == []) and source_datatype is None:
77
74
  raise DatatypeException(
78
- f"A source_datatype must be specified when using undefined compartments and metadata names."
75
+ "A source_datatype must be specified when using undefined compartments and metadata names."
79
76
  )
80
77
 
81
78
  # gathers files from provided path using compartments + metadata as a filter
@@ -87,9 +84,9 @@ def _get_source_filepaths(
87
84
  for subpath in (
88
85
  (path,)
89
86
  # used if the source path is a single file
90
- if AnyPath(path).is_file()
87
+ if path.is_file()
91
88
  # iterates through a source directory
92
- else (x for x in AnyPath(path).glob("**/*") if AnyPath(x).is_file())
89
+ else (x for x in path.glob("**/*") if x.is_file())
93
90
  )
94
91
  # ensure the subpaths meet certain specifications
95
92
  if (
@@ -129,7 +126,8 @@ def _get_source_filepaths(
129
126
  .arrow()["table_name"]
130
127
  .to_pylist()
131
128
  # make sure the table names match with compartment + metadata names
132
- if any(target.lower() in table_name.lower() for target in targets)
129
+ if targets is not None
130
+ and any(target.lower() in table_name.lower() for target in targets)
133
131
  ]
134
132
  else:
135
133
  # if we don't have sqlite source, append the existing element
@@ -181,7 +179,6 @@ def _get_source_filepaths(
181
179
  return grouped_sources
182
180
 
183
181
 
184
- @python_app
185
182
  def _infer_source_datatype(
186
183
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: Optional[str] = None
187
184
  ) -> str:
@@ -230,7 +227,6 @@ def _infer_source_datatype(
230
227
  return source_datatype
231
228
 
232
229
 
233
- @python_app
234
230
  def _filter_source_filepaths(
235
231
  sources: Dict[str, List[Dict[str, Any]]], source_datatype: str
236
232
  ) -> Dict[str, List[Dict[str, Any]]]:
@@ -260,12 +256,45 @@ def _filter_source_filepaths(
260
256
  if file["source_path"].stat().st_size > 0
261
257
  # ensure the datatype matches the source datatype
262
258
  and file["source_path"].suffix == f".{source_datatype}"
259
+ and _file_is_more_than_one_line(path=file["source_path"])
263
260
  ]
264
261
  for filegroup, files in sources.items()
265
262
  }
266
263
 
267
264
 
268
- @join_app
265
+ def _file_is_more_than_one_line(path: Union[pathlib.Path, AnyPath]) -> bool:
266
+ """
267
+ Check if the file has more than one line.
268
+
269
+ Args:
270
+ path (Union[pathlib.Path, AnyPath]):
271
+ The path to the file.
272
+
273
+ Returns:
274
+ bool:
275
+ True if the file has more than one line, False otherwise.
276
+
277
+ Raises:
278
+ NoInputDataException: If the file has zero lines.
279
+ """
280
+
281
+ # if we don't have a sqlite file
282
+ # (we can't check sqlite files for lines)
283
+ if path.suffix.lower() != ".sqlite":
284
+ with path.open("r") as f:
285
+ try:
286
+ # read two lines, if the second is empty return false
287
+ return bool(f.readline() and f.readline())
288
+
289
+ except StopIteration:
290
+ # If we encounter the end of the file, it has only one line
291
+ raise NoInputDataException(
292
+ f"Data file has 0 rows of values. Error in file: {path}"
293
+ )
294
+ else:
295
+ return True
296
+
297
+
269
298
  def _gather_sources(
270
299
  source_path: str,
271
300
  source_datatype: Optional[str] = None,
@@ -295,11 +324,11 @@ def _gather_sources(
295
324
  _infer_source_datatype,
296
325
  )
297
326
 
298
- source_path = _build_path(path=source_path, **kwargs)
327
+ built_path = _build_path(path=source_path, **kwargs)
299
328
 
300
329
  # gather filepaths which will be used as the basis for this work
301
330
  sources = _get_source_filepaths(
302
- path=source_path, targets=targets, source_datatype=source_datatype
331
+ path=built_path, targets=targets, source_datatype=source_datatype
303
332
  )
304
333
 
305
334
  # infer or validate the source datatype based on source filepaths
cytotable/utils.py CHANGED
@@ -149,6 +149,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
149
149
  INSTALL sqlite_scanner;
150
150
  LOAD sqlite_scanner;
151
151
 
152
+ /* Install httpfs plugin to avoid error
153
+ https://github.com/duckdb/duckdb/issues/3243 */
154
+ INSTALL httpfs;
155
+
152
156
  /*
153
157
  Set threads available to duckdb
154
158
  See the following for more information:
@@ -322,7 +326,7 @@ def _sqlite_mixed_type_query_to_parquet(
322
326
  return pa.Table.from_pylist(results)
323
327
 
324
328
 
325
- def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
329
+ def _cache_cloudpath_to_local(path: AnyPath) -> pathlib.Path:
326
330
  """
327
331
  Takes a cloudpath and uses cache to convert to a local copy
328
332
  for use in scenarios where remote work is not possible (sqlite).
@@ -337,24 +341,25 @@ def _cache_cloudpath_to_local(path: Union[str, AnyPath]) -> pathlib.Path:
337
341
  A local pathlib.Path to cached version of cloudpath file.
338
342
  """
339
343
 
340
- candidate_path = AnyPath(path)
341
-
342
344
  # check that the path is a file (caching won't work with a dir)
343
345
  # and check that the file is of sqlite type
344
346
  # (other file types will be handled remotely in cloud)
345
- if candidate_path.is_file() and candidate_path.suffix.lower() == ".sqlite":
347
+ if (
348
+ isinstance(path, CloudPath)
349
+ and path.is_file()
350
+ and path.suffix.lower() == ".sqlite"
351
+ ):
346
352
  try:
347
353
  # update the path to be the local filepath for reference in CytoTable ops
348
354
  # note: incurs a data read which will trigger caching of the file
349
- path = CloudPath(path).fspath
355
+ path = pathlib.Path(path.fspath)
350
356
  except InvalidPrefixError:
351
357
  # share information about not finding a cloud path
352
358
  logger.info(
353
359
  "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
354
360
  )
355
361
 
356
- # cast the result as a pathlib.Path
357
- return pathlib.Path(path)
362
+ return path
358
363
 
359
364
 
360
365
  def _arrow_type_cast_if_specified(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: CytoTable
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
5
5
  Home-page: https://github.com/cytomining/CytoTable
6
6
  License: BSD-3-Clause License
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
- Requires-Dist: cloudpathlib[all] (>=0.18.0,<0.19.0)
17
+ Requires-Dist: cloudpathlib[all,s3] (>=0.18.0,<0.19.0)
18
18
  Requires-Dist: duckdb (>=0.10.1)
19
19
  Requires-Dist: numpy (<=1.24.4) ; python_version < "3.12"
20
20
  Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
@@ -0,0 +1,11 @@
1
+ cytotable/__init__.py,sha256=OK8rwVqJ4PSMukLgdhGEOGAtSc-NHp-dtOln2ER83iE,315
2
+ cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
3
+ cytotable/convert.py,sha256=TDPWMYCXrLReaixxS-aLQfK22ZfzvQ0Qsc4RmyHQd-Y,54458
4
+ cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
5
+ cytotable/presets.py,sha256=iiTzOj6AyYr7kJXspbN7N-6YIhCD7kmV-vQErwNm3U0,12405
6
+ cytotable/sources.py,sha256=TY4dkbwh1PDCNapmMHE09Ey7QPYPhmp5DeErh3Wp4rw,12283
7
+ cytotable/utils.py,sha256=Asy-hfZWZ4mGRE0zi7PYLqaShtvLM2qJoHCOaHjHOWo,19431
8
+ cytotable-0.0.9.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
9
+ cytotable-0.0.9.dist-info/METADATA,sha256=yUED1TmK-FWe8zIL2T2nRDey6ygHlqt9dXKyRo9QFhY,3423
10
+ cytotable-0.0.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
11
+ cytotable-0.0.9.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- cytotable/__init__.py,sha256=hBU893kcWONEc1iC3OoKg5hGyjWso3EzPpFAQocofU8,315
2
- cytotable/constants.py,sha256=5ndA_0fNL66O1Mt6HPkuZGgK2VSUiBF839c7dV_w8EY,2097
3
- cytotable/convert.py,sha256=LncoO0UQj5RDgJYoMVBP7aQ2b9qNI4FaqCCP7IbuESg,54870
4
- cytotable/exceptions.py,sha256=NhkMswjCB0HeVHqlLXzBlyHunQIp_4eBFmyAPu0Nf30,482
5
- cytotable/presets.py,sha256=YgxCsCLfbOK91Kebo4ZxI9t-WE-nHENITCC6JXmOV9I,10105
6
- cytotable/sources.py,sha256=zvkYMJOTBJVgFFSbkfpjFMwlOu4ifhxYALh71NGKEuM,11283
7
- cytotable/utils.py,sha256=JIvmNe9uD71MeUx0t5gMvUNVWpoSYNugtXNjsknjmu0,19357
8
- cytotable-0.0.8.dist-info/LICENSE,sha256=lPK3PtUMP-f1EOFMUr8h3FvuMh89x249Hvm4lchTsv0,1528
9
- cytotable-0.0.8.dist-info/METADATA,sha256=qBqn3Vhmg-X7Y6N0yISwQtXNcj1qWe_JSUcx9XSt0y0,3420
10
- cytotable-0.0.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
11
- cytotable-0.0.8.dist-info/RECORD,,