CytoTable 0.0.3.tar.gz → 0.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: CytoTable
-Version: 0.0.3
+Version: 0.0.4
 Summary: Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools.
 Home-page: https://github.com/cytomining/CytoTable
 License: BSD-3-Clause License
@@ -13,10 +13,11 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: cloudpathlib[all] (>=0.15.0,<0.16.0)
 Requires-Dist: duckdb (>=0.8.0)
 Requires-Dist: parsl (>=2023.9.25)
-Requires-Dist: pyarrow (>=13.0.0,<14.0.0)
+Requires-Dist: pyarrow (>=13.0.0)
 Project-URL: Documentation, https://cytomining.github.io/CytoTable/
 Project-URL: Repository, https://github.com/cytomining/CytoTable
 Description-Content-Type: text/markdown
@@ -30,10 +31,17 @@ _Diagram showing data flow relative to this project._
 
 ## Summary
 
-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
 CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
 The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
 
+The name for the project is inspired from:
+
+- __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+- __Table__:
+  - "1. Furniture with a top surface to accommodate a variety of uses."
+  - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
 ## Installation
 
 Install CytoTable from [PyPI](https://pypi.org/) or from source:
@@ -1,6 +1,10 @@
 """
 __init__.py for cytotable
 """
+
+# note: version data is maintained by poetry-dynamic-versioning (do not edit)
+__version__ = "0.0.4"
+
 from .convert import convert
 from .exceptions import (
     CytoTableException,
@@ -0,0 +1,74 @@
+"""
+CytoTable: constants - storing various constants to be used throughout cytotable.
+"""
+
+import multiprocessing
+import os
+from typing import cast
+
+from cytotable.utils import _get_cytotable_version
+
+# read max threads from environment if necessary
+# max threads will be used with default Parsl config and Duckdb
+MAX_THREADS = (
+    multiprocessing.cpu_count()
+    if "CYTOTABLE_MAX_THREADS" not in os.environ
+    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
+)
+
+# enables overriding default memory mapping behavior with pyarrow memory mapping
+CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
+    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
+)
+
+DDB_DATA_TYPE_SYNONYMS = {
+    "real": ["float32", "float4", "float"],
+    "double": ["float64", "float8", "numeric", "decimal"],
+    "integer": ["int32", "int4", "int", "signed"],
+    "bigint": ["int64", "int8", "long"],
+}
+
+# A reference dictionary for SQLite affinity and storage class types
+# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
+SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
+    "integer": [
+        "int",
+        "integer",
+        "tinyint",
+        "smallint",
+        "mediumint",
+        "bigint",
+        "unsigned big int",
+        "int2",
+        "int8",
+    ],
+    "text": [
+        "character",
+        "varchar",
+        "varying character",
+        "nchar",
+        "native character",
+        "nvarchar",
+        "text",
+        "clob",
+    ],
+    "blob": ["blob"],
+    "real": [
+        "real",
+        "double",
+        "double precision",
+        "float",
+    ],
+    "numeric": [
+        "numeric",
+        "decimal",
+        "boolean",
+        "date",
+        "datetime",
+    ],
+}
+
+CYTOTABLE_DEFAULT_PARQUET_METADATA = {
+    "data-producer": "https://github.com/cytomining/CytoTable",
+    "data-producer-version": str(_get_cytotable_version()),
+}
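
Both environment toggles in the new constants module are evaluated at import time, so they only take effect when set before `cytotable` is imported. A minimal sketch (the values here are illustrative, not defaults):

```python
import os

# assumption: set these before importing cytotable, since
# cytotable.constants evaluates them at import time
os.environ["CYTOTABLE_MAX_THREADS"] = "4"  # cap threads for DuckDB + Parsl defaults
os.environ["CYTOTABLE_ARROW_USE_MEMORY_MAPPING"] = "0"  # disable Arrow memory mapping

from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING, MAX_THREADS

print(MAX_THREADS)  # 4
print(CYTOTABLE_ARROW_USE_MEMORY_MAPPING)  # False
```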
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
             segment_type as column_dtype
         FROM pragma_storage_info('column_details')
         /* avoid duplicate entries in the form of VALIDITY segment_types */
-        WHERE segment_type != 'VALIDITY';
+        WHERE segment_type != 'VALIDITY'
+        /* explicitly order the columns by their id to avoid inconsistent results */
+        ORDER BY column_id ASC;
     """
 
     # attempt to read the data to parquet from duckdb
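
`pragma_storage_info` reports one row per storage segment, and DuckDB does not guarantee row order, which is what the added `ORDER BY column_id ASC` pins down. A standalone sketch of the same query shape (the table here is illustrative):

```python
import duckdb

with duckdb.connect() as ddb:
    # stand-in for the table whose columns CytoTable inspects
    ddb.execute("CREATE TABLE column_details AS SELECT 1 AS col_a, 'x' AS col_b;")
    rows = ddb.execute(
        """
        SELECT DISTINCT column_id, column_name, segment_type
        FROM pragma_storage_info('column_details')
        WHERE segment_type != 'VALIDITY'
        ORDER BY column_id ASC;
        """
    ).fetchall()
    # columns now come back in declaration order: col_a (id 0), then col_b (id 1)
    print(rows)
```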
@@ -302,7 +304,11 @@ def _source_chunk_to_parquet(
     from cloudpathlib import AnyPath
     from pyarrow import parquet
 
-    from cytotable.utils import _duckdb_reader, _sqlite_mixed_type_query_to_parquet
+    from cytotable.utils import (
+        _duckdb_reader,
+        _sqlite_mixed_type_query_to_parquet,
+        _write_parquet_table_with_metadata,
+    )
 
     # attempt to build dest_path
     source_dest_path = (
@@ -315,7 +321,7 @@ def _source_chunk_to_parquet(
     select_columns = ",".join(
         [
             # here we cast the column to the specified type ensure the colname remains the same
-            f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+            f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
            for column in source["columns"]
        ]
    )
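
Wrapping the column name in double quotes lets identifiers that are not bare words, such as the IN Carta preset's `OBJECT ID` or `WELL LABEL`, pass through the CAST unchanged. A small illustration:

```python
import duckdb

with duckdb.connect() as ddb:
    ddb.execute('CREATE TABLE t AS SELECT 1 AS "OBJECT ID";')
    # an unquoted identifier containing a space would be a parse error;
    # double quotes preserve the exact column name through the cast
    print(ddb.execute('SELECT CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID" FROM t;').fetchall())
```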
@@ -339,7 +345,7 @@ def _source_chunk_to_parquet(
         # read data with chunk size + offset
         # and export to parquet
         with _duckdb_reader() as ddb_reader:
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                table=ddb_reader.execute(
                    f"""
                    {base_query}
@@ -358,7 +364,7 @@ def _source_chunk_to_parquet(
             "Mismatch Type Error" in str(e)
             and str(AnyPath(source["source_path"]).suffix).lower() == ".sqlite"
         ):
-            parquet.write_table(
+            _write_parquet_table_with_metadata(
                # here we use sqlite instead of duckdb to extract
                # data for special cases where column and value types
                # may not align (which is valid functionality in SQLite).
@@ -410,14 +416,28 @@ def _prepend_column_name(
         Path to the modified file.
     """
 
+    import logging
     import pathlib
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
+
+    logger = logging.getLogger(__name__)
 
     targets = tuple(metadata) + tuple(compartments)
 
+    # if we have no targets or metadata to work from, return the table unchanged
+    if len(targets) == 0:
+        logger.warning(
+            msg=(
+                "Skipping column name prepend operations "
+                "because no compartments or metadata were provided."
+            )
+        )
+        return table_path
+
     table = parquet.read_table(
         source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
     )
@@ -499,7 +519,7 @@ def _prepend_column_name(
     updated_column_names.append(column_name)
 
     # perform table column name updates
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
        table=table.rename_columns(updated_column_names), where=table_path
    )
 
@@ -564,13 +584,18 @@ def _concat_source_group(
         Updated dictionary containing concatenated sources.
     """
 
+    import errno
     import pathlib
 
     import pyarrow as pa
     import pyarrow.parquet as parquet
 
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
     from cytotable.exceptions import SchemaException
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # build a result placeholder
     concatted: List[Dict[str, Any]] = [
@@ -600,7 +625,9 @@ def _concat_source_group(
     destination_path.parent.mkdir(parents=True, exist_ok=True)
 
     # build the schema for concatenation writer
-    writer_schema = pa.schema(common_schema)
+    writer_schema = pa.schema(common_schema).with_metadata(
+        CYTOTABLE_DEFAULT_PARQUET_METADATA
+    )
 
     # build a parquet file writer which will be used to append files
     # as a single concatted parquet file, referencing the first file's schema
@@ -638,7 +665,7 @@ def _concat_source_group(
         pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
     except OSError as os_err:
         # raise only if we don't have a dir not empty errno
-        if os_err.errno != 66:
+        if os_err.errno != errno.ENOTEMPTY:
             raise
 
     # return the concatted parquet filename
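
Replacing the magic number is a portability fix: `ENOTEMPTY` is 39 on Linux but 66 on macOS and the BSDs, so the literal `66` only matched the error on some platforms:

```python
import errno

# the numeric value of ENOTEMPTY varies by platform (39 on Linux, 66 on macOS/BSD),
# so comparing against the symbolic constant works everywhere
print(errno.ENOTEMPTY, errno.errorcode[errno.ENOTEMPTY])
```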
@@ -713,7 +740,7 @@ def _join_source_chunk(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import _duckdb_reader
+    from cytotable.utils import _duckdb_reader, _write_parquet_table_with_metadata
 
     # Attempt to read the data to parquet file
     # using duckdb for extraction and pyarrow for
@@ -757,7 +784,7 @@ def _join_source_chunk(
     )
 
     # write the result
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
        table=result,
        where=result_file_path,
    )
@@ -797,7 +824,11 @@ def _concat_join_sources(
 
     import pyarrow.parquet as parquet
 
-    from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+    from cytotable.constants import (
+        CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+        CYTOTABLE_DEFAULT_PARQUET_METADATA,
+    )
+    from cytotable.utils import _write_parquet_table_with_metadata
 
     # remove the unjoined concatted compartments to prepare final dest_path usage
     # (we now have joined results)
@@ -811,7 +842,7 @@ def _concat_join_sources(
         shutil.rmtree(path=dest_path)
 
     # write the concatted result as a parquet file
-    parquet.write_table(
+    _write_parquet_table_with_metadata(
        table=pa.concat_tables(
            tables=[
                parquet.read_table(
@@ -826,7 +857,9 @@ def _concat_join_sources(
     # build a parquet file writer which will be used to append files
     # as a single concatted parquet file, referencing the first file's schema
     # (all must be the same schema)
-    writer_schema = parquet.read_schema(join_sources[0])
+    writer_schema = parquet.read_schema(join_sources[0]).with_metadata(
+        CYTOTABLE_DEFAULT_PARQUET_METADATA
+    )
     with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
        for table_path in join_sources:
            writer.write_table(
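
`with_metadata` returns a copy of the schema with its key/value metadata replaced (Arrow schemas are immutable), so every file appended through the `ParquetWriter` carries the producer tags. A minimal sketch:

```python
import pyarrow as pa

schema = pa.schema([pa.field("x", pa.int64())])
tagged = schema.with_metadata(
    {"data-producer": "https://github.com/cytomining/CytoTable"}
)

print(schema.metadata)  # None: the original schema is untouched
print(tagged.metadata)  # {b'data-producer': b'https://github.com/cytomining/CytoTable'}
```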
@@ -1,5 +1,5 @@
 """
-Presets for common pycytominer-transform configurations.
+Presets for common CytoTable configurations.
 """
 
 config = {
@@ -204,7 +204,35 @@ config = {
         AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
         """,
     },
+    "in-carta": {
+        # version specifications using related references
+        "CONFIG_SOURCE_VERSION": {
+            "in-carta": "v1.17.0412545",
+        },
+        # names of source table compartments (for ex. cells.csv, etc.)
+        "CONFIG_NAMES_COMPARTMENTS": tuple(),
+        # names of source table metadata (for ex. image.csv, etc.)
+        "CONFIG_NAMES_METADATA": tuple(),
+        # column names in any compartment or metadata tables which contain
+        # unique names to avoid renaming
+        "CONFIG_IDENTIFYING_COLUMNS": (
+            "OBJECT ID",
+            "Row",
+            "Column",
+            "FOV",
+            "WELL LABEL",
+            "Z",
+            "T",
+        ),
+        # chunk size to use for join operations to help with possible performance issues
+        # note: this number is an estimate and may need changes contingent on data
+        # and system used by this library.
+        "CONFIG_CHUNK_SIZE": 1000,
+        # compartment and metadata joins performed using DuckDB SQL
+        # and modified at runtime as needed
+        "CONFIG_JOINS": "",
+    },
 }
 """
-Configuration presets for pycytominer-transform
+Configuration presets for CytoTable
 """
@@ -47,6 +47,7 @@ def _build_path(
 def _get_source_filepaths(
     path: Union[pathlib.Path, AnyPath],
     targets: List[str],
+    source_datatype: Optional[str] = None,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """
     Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
         Either a directory path to seek filepaths within or a path directly to a file.
     targets: List[str]:
         Compartment and metadata names to seek within the provided path.
+    source_datatype: Optional[str]: (Default value = None)
+        The source datatype (extension) to use for reading the tables.
 
     Returns:
         Dict[str, List[Dict[str, Any]]]
             Data structure which groups related files based on the compartments.
     """
 
+    import os
     import pathlib
 
     from cloudpathlib import AnyPath
 
-    from cytotable.exceptions import NoInputDataException
+    from cytotable.exceptions import DatatypeException, NoInputDataException
     from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader
 
+    if (targets is None or targets == []) and source_datatype is None:
+        raise DatatypeException(
+            f"A source_datatype must be specified when using undefined compartments and metadata names."
+        )
+
     # gathers files from provided path using compartments + metadata as a filter
     sources = [
        # build source_paths for all files
@@ -85,6 +94,7 @@ def _get_source_filepaths(
             # ensure the subpaths meet certain specifications
             if (
                 targets is None
+                or targets == []
                 # checks for name of the file from targets (compartment + metadata names)
                 or str(subpath.stem).lower() in [target.lower() for target in targets]
                 # checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(
 
     # group files together by similar filename for later data operations
     grouped_sources = {}
-    for unique_source in set(source["source_path"].name for source in sources):
-        grouped_sources[unique_source.capitalize()] = [
-            # case for files besides sqlite
-            source if source["source_path"].suffix.lower() != ".sqlite"
-            # if we have sqlite entries, update the source_path to the parent
-            # (the parent table database file) as grouped key name will now
-            # encapsulate the table name details.
-            else {
-                "source_path": source["source_path"].parent,
-                "table_name": source["table_name"],
-            }
-            for source in sources
-            # focus only on entries which include the unique_source name
-            if source["source_path"].name == unique_source
-        ]
+
+    # if we have no targets, create a single group inferred from a common prefix and suffix
+    # note: this may apply for scenarios where no compartments or metadata are
+    # provided as input to CytoTable operations.
+    if targets is None or targets == []:
+        # gather a common prefix to use for the group
+        common_prefix = os.path.commonprefix(
+            [
+                source["source_path"].stem
+                for source in sources
+                if source["source_path"].suffix == f".{source_datatype}"
+            ]
+        )
+        grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+    # otherwise, use the unique names in the paths to determine source grouping
+    else:
+        for unique_source in set(source["source_path"].name for source in sources):
+            grouped_sources[unique_source.capitalize()] = [
+                # case for files besides sqlite
+                source if source["source_path"].suffix.lower() != ".sqlite"
+                # if we have sqlite entries, update the source_path to the parent
+                # (the parent table database file) as grouped key name will now
+                # encapsulate the table name details.
+                else {
+                    "source_path": source["source_path"].parent,
+                    "table_name": source["table_name"],
+                }
+                for source in sources
+                # focus only on entries which include the unique_source name
+                if source["source_path"].name == unique_source
+            ]
 
     return grouped_sources
 
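`os.path.commonprefix` compares strings character by character rather than by path component, which is why it can run directly on file stems to name the single group:

```python
import os

stems = ["experiment_plate1", "experiment_plate2", "experiment_plate3"]
# plain string-prefix comparison, not path-aware
print(os.path.commonprefix(stems))  # 'experiment_plate'
# the group key would then be 'experiment_plate.csv' for source_datatype='csv'
```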
@@ -190,7 +217,7 @@ def _infer_source_datatype(
         raise DatatypeException(
             (
                 f"Unable to find source datatype {source_datatype} "
-                "within files. Detected datatypes: {suffixes}"
+                f"within files. Detected datatypes: {suffixes}"
             )
         )
 
@@ -270,7 +297,9 @@ def _gather_sources(
     source_path = _build_path(path=source_path, **kwargs)
 
     # gather filepaths which will be used as the basis for this work
-    sources = _get_source_filepaths(path=source_path, targets=targets)
+    sources = _get_source_filepaths(
+        path=source_path, targets=targets, source_datatype=source_datatype
+    )
 
     # infer or validate the source datatype based on source filepaths
     source_datatype = _infer_source_datatype(
@@ -3,13 +3,13 @@ Utility functions for CytoTable
 """
 
 import logging
-import multiprocessing
 import os
 import pathlib
-from typing import Any, Dict, Union, cast
+from typing import Any, Dict, Optional, Union, cast
 
 import duckdb
 import parsl
+import pyarrow as pa
 from cloudpathlib import AnyPath, CloudPath
 from cloudpathlib.exceptions import InvalidPrefixError
 from parsl.app.app import AppBase
@@ -19,67 +19,6 @@ from parsl.executors import HighThroughputExecutor
 
 logger = logging.getLogger(__name__)
 
-# read max threads from environment if necessary
-# max threads will be used with default Parsl config and Duckdb
-MAX_THREADS = (
-    multiprocessing.cpu_count()
-    if "CYTOTABLE_MAX_THREADS" not in os.environ
-    else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
-)
-
-# enables overriding default memory mapping behavior with pyarrow memory mapping
-CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
-    os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
-)
-
-DDB_DATA_TYPE_SYNONYMS = {
-    "real": ["float32", "float4", "float"],
-    "double": ["float64", "float8", "numeric", "decimal"],
-    "integer": ["int32", "int4", "int", "signed"],
-    "bigint": ["int64", "int8", "long"],
-}
-
-# A reference dictionary for SQLite affinity and storage class types
-# See more here: https://www.sqlite.org/datatype3.html#affinity_name_examples
-SQLITE_AFFINITY_DATA_TYPE_SYNONYMS = {
-    "integer": [
-        "int",
-        "integer",
-        "tinyint",
-        "smallint",
-        "mediumint",
-        "bigint",
-        "unsigned big int",
-        "int2",
-        "int8",
-    ],
-    "text": [
-        "character",
-        "varchar",
-        "varying character",
-        "nchar",
-        "native character",
-        "nvarchar",
-        "text",
-        "clob",
-    ],
-    "blob": ["blob"],
-    "real": [
-        "real",
-        "double",
-        "double precision",
-        "float",
-    ],
-    "numeric": [
-        "numeric",
-        "decimal",
-        "boolean",
-        "date",
-        "datetime",
-    ],
-}
-
-
 # reference the original init
 original_init = AppBase.__init__
 
@@ -198,6 +137,10 @@ def _duckdb_reader() -> duckdb.DuckDBPyConnection:
         duckdb.DuckDBPyConnection
     """
 
+    import duckdb
+
+    from cytotable.constants import MAX_THREADS
+
     return duckdb.connect().execute(
         # note: we use an f-string here to
         # dynamically configure threads as appropriate
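
The thread count imported here feeds DuckDB's connection settings; a minimal sketch of the pattern the comment describes, assuming a plain `SET threads` statement rather than CytoTable's exact setup string:

```python
import duckdb

MAX_THREADS = 4  # stand-in for cytotable.constants.MAX_THREADS

# execute() returns the connection, so the configured connection can be chained or returned
ddb = duckdb.connect().execute(f"SET threads TO {MAX_THREADS};")
print(ddb.execute("SELECT current_setting('threads');").fetchone())  # (4,)
```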
@@ -252,20 +195,25 @@ def _sqlite_mixed_type_query_to_parquet(
 
     import pyarrow as pa
 
+    from cytotable.constants import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
     from cytotable.exceptions import DatatypeException
-    from cytotable.utils import SQLITE_AFFINITY_DATA_TYPE_SYNONYMS
 
     # open sqlite3 connection
     with sqlite3.connect(source_path) as conn:
         cursor = conn.cursor()
 
-        # gather table column details including datatype
+        # Gather table column details including datatype.
+        # Note: uses SQLite pragma for table information.
+        # See the following for more information:
+        # https://sqlite.org/pragma.html#pragma_table_info
        cursor.execute(
            f"""
            SELECT :table_name as table_name,
                   name as column_name,
                   type as column_type
-            FROM pragma_table_info(:table_name);
+            FROM pragma_table_info(:table_name)
+            /* explicit column ordering by 'cid' */
+            ORDER BY cid ASC;
            """,
            {"table_name": table_name},
        )
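
`pragma_table_info` exposes SQLite's column catalog, and `cid` is the zero-based column id, so ordering by it returns columns in declaration order. A standalone sketch:

```python
import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE example (id INTEGER, name TEXT, score REAL);")
    rows = conn.execute(
        """
        SELECT name AS column_name, type AS column_type
        FROM pragma_table_info('example')
        ORDER BY cid ASC;
        """
    ).fetchall()
    print(rows)  # [('id', 'INTEGER'), ('name', 'TEXT'), ('score', 'REAL')]
```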
@@ -384,6 +332,9 @@ def _arrow_type_cast_if_specified(
         Dict[str, str]
             A potentially data type updated dictionary of column information
     """
+
+    from cytotable.constants import DDB_DATA_TYPE_SYNONYMS
+
     # for casting to new float type
     if "float" in data_type_cast_map.keys() and column["column_dtype"] in [
         "REAL",
@@ -453,3 +404,56 @@ def _expand_path(
         modifed_path = modifed_path.expanduser()
 
     return modifed_path.resolve()
+
+
+def _get_cytotable_version() -> str:
+    """
+    Seeks the current version of CytoTable using either pkg_resources
+    or dunamai to determine the current version being used.
+
+    Returns:
+        str
+            A string representing the version of CytoTable currently being used.
+    """
+
+    try:
+        # attempt to gather the development version from dunamai
+        # for scenarios where cytotable from source is used.
+        import dunamai
+
+        return dunamai.Version.from_any_vcs().serialize()
+    except (RuntimeError, ModuleNotFoundError):
+        # else grab a static version from __init__.py
+        # for scenarios where the built/packaged cytotable is used.
+        import cytotable
+
+        return cytotable.__version__
+
+
+def _write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None:
+    """
+    Adds metadata to parquet output from CytoTable.
+    Note: this mostly wraps pyarrow.parquet.write_table
+    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
+
+    Args:
+        table: pa.Table:
+            Pyarrow table to be serialized as parquet table.
+        **kwargs: Any:
+            kwargs provided to this function roughly align with
+            pyarrow.parquet.write_table. The following might be
+            examples of what to expect here:
+            - where: str or pyarrow.NativeFile
+    """
+
+    from pyarrow import parquet
+
+    from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
+    from cytotable.utils import _get_cytotable_version
+
+    parquet.write_table(
+        table=table.replace_schema_metadata(
+            metadata=CYTOTABLE_DEFAULT_PARQUET_METADATA
+        ),
+        **kwargs,
+    )
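
The metadata stamped by `replace_schema_metadata` lands in the Parquet file's key/value store, where any Arrow reader can recover it. A round-trip sketch (output path illustrative):

```python
import pyarrow as pa
from pyarrow import parquet

table = pa.table({"x": [1, 2, 3]})
parquet.write_table(
    table=table.replace_schema_metadata(
        {"data-producer": "https://github.com/cytomining/CytoTable"}
    ),
    where="example.parquet",
)

# readers recover the tags from the file-level schema metadata
print(parquet.read_schema("example.parquet").metadata)
```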
@@ -1,6 +1,7 @@
 [tool.poetry]
 name = "CytoTable"
-version = "0.0.3"
+# note: version data is maintained by poetry-dynamic-versioning (do not edit)
+version = "0.0.4"
 description = "Transform CellProfiler and DeepProfiler data for processing image-based profiling readouts with Pycytominer and other Cytomining tools."
 authors = ["Cytomining Community"]
 license = "BSD-3-Clause License"
@@ -10,14 +11,25 @@ repository = "https://github.com/cytomining/CytoTable"
 documentation = "https://cytomining.github.io/CytoTable/"
 keywords = ["python", "cellprofiler","single-cell-analysis", "way-lab"]
 
+[tool.poetry-dynamic-versioning]
+enable = false
+style = "pep440"
+vcs = "git"
+
+[build-system]
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
+build-backend = "poetry_dynamic_versioning.backend"
+
+[tool.setuptools_scm]
+
 [tool.poetry.dependencies]
 python = ">=3.8,<3.13"
-pyarrow = "^13.0.0"
+pyarrow = ">=13.0.0"
 cloudpathlib = {extras = ["all"], version = "^0.15.0"}
 duckdb = ">=0.8.0"
 parsl = ">=2023.9.25"
 
-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"
 pytest-cov = "^4.1.0"
 Sphinx = "^6.0.0"
@@ -27,10 +39,7 @@ moto = {extras = ["s3", "server"], version = "^4.0.0"}
 cffconvert = "^2.0.0"
 cytominer-database = "^0.3.4"
 pycytominer = { git = "https://github.com/cytomining/pycytominer.git", rev = "09b2c79aa94908e3520f0931a844db4fba7fd3fb" }
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+dunamai = "^1.19.0"
 
 [tool.vulture]
 min_confidence = 80
@@ -7,10 +7,17 @@ _Diagram showing data flow relative to this project._
 
 ## Summary
 
-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`), and other sources such as IN Carta data output data at scale.
 CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
 The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
 
+The name for the project is inspired from:
+
+- __Cyto__: "1. (biology) cell." ([Wiktionary: Cyto-](https://en.wiktionary.org/wiki/cyto-))
+- __Table__:
+  - "1. Furniture with a top surface to accommodate a variety of uses."
+  - "3.1. A matrix or grid of data arranged in rows and columns." <br> ([Wiktionary: Table](https://en.wiktionary.org/wiki/table))
+
 ## Installation
 
 Install CytoTable from [PyPI](https://pypi.org/) or from source: