tablassert 7.2.1__tar.gz → 7.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {tablassert-7.2.1 → tablassert-7.3.0}/.github/workflows/docker.yml +12 -5
  2. {tablassert-7.2.1 → tablassert-7.3.0}/CHANGELOG.md +16 -0
  3. {tablassert-7.2.1 → tablassert-7.3.0}/CITATION.cff +1 -1
  4. {tablassert-7.2.1 → tablassert-7.3.0}/Dockerfile +1 -1
  5. {tablassert-7.2.1 → tablassert-7.3.0}/PKG-INFO +2 -2
  6. tablassert-7.3.0/docs/api/lib.md +236 -0
  7. {tablassert-7.2.1 → tablassert-7.3.0}/docs/docker.md +2 -2
  8. {tablassert-7.2.1 → tablassert-7.3.0}/mkdocs.yml +1 -0
  9. {tablassert-7.2.1 → tablassert-7.3.0}/pyproject.toml +2 -2
  10. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/lib.py +32 -3
  11. {tablassert-7.2.1 → tablassert-7.3.0}/uv.lock +1 -1
  12. {tablassert-7.2.1 → tablassert-7.3.0}/.github/workflows/autotag.yml +0 -0
  13. {tablassert-7.2.1 → tablassert-7.3.0}/.github/workflows/docs.yml +0 -0
  14. {tablassert-7.2.1 → tablassert-7.3.0}/.github/workflows/pipy.yml +0 -0
  15. {tablassert-7.2.1 → tablassert-7.3.0}/.gitignore +0 -0
  16. {tablassert-7.2.1 → tablassert-7.3.0}/.pre-commit-config.yaml +0 -0
  17. {tablassert-7.2.1 → tablassert-7.3.0}/AGENTS.md +0 -0
  18. {tablassert-7.2.1 → tablassert-7.3.0}/CONTRIBUTING.md +0 -0
  19. {tablassert-7.2.1 → tablassert-7.3.0}/LICENSE +0 -0
  20. {tablassert-7.2.1 → tablassert-7.3.0}/README.md +0 -0
  21. {tablassert-7.2.1 → tablassert-7.3.0}/docs/api/fullmap.md +0 -0
  22. {tablassert-7.2.1 → tablassert-7.3.0}/docs/api/qc.md +0 -0
  23. {tablassert-7.2.1 → tablassert-7.3.0}/docs/api/utils.md +0 -0
  24. {tablassert-7.2.1 → tablassert-7.3.0}/docs/cli.md +0 -0
  25. {tablassert-7.2.1 → tablassert-7.3.0}/docs/configuration/advanced-example.md +0 -0
  26. {tablassert-7.2.1 → tablassert-7.3.0}/docs/configuration/graph.md +0 -0
  27. {tablassert-7.2.1 → tablassert-7.3.0}/docs/configuration/table.md +0 -0
  28. {tablassert-7.2.1 → tablassert-7.3.0}/docs/datassert.md +0 -0
  29. {tablassert-7.2.1 → tablassert-7.3.0}/docs/examples/tutorial-data.csv +0 -0
  30. {tablassert-7.2.1 → tablassert-7.3.0}/docs/examples/tutorial-graph.yaml +0 -0
  31. {tablassert-7.2.1 → tablassert-7.3.0}/docs/examples/tutorial-table.yaml +0 -0
  32. {tablassert-7.2.1 → tablassert-7.3.0}/docs/examples.md +0 -0
  33. {tablassert-7.2.1 → tablassert-7.3.0}/docs/index.md +0 -0
  34. {tablassert-7.2.1 → tablassert-7.3.0}/docs/installation.md +0 -0
  35. {tablassert-7.2.1 → tablassert-7.3.0}/docs/tutorial.md +0 -0
  36. {tablassert-7.2.1 → tablassert-7.3.0}/llms.txt +0 -0
  37. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/__init__.py +0 -0
  38. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/cli.py +0 -0
  39. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/downloader.py +0 -0
  40. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/enums.py +0 -0
  41. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/fullmap.py +0 -0
  42. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/ingests.py +0 -0
  43. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/log.py +0 -0
  44. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/models.py +0 -0
  45. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/nlp.py +0 -0
  46. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/qc.py +0 -0
  47. {tablassert-7.2.1 → tablassert-7.3.0}/src/tablassert/utils.py +0 -0
  48. {tablassert-7.2.1 → tablassert-7.3.0}/tests/__init__.py +0 -0
  49. {tablassert-7.2.1 → tablassert-7.3.0}/tests/conftest.py +0 -0
  50. {tablassert-7.2.1 → tablassert-7.3.0}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
  51. {tablassert-7.2.1 → tablassert-7.3.0}/tests/fixtures/minimal_section.yaml +0 -0
  52. {tablassert-7.2.1 → tablassert-7.3.0}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
  53. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_enums.py +0 -0
  54. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_fullmap.py +0 -0
  55. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_ingests.py +0 -0
  56. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_lib.py +0 -0
  57. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_models.py +0 -0
  58. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_nlp.py +0 -0
  59. {tablassert-7.2.1 → tablassert-7.3.0}/tests/test_utils.py +0 -0
@@ -1,9 +1,13 @@
1
1
  name: Publish Docker Image
2
2
  on:
3
3
  workflow_dispatch:
4
- push:
5
- tags:
6
- - "v*"
4
+ workflow_run:
5
+ workflows:
6
+ - "Auto Tag Versions"
7
+ types:
8
+ - completed
9
+ branches:
10
+ - main
7
11
  jobs:
8
12
  publish:
9
13
  runs-on: ubuntu-latest
@@ -12,6 +16,9 @@ jobs:
12
16
  packages: write
13
17
  steps:
14
18
  - uses: actions/checkout@v4
19
+ - name: Get version from pyproject.toml
20
+ id: version
21
+ run: echo "version=v$(grep -m1 'version = "' pyproject.toml | cut -d'"' -f2)" >> "$GITHUB_OUTPUT"
15
22
  - uses: docker/setup-buildx-action@v3
16
23
  - uses: docker/login-action@v3
17
24
  with:
@@ -24,5 +31,5 @@ jobs:
24
31
  file: ./Dockerfile
25
32
  push: true
26
33
  tags: |
27
- ghcr.io/${{ github.repository_owner }}/tablassert:latest
28
- ghcr.io/${{ github.repository_owner }}/tablassert:${{ github.ref_name }}
34
+ ghcr.io/skyeav/tablassert:latest
35
+ ghcr.io/skyeav/tablassert:${{ steps.version.outputs.version }}
@@ -2,6 +2,22 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.3.0 - 2026-04-03
6
+
7
+ ### New Features
8
+ - Added `resolve_many()` to `lib` module — a standalone batch entity resolution function that resolves an iterable of text strings to CURIEs without requiring manual LazyFrame setup, NLP preprocessing, or DuckDB connection management.
9
+
10
+ ### Documentation
11
+ - Added detailed API reference page for `resolve_many()` covering function signature, parameters, return value, usage examples, and integration notes.
12
+
13
+ ## 7.2.2 - 2026-04-01
14
+
15
+ ### Bug Fixes
16
+ - Fixed Docker publish workflow failing due to mixed-case repository owner in image tags. Hardcoded lowercase `ghcr.io/skyeav/tablassert` and switched trigger to run after autotag completion.
17
+
18
+ ### Maintenance
19
+ - Updated PyPI short description.
20
+
5
21
  ## 7.2.1 - 2026-04-01
6
22
 
7
23
  ### Maintenance
@@ -2,7 +2,7 @@ cff-version: 1.2.0
2
2
  message: "If you use Tablassert, please cite it as below."
3
3
  type: software
4
4
  title: Tablassert
5
- version: 7.2.1
5
+ version: 7.2.2
6
6
  license: Apache-2.0
7
7
  repository-code: https://github.com/SkyeAv/Tablassert
8
8
  abstract: Tablassert is a highly performant declarative knowledge graph backend for bioinformatics that extracts knowledge assertions from tabular data, performs entity resolution and data quality control, and exports NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
@@ -1,6 +1,6 @@
1
1
  FROM python:3.14-slim
2
2
 
3
- RUN pip install --no-cache-dir "tablassert[full]"
3
+ RUN pip install --no-cache-dir "tablassert"
4
4
 
5
5
  ENTRYPOINT ["tablassert"]
6
6
  CMD ["--help"]
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.2.1
4
- Summary: Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
3
+ Version: 7.3.0
4
+ Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON declaratively, with entity resolution and quality control built in.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
7
7
  Project-URL: Documentation, https://skyeav.github.io/Tablassert/
@@ -0,0 +1,236 @@
1
+ # Batch Resolution (lib)
2
+
3
+ The `lib` module exposes `resolve_many()`, a high-level convenience function for resolving an iterable of entity strings to CURIEs without requiring manual LazyFrame construction, NLP preprocessing, or DuckDB shard management.
4
+
5
+ It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 16 DuckDB shard connections, executing entity resolution, and returning results as a plain Python dictionary.
6
+
7
+ ## resolve_many()
8
+
9
+ Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a dictionary of lists.
10
+
11
+ ### Function Signature
12
+
13
+ ```python
14
+ def resolve_many(
15
+ col: str,
16
+ entities: Iterable[str],
17
+ datassert: Path,
18
+ taxon: Optional[str] = None,
19
+ prioritize: Optional[list[Categories]] = None,
20
+ avoid: Optional[list[Categories]] = None,
21
+ column_context: bool = True,
22
+ ) -> dict[str, list[str]]
23
+ ```
24
+
25
+ ### Parameters
26
+
27
+ **`col: str`**
28
+
29
+ Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in the returned dictionary.
30
+
31
+ For example, if `col="gene"`, the returned dictionary will contain keys like `"gene"`, `"gene name"`, `"gene category"`, etc.
32
+
33
+ **`entities: Iterable[str]`**
34
+
35
+ An iterable of text strings to resolve. Each string is treated as a candidate entity name that will be normalized and matched against the datassert synonym database. Accepts any iterable — lists, tuples, generators, sets, etc.
36
+
37
+ Examples: `["TP53", "BRCA1", "EGFR"]`, `("aspirin", "ibuprofen")`, or a generator expression.
38
+
39
+ **`datassert: Path`**
40
+
41
+ Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 16 DuckDB shard files (`0.duckdb` through `15.duckdb`).
42
+
43
+ Each shard contains:
44
+ - Synonym mappings (text → CURIE)
45
+ - Preferred entity names
46
+ - Biolink categories
47
+ - NCBI Taxon IDs
48
+ - Source databases and versions
49
+
50
+ **`taxon: Optional[str]` (default: `None`)**
51
+
52
+ Optional NCBI Taxon ID for filtering results to a specific organism.
53
+
54
+ Example: `"9606"` restricts matches to human-specific entities. When `None`, no taxon filtering is applied and matches from all organisms are returned.
55
+
56
+ **`prioritize: Optional[list[Categories]]` (default: `None`)**
57
+
58
+ Optional list of Biolink categories to prefer when multiple matches exist for the same input term. Categories listed here receive higher ranking scores during resolution.
59
+
60
+ Example: `[Categories.Gene, Categories.Protein]` prefers gene and protein mappings over other categories like diseases or chemicals.
61
+
62
+ **`avoid: Optional[list[Categories]]` (default: `None`)**
63
+
64
+ Optional list of Biolink categories to exclude from results entirely. Any match belonging to an avoided category is filtered out before ranking.
65
+
66
+ Example: `[Categories.Gene]` prevents gene mappings from appearing in the output, even if they would otherwise be the best match.
67
+
68
+ **`column_context: bool` (default: `True`)**
69
+
70
+ Controls category-frequency tie-breaking when multiple matches exist for a term. When `True`, the resolution query adds a category frequency score and prefers the category that appears most frequently across all terms in the batch. When `False`, frequency-based tie-breaking is disabled.
71
+
72
+ This is useful when resolving a column of related entities (e.g., all genes) — the shared context helps disambiguate terms that map to multiple categories.
73
+
74
+ ### Return Value
75
+
76
+ Returns a `dict[str, list[str]]` where each key is a column name and each value is a list of resolved values. The dictionary is produced by calling `polars.DataFrame.to_dict(as_series=False)` on the collected resolution output.
77
+
78
+ The returned dictionary contains the following keys (where `{col}` is the value of the `col` parameter):
79
+
80
+ | Key | Description | Example Value |
81
+ |-----|-------------|---------------|
82
+ | `{col}` | CURIE identifier | `"HGNC:11998"` |
83
+ | `{col} name` | Preferred entity name | `"TP53"` |
84
+ | `{col} category` | Biolink category (prefixed) | `"biolink:Gene"` |
85
+ | `{col} taxon` | NCBI Taxon ID (prefixed) | `"NCBITaxon:9606"` |
86
+ | `{col} source` | Source database | `"HGNC"` |
87
+ | `{col} source version` | Database version | `"2025-01"` |
88
+ | `{col} nlp level` | NLP processing level used for match | `0` or `1` |
89
+
90
+ **Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned lists may therefore be shorter than the input iterable.
91
+
92
+ ### Pipeline Internals
93
+
94
+ `resolve_many()` executes the following steps internally:
95
+
96
+ 1. **Series construction** — Wraps the input iterable in a `pl.Series` with the given column name, then converts to a single-column `pl.LazyFrame`.
97
+
98
+ 2. **NLP normalization** — Applies `level_one()` (whitespace stripping + lowercasing) and `level_two()` (non-word character removal via `\W+`) to produce the two normalized columns required by `resolve()`.
99
+
100
+ 3. **DuckDB connection management** — Opens all 16 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
101
+
102
+ 4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
103
+
104
+ 5. **Collection and conversion** — Collects the lazy result into an eager `pl.DataFrame` and converts to a Python dictionary via `to_dict(as_series=False)`.
105
+
106
+ ### Example Usage
107
+
108
+ #### Basic Gene Resolution
109
+
110
+ ```python
111
+ from pathlib import Path
112
+ from tablassert.lib import resolve_many
113
+ from tablassert.enums import Categories
114
+
115
+ datassert: Path = Path("/path/to/datassert")
116
+
117
+ result: dict[str, list[str]] = resolve_many(
118
+ col="gene",
119
+ entities=["TP53", "BRCA1", "EGFR", "KRAS"],
120
+ datassert=datassert,
121
+ taxon="9606",
122
+ prioritize=[Categories.Gene],
123
+ )
124
+
125
+ # result["gene"] → ["HGNC:11998", "HGNC:1100", ...]
126
+ # result["gene name"] → ["TP53", "BRCA1", ...]
127
+ # result["gene category"] → ["biolink:Gene", "biolink:Gene", ...]
128
+ ```
129
+
130
+ #### Disease Resolution With Category Avoidance
131
+
132
+ ```python
133
+ from pathlib import Path
134
+ from tablassert.lib import resolve_many
135
+ from tablassert.enums import Categories
136
+
137
+ datassert: Path = Path("/path/to/datassert")
138
+
139
+ result: dict[str, list[str]] = resolve_many(
140
+ col="disease",
141
+ entities=["diabetes mellitus", "breast cancer", "alzheimer disease"],
142
+ datassert=datassert,
143
+ avoid=[Categories.Gene, Categories.Protein],
144
+ )
145
+
146
+ # result["disease"] → ["MONDO:0005015", ...]
147
+ # result["disease name"] → ["diabetes mellitus", ...]
148
+ # result["disease category"] → ["biolink:Disease", ...]
149
+ ```
150
+
151
+ #### Chemical Resolution Without Column Context
152
+
153
+ ```python
154
+ from pathlib import Path
155
+ from tablassert.lib import resolve_many
156
+
157
+ datassert: Path = Path("/path/to/datassert")
158
+
159
+ result: dict[str, list[str]] = resolve_many(
160
+ col="chemical",
161
+ entities=["aspirin", "metformin", "ibuprofen"],
162
+ datassert=datassert,
163
+ column_context=False,
164
+ )
165
+ ```
166
+
167
+ #### Consuming Results
168
+
169
+ ```python
170
+ import polars as pl
171
+ from pathlib import Path
172
+ from tablassert.lib import resolve_many
173
+
174
+ datassert: Path = Path("/path/to/datassert")
175
+
176
+ result: dict[str, list[str]] = resolve_many(
177
+ col="gene",
178
+ entities=["TP53", "BRCA1"],
179
+ datassert=datassert,
180
+ taxon="9606",
181
+ )
182
+
183
+ # Convert back to a Polars DataFrame
184
+ df: pl.DataFrame = pl.DataFrame(result)
185
+
186
+ # Or iterate over resolved pairs
187
+ for curie, name in zip(result["gene"], result["gene name"]):
188
+ print(f"{name} → {curie}")
189
+ ```
190
+
191
+ ### Comparison With resolve()
192
+
193
+ | Aspect | `resolve_many()` | `resolve()` |
194
+ |--------|-------------------|-------------|
195
+ | **Module** | `tablassert.lib` | `tablassert.fullmap` |
196
+ | **Input** | Plain iterable of strings | Pre-normalized `pl.LazyFrame` |
197
+ | **NLP** | Applied automatically | Must be applied upstream |
198
+ | **Connections** | Managed internally via `ExitStack` | Must be opened externally |
199
+ | **Output** | `dict[str, list[str]]` | `pl.LazyFrame` |
200
+ | **Logging** | Uses default (`log=True`) | Configurable |
201
+ | **Context params** | Not exposed (`section_hash`, `config_file`, `tag`) | Fully configurable |
202
+ | **Use case** | Standalone batch lookups, scripting, notebooks | Internal pipeline integration |
203
+
204
+ `resolve_many()` is designed for ad-hoc and programmatic use — scripts, notebooks, and one-off lookups. For pipeline integration where you need full control over logging, context metadata, and lazy evaluation, use `resolve()` directly.
205
+
206
+ ### NLP Processing
207
+
208
+ `resolve_many()` applies both NLP normalization levels before resolution:
209
+
210
+ **Level one** — `level_one(lf, col)`:
211
+ - Strips leading/trailing whitespace
212
+ - Converts to lowercase
213
+ - Output column: `{col}` (overwrites the original)
214
+
215
+ **Level two** — `level_two(lf, col)`:
216
+ - Removes all non-word characters (`\W+` → `""`) from the level-one result
217
+ - Output column: `{col} two`
218
+
219
+ Both levels are queried during resolution. Level one (exact case-insensitive match) is preferred; level two is used as a fallback for terms with punctuation or special characters.
220
+
221
+ ### Error Handling
222
+
223
+ - If the `datassert` path does not contain the expected shard files, `duckdb.connect()` will raise an `IOException`.
224
+ - If `entities` is empty, the function returns a dictionary with empty lists for all output columns.
225
+ - The `ExitStack` ensures all 16 DuckDB connections are closed even if resolution raises an exception.
226
+ - Unresolved entities are silently filtered from the output (logged at INFO level by default via `resolve()`).
227
+
228
+ ## Integration
229
+
230
+ `resolve_many()` is a self-contained entry point. It does not require any prior setup beyond having a datassert database available. For full pipeline builds, use the CLI (`tablassert build-knowledge-graph`) which orchestrates resolution through the `Tcode` class.
231
+
232
+ ## Next Steps
233
+
234
+ - **[Entity Resolution](fullmap.md)** — Lower-level `resolve()` function details
235
+ - **[Quality Control](qc.md)** — Multi-stage validation of resolved entities
236
+ - **[Configuration](../configuration/table.md)** — YAML-driven entity resolution settings
@@ -10,7 +10,7 @@ The image is based on `python:3.14-slim` with the Tablassert CLI as the entrypoi
10
10
  docker pull ghcr.io/skyeav/tablassert:latest
11
11
  ```
12
12
 
13
- Version-pinned tags match the git tag (e.g., `ghcr.io/skyeav/tablassert:v7.2.1`).
13
+ Version-pinned tags match the git tag (e.g., `ghcr.io/skyeav/tablassert:v7.2.2`).
14
14
 
15
15
  ## Quick Start
16
16
 
@@ -87,4 +87,4 @@ docker run --rm \
87
87
 
88
88
  ## CI/CD Integration
89
89
 
90
- Images are built by `.github/workflows/docker.yml`, which triggers on tag pushes (after autotag and PyPI publish complete). Tags match the repository version tag (e.g., `v7.2.1`).
90
+ Images are built by `.github/workflows/docker.yml`, which runs after the "Auto Tag Versions" workflow completes successfully on `main`. Image tags match the version declared in `pyproject.toml` (e.g., `v7.2.2`).
@@ -14,6 +14,7 @@ nav:
14
14
  - Advanced Example: configuration/advanced-example.md
15
15
  - API Reference:
16
16
  - Entity Resolution: api/fullmap.md
17
+ - Batch Resolution: api/lib.md
17
18
  - Quality Control: api/qc.md
18
19
  - Utilities: api/utils.md
19
20
  - Changelog: ../CHANGELOG.md
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.2.1"
4
- description = "Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON."
3
+ version = "7.3.0"
4
+ description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON declaratively, with entity resolution and quality control built in."
5
5
  authors = [
6
6
  { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
7
7
  ]
@@ -2,6 +2,8 @@ from __future__ import annotations
2
2
 
3
3
  import math
4
4
  import operator
5
+ from collections.abc import Iterable
6
+ from contextlib import ExitStack
5
7
  from functools import reduce
6
8
  from operator import add, eq, le
7
9
  from os.path import basename
@@ -13,11 +15,11 @@ from pydantic import Field, NonNegativeInt, PositiveInt
13
15
  from sqlite_utils import Database
14
16
 
15
17
  from tablassert.downloader import from_url
16
- from tablassert.enums import EncodingMethods, Files, Tokens
17
- from tablassert.fullmap import resolve
18
+ from tablassert.enums import Categories, EncodingMethods, Files, Tokens
19
+ from tablassert.fullmap import SHARDS, resolve
18
20
  from tablassert.log import logger
19
- from tablassert.nlp import level_one, level_two
20
21
  from tablassert.models import Encoding, NodeEncoding, Section
22
+ from tablassert.nlp import level_one, level_two
21
23
  from tablassert.qc import fullmap_audit
22
24
  from tablassert.utils import namespace_uuid
23
25
 
@@ -475,3 +477,30 @@ def compile_graph(subgraphs: list[Path], name: str, version: str, fmt: str = "mi
475
477
 
476
478
  dedup_stream(e, is_edges=True)
477
479
  dedup_stream(n, is_edges=False)
480
+
481
+
482
def resolve_many(
    col: str,
    entities: Iterable[str],
    datassert: Path,
    taxon: Optional[str] = None,
    prioritize: Optional[list[Categories]] = None,
    avoid: Optional[list[Categories]] = None,
    column_context: bool = True,
) -> dict[str, list[str]]:
    """Resolve an iterable of entity strings to CURIEs in a single call.

    Convenience wrapper around the lower-level resolve() pipeline: builds a
    single-column LazyFrame from *entities*, applies level_one and level_two
    NLP normalization, opens every DuckDB shard read-only, resolves, and
    returns the result as a plain dictionary of lists.

    Args:
        col: Column name used to label the series; output keys derive from
            it (e.g. f"{col} name", f"{col} category").
        entities: Any iterable of candidate entity strings (list, tuple,
            generator, set, ...).
        datassert: Root of the datassert database; shard files are expected
            at datassert / "data" / f"{x}.duckdb" for x in range(SHARDS).
        taxon: Optional NCBI Taxon ID filter (e.g. "9606").
        prioritize: Biolink categories to rank higher on ambiguous matches.
        avoid: Biolink categories to exclude from results entirely.
        column_context: Enable category-frequency tie-breaking across the
            whole batch.

    Returns:
        dict[str, list[str]] from DataFrame.to_dict(as_series=False).
        Entities that do not resolve are dropped by resolve(), so the
        returned lists may be shorter than the input.
    """
    # Materialize the iterable and pin a String dtype: an empty input would
    # otherwise infer a Null dtype and break the string expressions applied
    # by level_one/level_two.
    series: pl.Series = pl.Series(col, list(entities), dtype=pl.String)
    lf: pl.LazyFrame = series.to_frame().lazy()

    lf = level_one(lf, col)
    lf = level_two(lf, col)

    with ExitStack() as stack:
        # Open every shard read-only; ExitStack guarantees all connections
        # are closed even if resolution raises.
        conns: list[duckdb.DuckDBPyConnection] = [
            stack.enter_context(
                duckdb.connect(datassert / "data" / f"{x}.duckdb", read_only=True)
            )
            for x in range(SHARDS)
        ]

        lf = resolve(
            lf,
            col,
            conns,
            taxon=taxon,
            prioritize=prioritize,
            avoid=avoid,
            column_context=column_context,
        )

        # Collect while the shard connections are still open — the lazy plan
        # reads from DuckDB only at collection time.
        df: pl.DataFrame = lf.collect()

    return df.to_dict(as_series=False)
@@ -2211,7 +2211,7 @@ wheels = [
2211
2211
 
2212
2212
  [[package]]
2213
2213
  name = "tablassert"
2214
- version = "7.2.1"
2214
+ version = "7.3.0"
2215
2215
  source = { editable = "." }
2216
2216
  dependencies = [
2217
2217
  { name = "duckdb" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes