tablassert 7.3.0__tar.gz → 7.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.3.0 → tablassert-7.3.2}/CHANGELOG.md +14 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/Dockerfile +2 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/PKG-INFO +16 -2
- {tablassert-7.3.0 → tablassert-7.3.2}/README.md +14 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/api/lib.md +22 -19
- {tablassert-7.3.0 → tablassert-7.3.2}/pyproject.toml +2 -2
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/lib.py +3 -2
- {tablassert-7.3.0 → tablassert-7.3.2}/.github/workflows/autotag.yml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/.github/workflows/docker.yml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/.github/workflows/docs.yml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/.gitignore +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/.pre-commit-config.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/AGENTS.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/CITATION.cff +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/CONTRIBUTING.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/LICENSE +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/api/fullmap.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/api/qc.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/api/utils.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/cli.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/configuration/advanced-example.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/configuration/graph.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/configuration/table.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/datassert.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/docker.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/examples.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/index.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/installation.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/docs/tutorial.md +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/llms.txt +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/mkdocs.yml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/__init__.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/cli.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/downloader.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/enums.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/fullmap.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/ingests.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/log.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/models.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/nlp.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/qc.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/src/tablassert/utils.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/__init__.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/conftest.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/fixtures/minimal_section.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_enums.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_fullmap.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_ingests.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_lib.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_models.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_nlp.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/tests/test_utils.py +0 -0
- {tablassert-7.3.0 → tablassert-7.3.2}/uv.lock +0 -0
|
@@ -2,6 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## 7.3.2 - 2026-04-03
|
|
6
|
+
|
|
7
|
+
### Maintenance
|
|
8
|
+
- Updated dependencies. No API changes.
|
|
9
|
+
|
|
10
|
+
## 7.3.1 - 2026-04-03
|
|
11
|
+
|
|
12
|
+
### Changes
|
|
13
|
+
- Changed `resolve_many()` return type from `dict[str, list[str]]` to `list[dict[str, Any]]` — each resolved entity is now a row dictionary, produced via `to_dicts()`.
|
|
14
|
+
- `resolve_many()` now preserves the original input text in an `original {col}` key on each result row.
|
|
15
|
+
|
|
16
|
+
### Documentation
|
|
17
|
+
- Updated `resolve_many()` API reference to match the current function signature, return type, and output format.
|
|
18
|
+
|
|
5
19
|
## 7.3.0 - 2026-04-03
|
|
6
20
|
|
|
7
21
|
### New Features
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.3.0
|
|
3
|
+
Version: 7.3.2
|
|
4
4
|
Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
6
|
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
@@ -42,7 +42,7 @@ Requires-Dist: rapidfuzz>=3.14.3
|
|
|
42
42
|
Requires-Dist: scikit-learn>=1.8.0
|
|
43
43
|
Requires-Dist: sentence-transformers>=5.3.0
|
|
44
44
|
Requires-Dist: sqlite-utils>=3.39
|
|
45
|
-
Requires-Dist: typer>=0.
|
|
45
|
+
Requires-Dist: typer>=0.21.2
|
|
46
46
|
Requires-Dist: xxhash>=3.6.0
|
|
47
47
|
Provides-Extra: rt
|
|
48
48
|
Requires-Dist: polars[rtcompat]>=1.39.0; extra == 'rt'
|
|
@@ -93,6 +93,20 @@ docker run --rm \
|
|
|
93
93
|
|
|
94
94
|
</details>
|
|
95
95
|
|
|
96
|
+
## Quick Demo
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Build a knowledge graph from a YAML configuration
|
|
100
|
+
$ tablassert build-knowledge-graph graph-config.yaml
|
|
101
|
+
⠋ Loading table configurations...
|
|
102
|
+
⠋ Resolving entities across 16 DuckDB shards...
|
|
103
|
+
⠋ Compiling subgraphs...
|
|
104
|
+
⠋ Deduplicating nodes and edges...
|
|
105
|
+
✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
109
|
+
|
|
96
110
|
## Key Features
|
|
97
111
|
|
|
98
112
|
- **Declarative Configuration** — YAML-based, no code required
|
|
@@ -41,6 +41,20 @@ docker run --rm \
|
|
|
41
41
|
|
|
42
42
|
</details>
|
|
43
43
|
|
|
44
|
+
## Quick Demo
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Build a knowledge graph from a YAML configuration
|
|
48
|
+
$ tablassert build-knowledge-graph graph-config.yaml
|
|
49
|
+
⠋ Loading table configurations...
|
|
50
|
+
⠋ Resolving entities across 16 DuckDB shards...
|
|
51
|
+
⠋ Compiling subgraphs...
|
|
52
|
+
⠋ Deduplicating nodes and edges...
|
|
53
|
+
✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
57
|
+
|
|
44
58
|
## Key Features
|
|
45
59
|
|
|
46
60
|
- **Declarative Configuration** — YAML-based, no code required
|
|
@@ -19,7 +19,7 @@ def resolve_many(
|
|
|
19
19
|
prioritize: Optional[list[Categories]] = None,
|
|
20
20
|
avoid: Optional[list[Categories]] = None,
|
|
21
21
|
column_context: bool = True,
|
|
22
|
-
) -> dict[str, list[str]]
|
|
22
|
+
) -> list[dict[str, Any]]
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
### Parameters
|
|
@@ -73,12 +73,13 @@ This is useful when resolving a column of related entities (e.g., all genes) —
|
|
|
73
73
|
|
|
74
74
|
### Return Value
|
|
75
75
|
|
|
76
|
-
Returns a `dict[str,
|
|
76
|
+
Returns a `list[dict[str, Any]]` — one dictionary per resolved entity. The list is produced by calling `polars.DataFrame.to_dicts()` on the collected resolution output.
|
|
77
77
|
|
|
78
|
-
|
|
78
|
+
Each dictionary contains the following keys (where `{col}` is the value of the `col` parameter):
|
|
79
79
|
|
|
80
80
|
| Key | Description | Example Value |
|
|
81
81
|
|-----|-------------|---------------|
|
|
82
|
+
| `original {col}` | Original input text before normalization | `"TP53"` |
|
|
82
83
|
| `{col}` | CURIE identifier | `"HGNC:11998"` |
|
|
83
84
|
| `{col} name` | Preferred entity name | `"TP53"` |
|
|
84
85
|
| `{col} category` | Biolink category (prefixed) | `"biolink:Gene"` |
|
|
@@ -87,7 +88,7 @@ The returned dictionary contains the following keys (where `{col}` is the value
|
|
|
87
88
|
| `{col} source version` | Database version | `"2025-01"` |
|
|
88
89
|
| `{col} nlp level` | NLP processing level used for match | `0` or `1` |
|
|
89
90
|
|
|
90
|
-
**Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned
|
|
91
|
+
**Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned list may therefore be shorter than the input iterable.
|
|
91
92
|
|
|
92
93
|
### Pipeline Internals
|
|
93
94
|
|
|
@@ -101,7 +102,7 @@ The returned dictionary contains the following keys (where `{col}` is the value
|
|
|
101
102
|
|
|
102
103
|
4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
|
|
103
104
|
|
|
104
|
-
5. **Collection and conversion** — Collects the lazy result into an eager `pl.DataFrame` and converts to a
|
|
105
|
+
5. **Collection and conversion** — Collects the lazy result into an eager `pl.DataFrame` and converts to a list of row dictionaries via `to_dicts()`.
|
|
105
106
|
|
|
106
107
|
### Example Usage
|
|
107
108
|
|
|
@@ -109,12 +110,13 @@ The returned dictionary contains the following keys (where `{col}` is the value
|
|
|
109
110
|
|
|
110
111
|
```python
|
|
111
112
|
from pathlib import Path
|
|
113
|
+
from typing import Any
|
|
112
114
|
from tablassert.lib import resolve_many
|
|
113
115
|
from tablassert.enums import Categories
|
|
114
116
|
|
|
115
117
|
datassert: Path = Path("/path/to/datassert")
|
|
116
118
|
|
|
117
|
-
result: dict[str, list[str]] = resolve_many(
|
|
119
|
+
result: list[dict[str, Any]] = resolve_many(
|
|
118
120
|
col="gene",
|
|
119
121
|
entities=["TP53", "BRCA1", "EGFR", "KRAS"],
|
|
120
122
|
datassert=datassert,
|
|
@@ -122,41 +124,41 @@ result: dict[str, list[str]] = resolve_many(
|
|
|
122
124
|
prioritize=[Categories.Gene],
|
|
123
125
|
)
|
|
124
126
|
|
|
125
|
-
# result["gene"
|
|
126
|
-
# result["gene
|
|
127
|
-
# result["gene category"] → ["biolink:Gene", "biolink:Gene", ...]
|
|
127
|
+
# result[0] → {"original gene": "TP53", "gene": "HGNC:11998", "gene name": "TP53", ...}
|
|
128
|
+
# result[1] → {"original gene": "BRCA1", "gene": "HGNC:1100", "gene name": "BRCA1", ...}
|
|
128
129
|
```
|
|
129
130
|
|
|
130
131
|
#### Disease Resolution With Category Avoidance
|
|
131
132
|
|
|
132
133
|
```python
|
|
133
134
|
from pathlib import Path
|
|
135
|
+
from typing import Any
|
|
134
136
|
from tablassert.lib import resolve_many
|
|
135
137
|
from tablassert.enums import Categories
|
|
136
138
|
|
|
137
139
|
datassert: Path = Path("/path/to/datassert")
|
|
138
140
|
|
|
139
|
-
result: dict[str, list[str]] = resolve_many(
|
|
141
|
+
result: list[dict[str, Any]] = resolve_many(
|
|
140
142
|
col="disease",
|
|
141
143
|
entities=["diabetes mellitus", "breast cancer", "alzheimer disease"],
|
|
142
144
|
datassert=datassert,
|
|
143
145
|
avoid=[Categories.Gene, Categories.Protein],
|
|
144
146
|
)
|
|
145
147
|
|
|
146
|
-
# result["disease"
|
|
147
|
-
# result["disease name"
|
|
148
|
-
# result["disease category"] → ["biolink:Disease", ...]
|
|
148
|
+
# result[0] → {"original disease": "diabetes mellitus", "disease": "MONDO:0005015", ...}
|
|
149
|
+
# result[1] → {"original disease": "breast cancer", "disease name": "breast cancer", ...}
|
|
149
150
|
```
|
|
150
151
|
|
|
151
152
|
#### Chemical Resolution Without Column Context
|
|
152
153
|
|
|
153
154
|
```python
|
|
154
155
|
from pathlib import Path
|
|
156
|
+
from typing import Any
|
|
155
157
|
from tablassert.lib import resolve_many
|
|
156
158
|
|
|
157
159
|
datassert: Path = Path("/path/to/datassert")
|
|
158
160
|
|
|
159
|
-
result: dict[str, list[str]] = resolve_many(
|
|
161
|
+
result: list[dict[str, Any]] = resolve_many(
|
|
160
162
|
col="chemical",
|
|
161
163
|
entities=["aspirin", "metformin", "ibuprofen"],
|
|
162
164
|
datassert=datassert,
|
|
@@ -169,11 +171,12 @@ result: dict[str, list[str]] = resolve_many(
|
|
|
169
171
|
```python
|
|
170
172
|
import polars as pl
|
|
171
173
|
from pathlib import Path
|
|
174
|
+
from typing import Any
|
|
172
175
|
from tablassert.lib import resolve_many
|
|
173
176
|
|
|
174
177
|
datassert: Path = Path("/path/to/datassert")
|
|
175
178
|
|
|
176
|
-
result: dict[str, list[str]] = resolve_many(
|
|
179
|
+
result: list[dict[str, Any]] = resolve_many(
|
|
177
180
|
col="gene",
|
|
178
181
|
entities=["TP53", "BRCA1"],
|
|
179
182
|
datassert=datassert,
|
|
@@ -183,9 +186,9 @@ result: dict[str, list[str]] = resolve_many(
|
|
|
183
186
|
# Convert back to a Polars DataFrame
|
|
184
187
|
df: pl.DataFrame = pl.DataFrame(result)
|
|
185
188
|
|
|
186
|
-
# Or iterate over resolved
|
|
187
|
-
for curie, name in zip(result["gene"], result["gene name"]):
|
|
188
|
-
print(f"{name} → {curie}")
|
|
189
|
+
# Or iterate over resolved rows
|
|
190
|
+
for row in result:
|
|
191
|
+
print(f"{row['gene name']} → {row['gene']}")
|
|
189
192
|
```
|
|
190
193
|
|
|
191
194
|
### Comparison With resolve()
|
|
@@ -196,7 +199,7 @@ for curie, name in zip(result["gene"], result["gene name"]):
|
|
|
196
199
|
| **Input** | Plain iterable of strings | Pre-normalized `pl.LazyFrame` |
|
|
197
200
|
| **NLP** | Applied automatically | Must be applied upstream |
|
|
198
201
|
| **Connections** | Managed internally via `ExitStack` | Must be opened externally |
|
|
199
|
-
| **Output** | `dict[str, list[str]]` | `pl.LazyFrame` |
|
|
202
|
+
| **Output** | `list[dict[str, Any]]` | `pl.LazyFrame` |
|
|
200
203
|
| **Logging** | Uses default (`log=True`) | Configurable |
|
|
201
204
|
| **Context params** | Not exposed (`section_hash`, `config_file`, `tag`) | Fully configurable |
|
|
202
205
|
| **Use case** | Standalone batch lookups, scripting, notebooks | Internal pipeline integration |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.3.0"
|
|
3
|
+
version = "7.3.2"
|
|
4
4
|
description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
@@ -56,7 +56,7 @@ dependencies = [
|
|
|
56
56
|
"scikit-learn>=1.8.0",
|
|
57
57
|
"sentence-transformers>=5.3.0",
|
|
58
58
|
"sqlite-utils>=3.39",
|
|
59
|
-
"typer>=0.
|
|
59
|
+
"typer>=0.21.2",
|
|
60
60
|
"xxhash>=3.6.0",
|
|
61
61
|
]
|
|
62
62
|
|
|
@@ -487,10 +487,11 @@ def resolve_many(
|
|
|
487
487
|
prioritize: Optional[list[Categories]] = None,
|
|
488
488
|
avoid: Optional[list[Categories]] = None,
|
|
489
489
|
column_context: bool = True,
|
|
490
|
-
) -> dict[str, list[str]]:
|
|
490
|
+
) -> list[dict[str, Any]]:
|
|
491
491
|
series: pl.Series = pl.Series(col, entities)
|
|
492
492
|
lf: pl.LazyFrame = series.to_frame().lazy()
|
|
493
493
|
|
|
494
|
+
lf = column(lf, add("original ", col), col)
|
|
494
495
|
lf = level_one(lf, col)
|
|
495
496
|
lf = level_two(lf, col)
|
|
496
497
|
|
|
@@ -503,4 +504,4 @@ def resolve_many(
|
|
|
503
504
|
lf = resolve(lf, col, conns, taxon=taxon, prioritize=prioritize, avoid=avoid, column_context=column_context)
|
|
504
505
|
|
|
505
506
|
df: pl.DataFrame = lf.collect()
|
|
506
|
-
return df.
|
|
507
|
+
return df.to_dicts()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|