tablassert 7.4.7__tar.gz → 7.4.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.4.7 → tablassert-7.4.8}/CHANGELOG.md +6 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/PKG-INFO +1 -1
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/changelog.md +2 -2
- {tablassert-7.4.7 → tablassert-7.4.8}/pyproject.toml +1 -2
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/qc.py +24 -11
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_qc.py +38 -6
- {tablassert-7.4.7 → tablassert-7.4.8}/uv.lock +1 -1
- {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/autotag.yml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/docker.yml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/docs.yml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/.gitignore +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/.pre-commit-config.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/AGENTS.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/CITATION.cff +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/CONTRIBUTING.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/Dockerfile +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/LICENSE +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/README.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/fullmap.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/lib.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/qc.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/utils.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/cli.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/advanced-example.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/graph.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/table.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/datassert.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/docker.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/index.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/installation.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/docs/tutorial.md +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/llms.txt +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/mkdocs.yml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/__init__.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/cli.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/downloader.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/enums.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/fullmap.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/ingests.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/lib.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/log.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/models.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/nlp.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/progress.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/utils.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/__init__.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/conftest.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/minimal_section.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_downloader.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_enums.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_fullmap.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_ingests.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_lib.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_models.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_nlp.py +0 -0
- {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_utils.py +0 -0
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## 7.4.8 - 2026-05-12
|
|
6
|
+
|
|
7
|
+
### Changes
|
|
8
|
+
- Expanded `fullmap_audit()` failure logging in `qc.py` to include the underlying score values that caused each rejection. Failed CURIE log lines now carry `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` alongside the existing `STORE`/`CONFIG`/`COL`/`ORIGINAL`/`PREFERRED`/`CURIE` fields, making it easier to diagnose why a term was dropped.
|
|
9
|
+
- Attached the per-row fuzzy and BERT scores as columns on the pending frame before masking, and switched the intermediate `pl.concat()` calls to `how="diagonal"` so the score columns survive concatenation with the already-passed rows.
|
|
10
|
+
|
|
5
11
|
## 7.4.7 - 2026-05-11
|
|
6
12
|
|
|
7
13
|
### Changes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.4.7
|
|
3
|
+
Version: 7.4.8
|
|
4
4
|
Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
6
|
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
@@ -4,10 +4,10 @@ The canonical release history lives in the repository root at [`CHANGELOG.md`](h
|
|
|
4
4
|
|
|
5
5
|
## Current Release Notes
|
|
6
6
|
|
|
7
|
-
## 7.4.7 - 2026-05-11
|
|
7
|
+
## 7.4.8 - 2026-05-12
|
|
8
8
|
|
|
9
9
|
### Changes
|
|
10
10
|
|
|
11
|
-
-
|
|
11
|
+
- Expanded `fullmap_audit()` failure logging in `qc.py` so each rejected CURIE log line now carries the underlying `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` score values, making it easier to diagnose why a term was dropped during QC.
|
|
12
12
|
|
|
13
13
|
For older releases and the full project history, open the root `CHANGELOG.md` in the repository.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.4.7"
|
|
3
|
+
version = "7.4.8"
|
|
4
4
|
description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
@@ -106,7 +106,6 @@ markers = ["network: requires internet", "gpu: requires CUDAExecutionProvider"]
|
|
|
106
106
|
line-length = 120
|
|
107
107
|
indent-width = 4
|
|
108
108
|
target-version = "py311"
|
|
109
|
-
exclude = ["*.tcss"]
|
|
110
109
|
|
|
111
110
|
[tool.ruff.format]
|
|
112
111
|
quote-style = "double"
|
|
@@ -133,7 +133,7 @@ def fullmap_audit(
|
|
|
133
133
|
passed: pl.DataFrame = pairs.filter(pl.col(out))
|
|
134
134
|
pending: pl.DataFrame = pairs.filter(~pl.col(out))
|
|
135
135
|
|
|
136
|
-
exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS"
|
|
136
|
+
exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS|^UNII|^PUBCHEM|^MONDO"
|
|
137
137
|
is_exempt: pl.DataFrame = pending.with_columns(pl.col(cols[0]).str.contains(exempt_curies).alias(out))
|
|
138
138
|
pairs = pl.concat((passed, is_exempt))
|
|
139
139
|
|
|
@@ -160,9 +160,11 @@ def fullmap_audit(
|
|
|
160
160
|
ratio_scores: object = cpdist(originals, preferreds, scorer=fuzz.ratio)
|
|
161
161
|
partial_scores: object = cpdist(originals, preferreds, scorer=fuzz.partial_token_sort_ratio)
|
|
162
162
|
|
|
163
|
-
|
|
163
|
+
pending = pending.with_columns([pl.Series("fuzz_ratio", ratio_scores), pl.Series("fuzz_partial", partial_scores)])
|
|
164
|
+
|
|
165
|
+
fuzz_mask: pl.Series = pl.Series(out, (ratio_scores >= 20) | (partial_scores >= 30), dtype=pl.Boolean)
|
|
164
166
|
masked_fuzz: pl.DataFrame = pending.with_columns(fuzz_mask)
|
|
165
|
-
pairs = pl.concat((passed, masked_fuzz))
|
|
167
|
+
pairs = pl.concat((passed, masked_fuzz), how="diagonal")
|
|
166
168
|
|
|
167
169
|
passed = pairs.filter(pl.col(out))
|
|
168
170
|
pending = pairs.filter(~pl.col(out))
|
|
@@ -179,22 +181,33 @@ def fullmap_audit(
|
|
|
179
181
|
n: int = len(originals)
|
|
180
182
|
similarity: object = cosine_similarity(embeddings[:n], embeddings[n:]).diagonal() # pyright: ignore
|
|
181
183
|
|
|
184
|
+
pending = pending.with_columns(pl.Series("bert_similarity", similarity)) # pyright: ignore
|
|
185
|
+
|
|
182
186
|
bert_mask: pl.Series = pl.Series(out, similarity >= 0.2, dtype=pl.Boolean) # pyright: ignore
|
|
183
187
|
BERT_fuzz: pl.DataFrame = pending.with_columns(bert_mask)
|
|
184
|
-
pairs = pl.concat((passed, BERT_fuzz))
|
|
188
|
+
pairs = pl.concat((passed, BERT_fuzz), how="diagonal")
|
|
185
189
|
|
|
186
190
|
passed = pairs.filter(pl.col(out))
|
|
187
191
|
pending = pairs.filter(~pl.col(out))
|
|
188
192
|
|
|
189
193
|
# * Add Logging For Failed CURIES
|
|
190
194
|
if log and pending.height > 0:
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
)
|
|
196
|
-
|
|
197
|
-
|
|
195
|
+
has_bert: bool = "bert_similarity" in pending.columns
|
|
196
|
+
curies: list[str] = pending.get_column(col).to_list()
|
|
197
|
+
originals_list: list[str] = pending.get_column(original).to_list()
|
|
198
|
+
preferreds_list: list[str] = pending.get_column(preferred).to_list()
|
|
199
|
+
fuzz_ratios: list[object] = pending.get_column("fuzz_ratio").to_list()
|
|
200
|
+
fuzz_partials: list[object] = pending.get_column("fuzz_partial").to_list()
|
|
201
|
+
bert_sims: list[object] = pending.get_column("bert_similarity").to_list() if has_bert else []
|
|
202
|
+
|
|
203
|
+
for i, c in enumerate(curies):
|
|
204
|
+
msg: str = (
|
|
205
|
+
f"FAILED | STORE: {section_hash} | CONFIG: {config_file} | COL: {col}"
|
|
206
|
+
f" | ORIGINAL: {originals_list[i]!r} | PREFERRED: {preferreds_list[i]!r} | CURIE: {c!r}"
|
|
207
|
+
f" | FUZZ_RATIO: {fuzz_ratios[i]} | FUZZ_PARTIAL: {fuzz_partials[i]}"
|
|
198
208
|
)
|
|
209
|
+
if has_bert:
|
|
210
|
+
msg = f"{msg} | BERT_SIMILARITY: {bert_sims[i]}"
|
|
211
|
+
logger.info(msg)
|
|
199
212
|
|
|
200
213
|
return df.join(passed.select(col), on=col, how="semi").lazy()
|
|
@@ -108,9 +108,7 @@ def test_fullmap_audit_suppresses_logs(monkeypatch: Any) -> None:
|
|
|
108
108
|
monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
|
|
109
109
|
monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
|
|
110
110
|
|
|
111
|
-
lf: pl.LazyFrame = pl.DataFrame(
|
|
112
|
-
{"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
|
|
113
|
-
).lazy()
|
|
111
|
+
lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
|
|
114
112
|
result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=False).collect()
|
|
115
113
|
|
|
116
114
|
assert result.height == 0
|
|
@@ -141,9 +139,7 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
|
|
|
141
139
|
monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
|
|
142
140
|
monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
|
|
143
141
|
|
|
144
|
-
lf: pl.LazyFrame = pl.DataFrame(
|
|
145
|
-
{"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
|
|
146
|
-
).lazy()
|
|
142
|
+
lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
|
|
147
143
|
result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=True).collect()
|
|
148
144
|
|
|
149
145
|
assert result.height == 0
|
|
@@ -151,6 +147,42 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
|
|
|
151
147
|
assert "FAILED" in messages[0]
|
|
152
148
|
assert "STORE: store123" in messages[0]
|
|
153
149
|
assert "CONFIG: config.yaml" in messages[0]
|
|
150
|
+
assert "FUZZ_RATIO:" in messages[0]
|
|
151
|
+
assert "FUZZ_PARTIAL:" in messages[0]
|
|
152
|
+
assert "BERT_SIMILARITY:" in messages[0]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ? fullmap_audit Log Message Contains Expected Score Values
|
|
156
|
+
def test_fullmap_audit_log_score_values(monkeypatch: Any) -> None:
|
|
157
|
+
messages: list[str] = []
|
|
158
|
+
|
|
159
|
+
class DummyLogger:
|
|
160
|
+
def info(self, message: str) -> None:
|
|
161
|
+
messages.append(message)
|
|
162
|
+
|
|
163
|
+
class DummyBioBERT:
|
|
164
|
+
def encode(self, values: list[str]) -> object:
|
|
165
|
+
return np.array([[0.0], [1.0]])
|
|
166
|
+
|
|
167
|
+
def fake_cpdist(left: list[str], right: list[str], scorer: Any) -> object:
|
|
168
|
+
return np.array([5.0]) if scorer == fuzz.ratio else np.array([8.0])
|
|
169
|
+
|
|
170
|
+
def fake_cosine_similarity(left: object, right: object) -> object:
|
|
171
|
+
return np.array([[0.05]])
|
|
172
|
+
|
|
173
|
+
monkeypatch.setattr(qc, "logger", DummyLogger())
|
|
174
|
+
monkeypatch.setattr(qc, "get_biobert", lambda provider=None: DummyBioBERT())
|
|
175
|
+
monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
|
|
176
|
+
monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
|
|
177
|
+
|
|
178
|
+
lf: pl.LazyFrame = pl.DataFrame({"subject": ["X:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
|
|
179
|
+
result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store456", "cfg.yaml", log=True).collect()
|
|
180
|
+
|
|
181
|
+
assert result.height == 0
|
|
182
|
+
assert len(messages) == 1
|
|
183
|
+
assert "FUZZ_RATIO: 5.0" in messages[0]
|
|
184
|
+
assert "FUZZ_PARTIAL: 8.0" in messages[0]
|
|
185
|
+
assert "BERT_SIMILARITY: 0.05" in messages[0]
|
|
154
186
|
|
|
155
187
|
|
|
156
188
|
# ? GPU Runtime Can Be Forced To CPU
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|