PyPI - tablassert - Version diff - 7.4.7.tar.gz → 7.4.8.tar.gz - Mend

tablassert 7.4.7.tar.gz → 7.4.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

{tablassert-7.4.7 → tablassert-7.4.8}/CHANGELOG.md RENAMED Viewed

@@ -2,6 +2,12 @@
 All notable changes to this project are documented in this file.
+## 7.4.8 - 2026-05-12
+### Changes
+- Expanded `fullmap_audit()` failure logging in `qc.py` to include the underlying score values that caused each rejection. Failed CURIE log lines now carry `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` alongside the existing `STORE`/`CONFIG`/`COL`/`ORIGINAL`/`PREFERRED`/`CURIE` fields, making it easier to diagnose why a term was dropped.
+- Attached the per-row fuzzy and BERT scores as columns on the pending frame before masking, and switched the intermediate `pl.concat()` calls to `how="diagonal"` so the score columns survive concatenation with the already-passed rows.
 ## 7.4.7 - 2026-05-11
 ### Changes

{tablassert-7.4.7 → tablassert-7.4.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tablassert
-Version: 7.4.7
+Version: 7.4.8
 Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
 Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
 Project-URL: Source, https://github.com/SkyeAv/Tablassert

{tablassert-7.4.7 → tablassert-7.4.8}/docs/changelog.md RENAMED Viewed

@@ -4,10 +4,10 @@ The canonical release history lives in the repository root at [`CHANGELOG.md`](h
 ## Current Release Notes
-## 7.4.7 - 2026-05-11
+## 7.4.8 - 2026-05-12
 ### Changes
-- Added an `is_valid_pmc_id` model validator on `Provenance` that enforces `publication` starts with `PMC` followed by digits when `repo` is `PMC`. The constraint was previously documented in 7.3.6 but only now enforced at parse time.
+- Expanded `fullmap_audit()` failure logging in `qc.py` so each rejected CURIE log line now carries the underlying `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` score values, making it easier to diagnose why a term was dropped during QC.
 For older releases and the full project history, open the root `CHANGELOG.md` in the repository.

{tablassert-7.4.7 → tablassert-7.4.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "tablassert"
-version = "7.4.7"
+version = "7.4.8"
 description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
 authors = [
     { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -106,7 +106,6 @@ markers = ["network: requires internet", "gpu: requires CUDAExecutionProvider"]
 line-length = 120
 indent-width = 4
 target-version = "py311"
-exclude = ["*.tcss"]
 [tool.ruff.format]
 quote-style = "double"

{tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/qc.py RENAMED Viewed

@@ -133,7 +133,7 @@ def fullmap_audit(
     passed: pl.DataFrame = pairs.filter(pl.col(out))
     pending: pl.DataFrame = pairs.filter(~pl.col(out))
-    exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS"
+    exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS|^UNII|^PUBCHEM|^MONDO"
     is_exempt: pl.DataFrame = pending.with_columns(pl.col(cols[0]).str.contains(exempt_curies).alias(out))
     pairs = pl.concat((passed, is_exempt))
@@ -160,9 +160,11 @@ def fullmap_audit(
     ratio_scores: object = cpdist(originals, preferreds, scorer=fuzz.ratio)
     partial_scores: object = cpdist(originals, preferreds, scorer=fuzz.partial_token_sort_ratio)
-    fuzz_mask: pl.Series = pl.Series(out, (ratio_scores >= 20) | (partial_scores >= 20), dtype=pl.Boolean)
+    pending = pending.with_columns([pl.Series("fuzz_ratio", ratio_scores), pl.Series("fuzz_partial", partial_scores)])
+    fuzz_mask: pl.Series = pl.Series(out, (ratio_scores >= 20) | (partial_scores >= 30), dtype=pl.Boolean)
     masked_fuzz: pl.DataFrame = pending.with_columns(fuzz_mask)
-    pairs = pl.concat((passed, masked_fuzz))
+    pairs = pl.concat((passed, masked_fuzz), how="diagonal")
     passed = pairs.filter(pl.col(out))
     pending = pairs.filter(~pl.col(out))
@@ -179,22 +181,33 @@ def fullmap_audit(
     n: int = len(originals)
     similarity: object = cosine_similarity(embeddings[:n], embeddings[n:]).diagonal()  # pyright: ignore
+    pending = pending.with_columns(pl.Series("bert_similarity", similarity))  # pyright: ignore
     bert_mask: pl.Series = pl.Series(out, similarity >= 0.2, dtype=pl.Boolean)  # pyright: ignore
     BERT_fuzz: pl.DataFrame = pending.with_columns(bert_mask)
-    pairs = pl.concat((passed, BERT_fuzz))
+    pairs = pl.concat((passed, BERT_fuzz), how="diagonal")
     passed = pairs.filter(pl.col(out))
     pending = pairs.filter(~pl.col(out))
     # * Add Logging For Failed CURIES
     if log and pending.height > 0:
-        for c, o, p in zip(
-            pending.get_column(col).to_list(),
-            pending.get_column(original).to_list(),
-            pending.get_column(preferred).to_list(),
-        ):
-            logger.info(
-                f"FAILED | STORE: {section_hash} | CONFIG: {config_file} | COL: {col} | ORIGINAL: {o!r} | PREFERRED: {p!r} | CURIE: {c!r}"
+        has_bert: bool = "bert_similarity" in pending.columns
+        curies: list[str] = pending.get_column(col).to_list()
+        originals_list: list[str] = pending.get_column(original).to_list()
+        preferreds_list: list[str] = pending.get_column(preferred).to_list()
+        fuzz_ratios: list[object] = pending.get_column("fuzz_ratio").to_list()
+        fuzz_partials: list[object] = pending.get_column("fuzz_partial").to_list()
+        bert_sims: list[object] = pending.get_column("bert_similarity").to_list() if has_bert else []
+        for i, c in enumerate(curies):
+            msg: str = (
+                f"FAILED | STORE: {section_hash} | CONFIG: {config_file} | COL: {col}"
+                f" | ORIGINAL: {originals_list[i]!r} | PREFERRED: {preferreds_list[i]!r} | CURIE: {c!r}"
+                f" | FUZZ_RATIO: {fuzz_ratios[i]} | FUZZ_PARTIAL: {fuzz_partials[i]}"
             )
+            if has_bert:
+                msg = f"{msg} | BERT_SIMILARITY: {bert_sims[i]}"
+            logger.info(msg)
     return df.join(passed.select(col), on=col, how="semi").lazy()

{tablassert-7.4.7 → tablassert-7.4.8}/tests/test_qc.py RENAMED Viewed

@@ -108,9 +108,7 @@ def test_fullmap_audit_suppresses_logs(monkeypatch: Any) -> None:
     monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
     monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
-    lf: pl.LazyFrame = pl.DataFrame(
-        {"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
-    ).lazy()
+    lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
     result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=False).collect()
     assert result.height == 0
@@ -141,9 +139,7 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
     monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
     monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
-    lf: pl.LazyFrame = pl.DataFrame(
-        {"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
-    ).lazy()
+    lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
     result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=True).collect()
     assert result.height == 0
@@ -151,6 +147,42 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
     assert "FAILED" in messages[0]
     assert "STORE: store123" in messages[0]
     assert "CONFIG: config.yaml" in messages[0]
+    assert "FUZZ_RATIO:" in messages[0]
+    assert "FUZZ_PARTIAL:" in messages[0]
+    assert "BERT_SIMILARITY:" in messages[0]
+# ? fullmap_audit Log Message Contains Expected Score Values
+def test_fullmap_audit_log_score_values(monkeypatch: Any) -> None:
+    messages: list[str] = []
+    class DummyLogger:
+        def info(self, message: str) -> None:
+            messages.append(message)
+    class DummyBioBERT:
+        def encode(self, values: list[str]) -> object:
+            return np.array([[0.0], [1.0]])
+    def fake_cpdist(left: list[str], right: list[str], scorer: Any) -> object:
+        return np.array([5.0]) if scorer == fuzz.ratio else np.array([8.0])
+    def fake_cosine_similarity(left: object, right: object) -> object:
+        return np.array([[0.05]])
+    monkeypatch.setattr(qc, "logger", DummyLogger())
+    monkeypatch.setattr(qc, "get_biobert", lambda provider=None: DummyBioBERT())
+    monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
+    monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
+    lf: pl.LazyFrame = pl.DataFrame({"subject": ["X:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
+    result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store456", "cfg.yaml", log=True).collect()
+    assert result.height == 0
+    assert len(messages) == 1
+    assert "FUZZ_RATIO: 5.0" in messages[0]
+    assert "FUZZ_PARTIAL: 8.0" in messages[0]
+    assert "BERT_SIMILARITY: 0.05" in messages[0]
 # ? GPU Runtime Can Be Forced To CPU

{tablassert-7.4.7 → tablassert-7.4.8}/uv.lock RENAMED Viewed

@@ -2360,7 +2360,7 @@ wheels = [
 [[package]]
 name = "tablassert"
-version = "7.4.7"
+version = "7.4.8"
 source = { editable = "." }
 dependencies = [
     { name = "cyclopts" },