tablassert 7.4.7__tar.gz → 7.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {tablassert-7.4.7 → tablassert-7.4.8}/CHANGELOG.md +6 -0
  2. {tablassert-7.4.7 → tablassert-7.4.8}/PKG-INFO +1 -1
  3. {tablassert-7.4.7 → tablassert-7.4.8}/docs/changelog.md +2 -2
  4. {tablassert-7.4.7 → tablassert-7.4.8}/pyproject.toml +1 -2
  5. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/qc.py +24 -11
  6. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_qc.py +38 -6
  7. {tablassert-7.4.7 → tablassert-7.4.8}/uv.lock +1 -1
  8. {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/autotag.yml +0 -0
  9. {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/docker.yml +0 -0
  10. {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/docs.yml +0 -0
  11. {tablassert-7.4.7 → tablassert-7.4.8}/.github/workflows/pipy.yml +0 -0
  12. {tablassert-7.4.7 → tablassert-7.4.8}/.gitignore +0 -0
  13. {tablassert-7.4.7 → tablassert-7.4.8}/.pre-commit-config.yaml +0 -0
  14. {tablassert-7.4.7 → tablassert-7.4.8}/AGENTS.md +0 -0
  15. {tablassert-7.4.7 → tablassert-7.4.8}/CITATION.cff +0 -0
  16. {tablassert-7.4.7 → tablassert-7.4.8}/CONTRIBUTING.md +0 -0
  17. {tablassert-7.4.7 → tablassert-7.4.8}/Dockerfile +0 -0
  18. {tablassert-7.4.7 → tablassert-7.4.8}/LICENSE +0 -0
  19. {tablassert-7.4.7 → tablassert-7.4.8}/README.md +0 -0
  20. {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/fullmap.md +0 -0
  21. {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/lib.md +0 -0
  22. {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/qc.md +0 -0
  23. {tablassert-7.4.7 → tablassert-7.4.8}/docs/api/utils.md +0 -0
  24. {tablassert-7.4.7 → tablassert-7.4.8}/docs/cli.md +0 -0
  25. {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/advanced-example.md +0 -0
  26. {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/graph.md +0 -0
  27. {tablassert-7.4.7 → tablassert-7.4.8}/docs/configuration/table.md +0 -0
  28. {tablassert-7.4.7 → tablassert-7.4.8}/docs/datassert.md +0 -0
  29. {tablassert-7.4.7 → tablassert-7.4.8}/docs/docker.md +0 -0
  30. {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-data.csv +0 -0
  31. {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-graph.yaml +0 -0
  32. {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples/tutorial-table.yaml +0 -0
  33. {tablassert-7.4.7 → tablassert-7.4.8}/docs/examples.md +0 -0
  34. {tablassert-7.4.7 → tablassert-7.4.8}/docs/index.md +0 -0
  35. {tablassert-7.4.7 → tablassert-7.4.8}/docs/installation.md +0 -0
  36. {tablassert-7.4.7 → tablassert-7.4.8}/docs/tutorial.md +0 -0
  37. {tablassert-7.4.7 → tablassert-7.4.8}/llms.txt +0 -0
  38. {tablassert-7.4.7 → tablassert-7.4.8}/mkdocs.yml +0 -0
  39. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/__init__.py +0 -0
  40. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/cli.py +0 -0
  41. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/downloader.py +0 -0
  42. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/enums.py +0 -0
  43. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/fullmap.py +0 -0
  44. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/ingests.py +0 -0
  45. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/lib.py +0 -0
  46. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/log.py +0 -0
  47. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/models.py +0 -0
  48. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/nlp.py +0 -0
  49. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/progress.py +0 -0
  50. {tablassert-7.4.7 → tablassert-7.4.8}/src/tablassert/utils.py +0 -0
  51. {tablassert-7.4.7 → tablassert-7.4.8}/tests/__init__.py +0 -0
  52. {tablassert-7.4.7 → tablassert-7.4.8}/tests/conftest.py +0 -0
  53. {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
  54. {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/minimal_section.yaml +0 -0
  55. {tablassert-7.4.7 → tablassert-7.4.8}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
  56. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_downloader.py +0 -0
  57. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_enums.py +0 -0
  58. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_fullmap.py +0 -0
  59. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_ingests.py +0 -0
  60. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_lib.py +0 -0
  61. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_models.py +0 -0
  62. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_nlp.py +0 -0
  63. {tablassert-7.4.7 → tablassert-7.4.8}/tests/test_utils.py +0 -0
@@ -2,6 +2,12 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.4.8 - 2026-05-12
6
+
7
+ ### Changes
8
+ - Expanded `fullmap_audit()` failure logging in `qc.py` to include the underlying score values that caused each rejection. Failed CURIE log lines now carry `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` alongside the existing `STORE`/`CONFIG`/`COL`/`ORIGINAL`/`PREFERRED`/`CURIE` fields, making it easier to diagnose why a term was dropped.
9
+ - Attached the per-row fuzzy and BERT scores as columns on the pending frame before masking, and switched the intermediate `pl.concat()` calls to `how="diagonal"` so the score columns survive concatenation with the already-passed rows.
10
+
5
11
  ## 7.4.7 - 2026-05-11
6
12
 
7
13
  ### Changes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.4.7
3
+ Version: 7.4.8
4
4
  Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
@@ -4,10 +4,10 @@ The canonical release history lives in the repository root at [`CHANGELOG.md`](h
4
4
 
5
5
  ## Current Release Notes
6
6
 
7
- ## 7.4.7 - 2026-05-11
7
+ ## 7.4.8 - 2026-05-12
8
8
 
9
9
  ### Changes
10
10
 
11
- - Added an `is_valid_pmc_id` model validator on `Provenance` that enforces `publication` starts with `PMC` followed by digits when `repo` is `PMC`. The constraint was previously documented in 7.3.6 but only now enforced at parse time.
11
+ - Expanded `fullmap_audit()` failure logging in `qc.py` so each rejected CURIE log line now carries the underlying `FUZZ_RATIO`, `FUZZ_PARTIAL`, and (when the BERT stage ran) `BERT_SIMILARITY` score values, making it easier to diagnose why a term was dropped during QC.
12
12
 
13
13
  For older releases and the full project history, open the root `CHANGELOG.md` in the repository.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.4.7"
3
+ version = "7.4.8"
4
4
  description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
5
5
  authors = [
6
6
  { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -106,7 +106,6 @@ markers = ["network: requires internet", "gpu: requires CUDAExecutionProvider"]
106
106
  line-length = 120
107
107
  indent-width = 4
108
108
  target-version = "py311"
109
- exclude = ["*.tcss"]
110
109
 
111
110
  [tool.ruff.format]
112
111
  quote-style = "double"
@@ -133,7 +133,7 @@ def fullmap_audit(
133
133
  passed: pl.DataFrame = pairs.filter(pl.col(out))
134
134
  pending: pl.DataFrame = pairs.filter(~pl.col(out))
135
135
 
136
- exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS"
136
+ exempt_curies: str = r"^CHEBI|^PR|^UniProtKB|^NCBIGene|^UMLS|^UNII|^PUBCHEM|^MONDO"
137
137
  is_exempt: pl.DataFrame = pending.with_columns(pl.col(cols[0]).str.contains(exempt_curies).alias(out))
138
138
  pairs = pl.concat((passed, is_exempt))
139
139
 
@@ -160,9 +160,11 @@ def fullmap_audit(
160
160
  ratio_scores: object = cpdist(originals, preferreds, scorer=fuzz.ratio)
161
161
  partial_scores: object = cpdist(originals, preferreds, scorer=fuzz.partial_token_sort_ratio)
162
162
 
163
- fuzz_mask: pl.Series = pl.Series(out, (ratio_scores >= 20) | (partial_scores >= 20), dtype=pl.Boolean)
163
+ pending = pending.with_columns([pl.Series("fuzz_ratio", ratio_scores), pl.Series("fuzz_partial", partial_scores)])
164
+
165
+ fuzz_mask: pl.Series = pl.Series(out, (ratio_scores >= 20) | (partial_scores >= 30), dtype=pl.Boolean)
164
166
  masked_fuzz: pl.DataFrame = pending.with_columns(fuzz_mask)
165
- pairs = pl.concat((passed, masked_fuzz))
167
+ pairs = pl.concat((passed, masked_fuzz), how="diagonal")
166
168
 
167
169
  passed = pairs.filter(pl.col(out))
168
170
  pending = pairs.filter(~pl.col(out))
@@ -179,22 +181,33 @@ def fullmap_audit(
179
181
  n: int = len(originals)
180
182
  similarity: object = cosine_similarity(embeddings[:n], embeddings[n:]).diagonal() # pyright: ignore
181
183
 
184
+ pending = pending.with_columns(pl.Series("bert_similarity", similarity)) # pyright: ignore
185
+
182
186
  bert_mask: pl.Series = pl.Series(out, similarity >= 0.2, dtype=pl.Boolean) # pyright: ignore
183
187
  BERT_fuzz: pl.DataFrame = pending.with_columns(bert_mask)
184
- pairs = pl.concat((passed, BERT_fuzz))
188
+ pairs = pl.concat((passed, BERT_fuzz), how="diagonal")
185
189
 
186
190
  passed = pairs.filter(pl.col(out))
187
191
  pending = pairs.filter(~pl.col(out))
188
192
 
189
193
  # * Add Logging For Failed CURIES
190
194
  if log and pending.height > 0:
191
- for c, o, p in zip(
192
- pending.get_column(col).to_list(),
193
- pending.get_column(original).to_list(),
194
- pending.get_column(preferred).to_list(),
195
- ):
196
- logger.info(
197
- f"FAILED | STORE: {section_hash} | CONFIG: {config_file} | COL: {col} | ORIGINAL: {o!r} | PREFERRED: {p!r} | CURIE: {c!r}"
195
+ has_bert: bool = "bert_similarity" in pending.columns
196
+ curies: list[str] = pending.get_column(col).to_list()
197
+ originals_list: list[str] = pending.get_column(original).to_list()
198
+ preferreds_list: list[str] = pending.get_column(preferred).to_list()
199
+ fuzz_ratios: list[object] = pending.get_column("fuzz_ratio").to_list()
200
+ fuzz_partials: list[object] = pending.get_column("fuzz_partial").to_list()
201
+ bert_sims: list[object] = pending.get_column("bert_similarity").to_list() if has_bert else []
202
+
203
+ for i, c in enumerate(curies):
204
+ msg: str = (
205
+ f"FAILED | STORE: {section_hash} | CONFIG: {config_file} | COL: {col}"
206
+ f" | ORIGINAL: {originals_list[i]!r} | PREFERRED: {preferreds_list[i]!r} | CURIE: {c!r}"
207
+ f" | FUZZ_RATIO: {fuzz_ratios[i]} | FUZZ_PARTIAL: {fuzz_partials[i]}"
198
208
  )
209
+ if has_bert:
210
+ msg = f"{msg} | BERT_SIMILARITY: {bert_sims[i]}"
211
+ logger.info(msg)
199
212
 
200
213
  return df.join(passed.select(col), on=col, how="semi").lazy()
@@ -108,9 +108,7 @@ def test_fullmap_audit_suppresses_logs(monkeypatch: Any) -> None:
108
108
  monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
109
109
  monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
110
110
 
111
- lf: pl.LazyFrame = pl.DataFrame(
112
- {"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
113
- ).lazy()
111
+ lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
114
112
  result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=False).collect()
115
113
 
116
114
  assert result.height == 0
@@ -141,9 +139,7 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
141
139
  monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
142
140
  monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
143
141
 
144
- lf: pl.LazyFrame = pl.DataFrame(
145
- {"subject": ["MONDO:1"], "original subject": ["foo"], "subject name": ["bar"]}
146
- ).lazy()
142
+ lf: pl.LazyFrame = pl.DataFrame({"subject": ["FOO:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
147
143
  result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store123", "config.yaml", log=True).collect()
148
144
 
149
145
  assert result.height == 0
@@ -151,6 +147,42 @@ def test_fullmap_audit_logs_failures(monkeypatch: Any) -> None:
151
147
  assert "FAILED" in messages[0]
152
148
  assert "STORE: store123" in messages[0]
153
149
  assert "CONFIG: config.yaml" in messages[0]
150
+ assert "FUZZ_RATIO:" in messages[0]
151
+ assert "FUZZ_PARTIAL:" in messages[0]
152
+ assert "BERT_SIMILARITY:" in messages[0]
153
+
154
+
155
+ # ? fullmap_audit Log Message Contains Expected Score Values
156
+ def test_fullmap_audit_log_score_values(monkeypatch: Any) -> None:
157
+ messages: list[str] = []
158
+
159
+ class DummyLogger:
160
+ def info(self, message: str) -> None:
161
+ messages.append(message)
162
+
163
+ class DummyBioBERT:
164
+ def encode(self, values: list[str]) -> object:
165
+ return np.array([[0.0], [1.0]])
166
+
167
+ def fake_cpdist(left: list[str], right: list[str], scorer: Any) -> object:
168
+ return np.array([5.0]) if scorer == fuzz.ratio else np.array([8.0])
169
+
170
+ def fake_cosine_similarity(left: object, right: object) -> object:
171
+ return np.array([[0.05]])
172
+
173
+ monkeypatch.setattr(qc, "logger", DummyLogger())
174
+ monkeypatch.setattr(qc, "get_biobert", lambda provider=None: DummyBioBERT())
175
+ monkeypatch.setattr(rf_process, "cpdist", fake_cpdist)
176
+ monkeypatch.setattr(pairwise, "cosine_similarity", fake_cosine_similarity)
177
+
178
+ lf: pl.LazyFrame = pl.DataFrame({"subject": ["X:1"], "original subject": ["foo"], "subject name": ["bar"]}).lazy()
179
+ result: pl.DataFrame = qc.fullmap_audit(lf, "subject", "store456", "cfg.yaml", log=True).collect()
180
+
181
+ assert result.height == 0
182
+ assert len(messages) == 1
183
+ assert "FUZZ_RATIO: 5.0" in messages[0]
184
+ assert "FUZZ_PARTIAL: 8.0" in messages[0]
185
+ assert "BERT_SIMILARITY: 0.05" in messages[0]
154
186
 
155
187
 
156
188
  # ? GPU Runtime Can Be Forced To CPU
@@ -2360,7 +2360,7 @@ wheels = [
2360
2360
 
2361
2361
  [[package]]
2362
2362
  name = "tablassert"
2363
- version = "7.4.7"
2363
+ version = "7.4.8"
2364
2364
  source = { editable = "." }
2365
2365
  dependencies = [
2366
2366
  { name = "cyclopts" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes