tablassert 7.0.1__tar.gz → 7.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. tablassert-7.0.2/.pre-commit-config.yaml +15 -0
  2. {tablassert-7.0.1 → tablassert-7.0.2}/CHANGELOG.md +15 -0
  3. {tablassert-7.0.1 → tablassert-7.0.2}/PKG-INFO +2 -3
  4. {tablassert-7.0.1 → tablassert-7.0.2}/docs/api/fullmap.md +28 -15
  5. {tablassert-7.0.1 → tablassert-7.0.2}/docs/api/qc.md +7 -9
  6. {tablassert-7.0.1 → tablassert-7.0.2}/docs/cli.md +1 -1
  7. {tablassert-7.0.1 → tablassert-7.0.2}/pyproject.toml +3 -5
  8. tablassert-7.0.2/src/tablassert/downloader.py +37 -0
  9. tablassert-7.0.2/src/tablassert/enums.py +521 -0
  10. tablassert-7.0.2/src/tablassert/fullmap.py +177 -0
  11. tablassert-7.0.2/src/tablassert/ingests.py +43 -0
  12. tablassert-7.0.2/src/tablassert/lib.py +586 -0
  13. tablassert-7.0.2/src/tablassert/log.py +16 -0
  14. tablassert-7.0.2/src/tablassert/models.py +132 -0
  15. tablassert-7.0.2/src/tablassert/qc.py +129 -0
  16. tablassert-7.0.2/src/tablassert/utils.py +43 -0
  17. {tablassert-7.0.1 → tablassert-7.0.2}/uv.lock +1 -1
  18. tablassert-7.0.1/.planning/PROJECT.md +0 -47
  19. tablassert-7.0.1/.planning/REQUIREMENTS.md +0 -73
  20. tablassert-7.0.1/.planning/ROADMAP.md +0 -66
  21. tablassert-7.0.1/.planning/STATE.md +0 -79
  22. tablassert-7.0.1/.planning/config.json +0 -15
  23. tablassert-7.0.1/.planning/quick/1-please-add-a-github-action-that-runs-uv-/1-PLAN.md +0 -90
  24. tablassert-7.0.1/.planning/quick/1-please-add-a-github-action-that-runs-uv-/1-SUMMARY.md +0 -80
  25. tablassert-7.0.1/.planning/research/ARCHITECTURE.md +0 -220
  26. tablassert-7.0.1/.planning/research/FEATURES.md +0 -134
  27. tablassert-7.0.1/.planning/research/PITFALLS.md +0 -219
  28. tablassert-7.0.1/.planning/research/STACK.md +0 -140
  29. tablassert-7.0.1/.planning/research/SUMMARY.md +0 -146
  30. tablassert-7.0.1/.pre-commit-config.yaml +0 -15
  31. tablassert-7.0.1/src/tablassert/downloader.py +0 -35
  32. tablassert-7.0.1/src/tablassert/enums.py +0 -521
  33. tablassert-7.0.1/src/tablassert/fullmap.py +0 -167
  34. tablassert-7.0.1/src/tablassert/ingests.py +0 -43
  35. tablassert-7.0.1/src/tablassert/lib.py +0 -602
  36. tablassert-7.0.1/src/tablassert/log.py +0 -15
  37. tablassert-7.0.1/src/tablassert/models.py +0 -131
  38. tablassert-7.0.1/src/tablassert/qc.py +0 -124
  39. tablassert-7.0.1/src/tablassert/utils.py +0 -43
  40. {tablassert-7.0.1 → tablassert-7.0.2}/.github/workflows/docs.yml +0 -0
  41. {tablassert-7.0.1 → tablassert-7.0.2}/.github/workflows/pipy.yml +0 -0
  42. {tablassert-7.0.1 → tablassert-7.0.2}/.gitignore +0 -0
  43. {tablassert-7.0.1 → tablassert-7.0.2}/.python-version +0 -0
  44. {tablassert-7.0.1 → tablassert-7.0.2}/.vscode/settings.json +0 -0
  45. {tablassert-7.0.1 → tablassert-7.0.2}/LICENSE +0 -0
  46. {tablassert-7.0.1 → tablassert-7.0.2}/README.md +0 -0
  47. {tablassert-7.0.1 → tablassert-7.0.2}/docs/api/utils.md +0 -0
  48. {tablassert-7.0.1 → tablassert-7.0.2}/docs/configuration/advanced-example.md +0 -0
  49. {tablassert-7.0.1 → tablassert-7.0.2}/docs/configuration/graph.md +0 -0
  50. {tablassert-7.0.1 → tablassert-7.0.2}/docs/configuration/table.md +0 -0
  51. {tablassert-7.0.1 → tablassert-7.0.2}/docs/examples/tutorial-data.csv +0 -0
  52. {tablassert-7.0.1 → tablassert-7.0.2}/docs/examples/tutorial-graph.yaml +0 -0
  53. {tablassert-7.0.1 → tablassert-7.0.2}/docs/examples/tutorial-table.yaml +0 -0
  54. {tablassert-7.0.1 → tablassert-7.0.2}/docs/index.md +0 -0
  55. {tablassert-7.0.1 → tablassert-7.0.2}/docs/installation.md +0 -0
  56. {tablassert-7.0.1 → tablassert-7.0.2}/docs/tutorial.md +0 -0
  57. {tablassert-7.0.1 → tablassert-7.0.2}/mkdocs.yml +0 -0
  58. {tablassert-7.0.1 → tablassert-7.0.2}/src/tablassert/__init__.py +0 -0
@@ -0,0 +1,15 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.9.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: local
9
+ hooks:
10
+ - id: pyright
11
+ name: pyright
12
+ entry: uv run pyright
13
+ language: system
14
+ types: [python]
15
+ pass_filenames: false
@@ -2,6 +2,21 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.0.2 - 2026-03-23
6
+
7
+ ### Changes
8
+ - Updated package metadata for the 7.0.2 release.
9
+ - Added optional `log` and `column_context` controls to `fullmap.version4()` for more configurable entity-resolution behavior.
10
+
11
+ ### Bug Fixes
12
+ - Reworked entity-resolution querying to register terms directly in DuckDB instead of writing temporary parquet files, removing tempfile lifecycle issues in `fullmap` query execution.
13
+ - Isolated unmatched-entity logging into a dedicated helper and gated it behind an explicit logging flag.
14
+
15
+ ### Documentation
16
+ - Updated API reference docs to match the current `version4()` function signature and behavior.
17
+ - Corrected QC documentation to reflect the implemented fuzzy/BERT validation pipeline.
18
+ - Fixed documentation path typos for cache/store artifact directories.
19
+
5
20
  ## 7.0.1 - 2026-03-17
6
21
 
7
22
  ### Documentation
@@ -1,12 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.0.1
3
+ Version: 7.0.2
4
4
  Summary: Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
7
7
  Project-URL: Documentation, https://skyeav.github.io/Tablassert/
8
- Author: Jared C. Roach
9
- Author-email: Skye Lane Goetz <sgoetz@isbscience.org>, Gwennen Glusman <gglusman@isbscience.org>
8
+ Author-email: Skye Lane Goetz <sgoetz@isbscience.org>
10
9
  License-Expression: Apache-2.0
11
10
  License-File: LICENSE
12
11
  Keywords: declarative pipeline,knowledge graph,natural language processing,ncats translator,ner,tablassert,table mining,yaml configuration
@@ -13,11 +13,13 @@ def version4(
13
13
  lf: pl.LazyFrame,
14
14
  col: str,
15
15
  conn: object,
16
- taxon: Optional[str],
17
- prioritize: Optional[list[Categories]],
18
- avoid: Optional[list[Categories]],
19
- section_hash: str,
20
- config_file: str,
16
+ taxon: Optional[str] = None,
17
+ prioritize: Optional[list[Categories]] = None,
18
+ avoid: Optional[list[Categories]] = None,
19
+ log: bool = True,
20
+ section_hash: Optional[str] = None,
21
+ config_file: Optional[str] = None,
22
+ column_context: bool = True,
21
23
  tag: str = " one"
22
24
  ) -> pl.LazyFrame
23
25
  ```
@@ -61,6 +63,18 @@ Optional list of Biolink categories to exclude from results.
61
63
 
62
64
  Example: `[Categories.Gene]` prevents gene mappings.
63
65
 
66
+ **`log: bool` (default: `True`)**
67
+
68
+ Controls unmatched-value logging. When enabled, unresolved terms are logged with section/config/column context.
69
+
70
+ **`section_hash: Optional[str]` / `config_file: Optional[str]`**
71
+
72
+ Optional context fields used for operational logging when unmatched values are encountered.
73
+
74
+ **`column_context: bool` (default: `True`)**
75
+
76
+ Controls category-frequency tie-breaking when multiple matches exist for a term. When `True`, the query result adds a category frequency score and prefers more frequent category hits.
77
+
64
78
  **`tag: str` (default: `" one"`)**
65
79
 
66
80
  Suffix for NLP processing level column.
@@ -71,10 +85,6 @@ The function looks for both:
71
85
 
72
86
  Default `" one"` means it uses level-one text processing (lowercase, stripped).
73
87
 
74
- **`section_hash: str` / `config_file: str`**
75
-
76
- Context fields used for operational logging when unmatched values are encountered.
77
-
78
88
  ### Return Value
79
89
 
80
90
  Returns a Polars LazyFrame with these columns added:
@@ -91,25 +101,26 @@ Returns a Polars LazyFrame with these columns added:
91
101
 
92
102
  ### DuckDB Query
93
103
 
94
- The function executes a complex SQL query that:
104
+ The function executes a SQL query that:
105
+
106
+ 1. **Builds an in-memory term table** by collecting distinct terms from both NLP levels and registering them in DuckDB as `PARQUET` via `conn.register("PARQUET", df.to_arrow())`.
95
107
 
96
- 1. **Ranks matches** by:
108
+ 2. **Ranks matches** by:
97
109
  - Category priority (if `prioritize` specified)
98
110
  - NLP level (exact case match preferred over normalized)
99
- - Source confidence
111
+ - Category frequency (if `column_context=True`)
100
112
 
101
- 2. **Filters by:**
113
+ 3. **Filters by:**
102
114
  - Taxon ID (if specified)
103
115
  - Category avoidance (if specified)
104
116
 
105
- 3. **Deduplicates** to one CURIE per row per input string
117
+ 4. **Deduplicates** to one CURIE per input string
106
118
 
107
119
  ### Example Usage
108
120
 
109
121
  ```python
110
122
  from tablassert.fullmap import version4
111
123
  from tablassert.enums import Categories
112
- from pathlib import Path
113
124
  import duckdb
114
125
  import polars as pl
115
126
 
@@ -127,8 +138,10 @@ result = version4(
127
138
  taxon="9606", # Human only
128
139
  prioritize=[Categories.Gene],
129
140
  avoid=[Categories.Protein],
141
+ log=True,
130
142
  section_hash="tutorial-section",
131
143
  config_file="tutorial-table.yaml",
144
+ column_context=True,
132
145
  tag=" one"
133
146
  )
134
147
 
@@ -72,28 +72,26 @@ original == preferred_name
72
72
 
73
73
  **Performance:** O(1) string comparison
74
74
 
75
+ Before fuzzy matching, the function also applies rule-based pass-through checks for known safe patterns (for example CHEBI/PR/UniProtKB CURIE families and selected exception prefixes).
76
+
75
77
  #### Stage 2: Fuzzy Matching
76
78
 
77
79
  **Medium confidence using RapidFuzz.**
78
80
 
79
- Four fuzzy matching algorithms:
81
+ Two fuzzy matching algorithms:
80
82
  1. **Ratio:** Overall string similarity
81
- 2. **Partial ratio:** Substring matching
82
- 3. **Token sort ratio:** Order-independent word matching
83
- 4. **Partial token sort ratio:** Combined approach
83
+ 2. **Partial token sort ratio:** Combined token/subsequence matching
84
84
 
85
85
  **Threshold:** Default 20% similarity (configurable)
86
86
 
87
87
  ```python
88
88
  fuzz.ratio(original, preferred) >= 20
89
- or fuzz.ratio(original, curie) >= 20
90
89
  or fuzz.partial_token_sort_ratio(original, preferred) >= 20
91
- or fuzz.partial_token_sort_ratio(original, curie) >= 20
92
90
  ```
93
91
 
94
92
  **Example passes:**
95
93
  - Original: `"breast ca"` → Preferred: `"breast cancer"` ✓
96
- - Original: `"T53"` → CURIE: `"HGNC:11998"` (TP53) ✗ (goes to Stage 3)
94
+ - Original: `"T53"` → Preferred: `"tumor protein p53"` ✗ (goes to Stage 3)
97
95
 
98
96
  **Performance:** O(n) string operations, cached via `@DISKCACHE.memoize()`
99
97
 
@@ -128,7 +126,7 @@ return similarity >= 0.2
128
126
  - ONNX session caching
129
127
  - Disk cache for embeddings (~100MB LRU)
130
128
 
131
- Loaded once at module import, reused for all calls.
129
+ Lazy-loaded on first `BERT_audit()` call, then reused for subsequent calls.
132
130
 
133
131
  ### Disk Caching
134
132
 
@@ -142,7 +140,7 @@ def fuzz_audit(...): ...
142
140
  def BERT_audit(...): ...
143
141
  ```
144
142
 
145
- **Cache location:** `cachessert/` directory
143
+ **Cache location:** `./.cachassert` directory
146
144
 
147
145
  **Cache strategy:** LRU eviction when size exceeds limit
148
146
 
@@ -40,7 +40,7 @@ Final output files are written to the current working directory as:
40
40
  - `{name}_{version}.nodes.ndjson` - Node file (entities)
41
41
  - `{name}_{version}.edges.ndjson` - Edge file (relationships)
42
42
 
43
- Intermediate parquet artifacts are written to `storessert/` during section processing.
43
+ Intermediate parquet artifacts are written to `.storassert/` during section processing.
44
44
 
45
45
  See [Graph Configuration](configuration/graph.md) for details on the YAML schema.
46
46
 
@@ -1,11 +1,9 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.0.1"
3
+ version = "7.0.2"
4
4
  description = "Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON."
5
5
  authors = [
6
- { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" },
7
- { name = "Gwennen Glusman", email = "gglusman@isbscience.org" },
8
- { name = "Jared C. Roach" },
6
+ { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
9
7
  ]
10
8
  keywords = [
11
9
  "knowledge graph",
@@ -77,7 +75,7 @@ dev = [
77
75
 
78
76
  [tool.ruff]
79
77
  line-length = 120
80
- indent-width = 2
78
+ indent-width = 4
81
79
  target-version = "py313"
82
80
 
83
81
  [tool.ruff.format]
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ from time import sleep
3
+ from typing import Optional
4
+
5
+ import pyexcel
6
+ from playwright.sync_api import sync_playwright
7
+
8
+
9
def modernize_xls(p: Path) -> Path:
    """Convert a legacy ``.xls`` workbook at *p* into ``.xlsx`` format.

    The converted workbook is written next to the original, with only the
    suffix changed; the new path is returned. The original file is left
    untouched.
    """
    target = p.with_suffix(".xlsx")
    # pyexcel performs the actual binary .xls -> .xlsx re-save.
    pyexcel.save_book_as(file_name=str(p), dest_file_name=str(target))
    return target
13
+
14
+
15
def from_url(website: str, p: Path, timeout: int = 60_000, retries: int = 3) -> Path:
    """Download a file from *website* into *p* via a headless Chromium browser.

    Args:
        website: URL whose navigation triggers the file download.
        p: Destination path for the downloaded file (parents are created).
        timeout: Per-operation Playwright timeout in milliseconds.
        retries: Number of download attempts before giving up.

    Returns:
        The destination path *p*. If *p* already exists, it is returned
        immediately without any network activity (idempotent cache hit).

    Raises:
        RuntimeError: If every attempt fails; the last exception is included
            in the message.
    """
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.is_file():
        # Cache hit: the artifact was downloaded previously.
        return p

    last: Optional[Exception] = None
    for attempt in range(retries):
        try:
            with sync_playwright() as pw:
                browser = pw.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    # Arm the download listener BEFORE navigating: when the
                    # navigation itself is the download, the event fires
                    # during goto() and would otherwise be missed entirely.
                    with page.expect_download(timeout=timeout) as info:
                        try:
                            page.goto(website, wait_until="networkidle", timeout=timeout)
                        except Exception:
                            # A navigation that immediately becomes a file
                            # download makes goto() raise; the download event
                            # is still captured by expect_download above.
                            pass
                    download = info.value
                    download.save_as(p)
                finally:
                    # Always release the browser, even when goto/save fails,
                    # so retries don't accumulate live Chromium processes.
                    browser.close()
                return p
        except Exception as e:
            last = e
            if attempt < retries - 1:
                # Exponential backoff between attempts: 1s, 2s, 4s, ...
                sleep(2**attempt)

    raise RuntimeError(f"01 | Download Failed After {retries} Attempts: {last}")