tablassert 7.0.2__tar.gz → 7.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.0.2 → tablassert-7.1.0}/CHANGELOG.md +6 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/PKG-INFO +2 -1
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/fullmap.md +2 -1
- {tablassert-7.0.2 → tablassert-7.1.0}/pyproject.toml +2 -1
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/downloader.py +9 -2
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/enums.py +2 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/fullmap.py +17 -4
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/ingests.py +9 -2
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/lib.py +17 -6
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/log.py +2 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/models.py +2 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/qc.py +17 -6
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/utils.py +11 -3
- {tablassert-7.0.2 → tablassert-7.1.0}/uv.lock +15 -1
- {tablassert-7.0.2 → tablassert-7.1.0}/.github/workflows/docs.yml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/.gitignore +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/.pre-commit-config.yaml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/.python-version +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/.vscode/settings.json +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/LICENSE +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/README.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/qc.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/utils.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/cli.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/advanced-example.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/graph.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/table.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/index.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/installation.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/docs/tutorial.md +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/mkdocs.yml +0 -0
- {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/__init__.py +0 -0
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## Unreleased
|
|
6
|
+
|
|
7
|
+
### Changes
|
|
8
|
+
- Updated `fullmap` ranking to prioritize case-insensitive exact matches between normalized terms and preferred names.
|
|
9
|
+
- Updated `fullmap` term de-duplication to keep first occurrences, improving deterministic output ordering.
|
|
10
|
+
|
|
5
11
|
## 7.0.2 - 2026-03-23
|
|
6
12
|
|
|
7
13
|
### Changes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.0.2
|
|
3
|
+
Version: 7.1.0
|
|
4
4
|
Summary: Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
6
|
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
@@ -14,6 +14,7 @@ Requires-Python: >=3.13
|
|
|
14
14
|
Requires-Dist: diskcache>=5.6.3
|
|
15
15
|
Requires-Dist: duckdb>=1.5.0
|
|
16
16
|
Requires-Dist: fastexcel>=0.19.0
|
|
17
|
+
Requires-Dist: lazy-loader>=0.5
|
|
17
18
|
Requires-Dist: loguru>=0.7.3
|
|
18
19
|
Requires-Dist: mkdocs>=1.6.1
|
|
19
20
|
Requires-Dist: onnxruntime>=1.24.3
|
|
@@ -103,10 +103,11 @@ Returns a Polars LazyFrame with these columns added:
|
|
|
103
103
|
|
|
104
104
|
The function executes a SQL query that:
|
|
105
105
|
|
|
106
|
-
1. **Builds an in-memory term table** by collecting
|
|
106
|
+
1. **Builds an in-memory term table** by collecting terms from both NLP levels, deduplicating by keeping first occurrences for deterministic ordering, and registering them in DuckDB as `PARQUET` via `conn.register("PARQUET", df.to_arrow())`.
|
|
107
107
|
|
|
108
108
|
2. **Ranks matches** by:
|
|
109
109
|
- Category priority (if `prioritize` specified)
|
|
110
|
+
- Preferred-name exactness (case-insensitive exact match of normalized term to preferred name)
|
|
110
111
|
- NLP level (exact case match preferred over normalized)
|
|
111
112
|
- Category frequency (if `column_context=True`)
|
|
112
113
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.0.2"
|
|
3
|
+
version = "7.1.0"
|
|
4
4
|
description = "Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
@@ -25,6 +25,7 @@ dependencies = [
|
|
|
25
25
|
"diskcache>=5.6.3",
|
|
26
26
|
"duckdb>=1.5.0",
|
|
27
27
|
"fastexcel>=0.19.0",
|
|
28
|
+
"lazy-loader>=0.5",
|
|
28
29
|
"loguru>=0.7.3",
|
|
29
30
|
"mkdocs>=1.6.1",
|
|
30
31
|
"onnxruntime>=1.24.3",
|
|
@@ -1,10 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from pathlib import Path
|
|
2
4
|
from time import sleep
|
|
3
|
-
from typing import Optional
|
|
5
|
+
from typing import TYPE_CHECKING, Optional
|
|
4
6
|
|
|
5
|
-
import pyexcel
|
|
7
|
+
import lazy_loader as Lazy
|
|
6
8
|
from playwright.sync_api import sync_playwright
|
|
7
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import pyexcel
|
|
12
|
+
else:
|
|
13
|
+
pyexcel = Lazy.load("pyexcel")
|
|
14
|
+
|
|
8
15
|
|
|
9
16
|
def modernize_xls(p: Path) -> Path:
|
|
10
17
|
xlsx: Path = p.with_suffix(".xlsx")
|
|
@@ -1,11 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from operator import add
|
|
2
|
-
from typing import Optional
|
|
4
|
+
from typing import TYPE_CHECKING, Optional
|
|
3
5
|
|
|
4
|
-
import polars as pl
|
|
6
|
+
import lazy_loader as Lazy
|
|
5
7
|
|
|
6
8
|
from tablassert.enums import Categories
|
|
7
9
|
from tablassert.log import logger
|
|
8
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import polars as pl
|
|
13
|
+
else:
|
|
14
|
+
pl = Lazy.load("polars")
|
|
15
|
+
|
|
9
16
|
|
|
10
17
|
def distinct(lf: pl.LazyFrame, l0: str, l1: str) -> pl.LazyFrame:
|
|
11
18
|
# ? Extract Unique Terms From Two Text Normalization Columns As LazyFrame
|
|
@@ -15,7 +22,7 @@ def distinct(lf: pl.LazyFrame, l0: str, l1: str) -> pl.LazyFrame:
|
|
|
15
22
|
t1: pl.LazyFrame = lf.select(pl.col(l1).alias("term")).unique()
|
|
16
23
|
t1 = t1.with_columns(pl.lit(1).alias("nlp level"))
|
|
17
24
|
|
|
18
|
-
terms: pl.LazyFrame = pl.concat([t0, t1]).unique(subset=["term"])
|
|
25
|
+
terms: pl.LazyFrame = pl.concat([t0, t1]).unique(subset=["term"], keep="first")
|
|
19
26
|
|
|
20
27
|
bad: str = r"^\d+$|^(none|nan|na|null|unknown)$|^$"
|
|
21
28
|
return terms.filter(~pl.col("term").str.contains(bad))
|
|
@@ -38,6 +45,9 @@ def query_builder(
|
|
|
38
45
|
CASE
|
|
39
46
|
{priority_case}
|
|
40
47
|
ELSE 50
|
|
48
|
+
END * CASE
|
|
49
|
+
WHEN LOWER(CU.PREFERRED_NAME) = PA.term THEN 1
|
|
50
|
+
ELSE 10
|
|
41
51
|
END AS PR
|
|
42
52
|
FROM SYNONYMS SY
|
|
43
53
|
JOIN SOURCES SO ON SY.SOURCE_ID = SO.SOURCE_ID
|
|
@@ -76,13 +86,16 @@ def query_distinct(
|
|
|
76
86
|
results: pl.DataFrame = conn.execute(query).pl() # pyright: ignore
|
|
77
87
|
|
|
78
88
|
sort_by: list[str] = ["term", "PR", "NLP_LEVEL"]
|
|
89
|
+
descending: list[bool] = [False, False, False]
|
|
79
90
|
|
|
80
91
|
if column_context:
|
|
81
92
|
frequency: pl.DataFrame = results.group_by("CATEGORY_NAME").agg(pl.len().alias("FREQUENCY"))
|
|
82
93
|
results = results.join(frequency, on="CATEGORY_NAME", how="left")
|
|
94
|
+
|
|
83
95
|
sort_by += ["FREQUENCY"]
|
|
96
|
+
descending += [True]
|
|
84
97
|
|
|
85
|
-
results = results.sort(sort_by, descending=
|
|
98
|
+
results = results.sort(sort_by, descending=descending)
|
|
86
99
|
results = results.unique(subset=["term"], keep="first")
|
|
87
100
|
|
|
88
101
|
return results
|
|
@@ -1,10 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from copy import deepcopy
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Any, Union
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Union
|
|
4
6
|
|
|
5
|
-
import yaml
|
|
7
|
+
import lazy_loader as Lazy
|
|
6
8
|
from yaml import CLoader
|
|
7
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import yaml
|
|
12
|
+
else:
|
|
13
|
+
yaml = Lazy.load("yaml")
|
|
14
|
+
|
|
8
15
|
|
|
9
16
|
def fastmerge(a: Union[list[Any], dict[str, Any]], b: Union[list[Any], dict[str, Any]]) -> Any:
|
|
10
17
|
# ? Streamlined (Fast) Implementation Of Deepmerge Config
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import math
|
|
2
4
|
import operator
|
|
3
5
|
from functools import reduce
|
|
@@ -6,13 +8,9 @@ from multiprocessing import Pool
|
|
|
6
8
|
from operator import add, eq, le
|
|
7
9
|
from os.path import basename
|
|
8
10
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Callable, Literal, Optional, Self, Union
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Self, Union
|
|
10
12
|
|
|
11
|
-
import duckdb
|
|
12
|
-
import orjson
|
|
13
|
-
import polars as pl
|
|
14
|
-
import typer
|
|
15
|
-
import xxhash
|
|
13
|
+
import lazy_loader as Lazy
|
|
16
14
|
from pydantic import Field, NonNegativeInt, PositiveInt
|
|
17
15
|
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn, TimeElapsedColumn
|
|
18
16
|
from sqlite_utils import Database
|
|
@@ -26,6 +24,19 @@ from tablassert.models import Encoding, Graph, NodeEncoding, Section
|
|
|
26
24
|
from tablassert.qc import fullmap_audit
|
|
27
25
|
from tablassert.utils import STORE, mkhash, namespace_uuid
|
|
28
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
import duckdb
|
|
29
|
+
import orjson
|
|
30
|
+
import polars as pl
|
|
31
|
+
import typer
|
|
32
|
+
import xxhash
|
|
33
|
+
else:
|
|
34
|
+
duckdb = Lazy.load("duckdb")
|
|
35
|
+
orjson = Lazy.load("orjson")
|
|
36
|
+
pl = Lazy.load("polars")
|
|
37
|
+
typer = Lazy.load("typer")
|
|
38
|
+
xxhash = Lazy.load("xxhash")
|
|
39
|
+
|
|
29
40
|
# ? Newline To Make Progress Bar More Readable
|
|
30
41
|
print("\n")
|
|
31
42
|
|
|
@@ -1,13 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from operator import add, eq, ge
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Literal, Optional
|
|
5
|
+
from typing import TYPE_CHECKING, Literal, Optional
|
|
4
6
|
|
|
5
|
-
import onnxruntime as ort
|
|
6
|
-
import polars as pl
|
|
7
|
+
import lazy_loader as Lazy
|
|
7
8
|
from rapidfuzz import fuzz
|
|
8
|
-
from sentence_transformers import SentenceTransformer
|
|
9
9
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
10
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import onnxruntime as ort
|
|
13
|
+
import polars as pl
|
|
14
|
+
import sentence_transformers
|
|
15
|
+
else:
|
|
16
|
+
ort = Lazy.load("onnxruntime")
|
|
17
|
+
sentence_transformers = Lazy.load("sentence_transformers")
|
|
18
|
+
pl = Lazy.load("polars")
|
|
19
|
+
|
|
11
20
|
from tablassert.log import logger
|
|
12
21
|
from tablassert.utils import DISKCACHE
|
|
13
22
|
|
|
@@ -28,9 +37,11 @@ def get_biobert() -> object:
|
|
|
28
37
|
if BIOBERT:
|
|
29
38
|
return BIOBERT
|
|
30
39
|
elif not BIOBERT and MODEL.exists():
|
|
31
|
-
BIOBERT = SentenceTransformer(str(MODEL), backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS)  # pyright: ignore
|
|
40
|
+
BIOBERT = sentence_transformers.SentenceTransformer(
|
|
41
|
+
str(MODEL), backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS
|
|
42
|
+
) # pyright: ignore
|
|
32
43
|
else:
|
|
33
|
-
BIOBERT = SentenceTransformer(
|
|
44
|
+
BIOBERT = sentence_transformers.SentenceTransformer(
|
|
34
45
|
"pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb", backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS
|
|
35
46
|
) # pyright: ignore
|
|
36
47
|
MODEL.mkdir(parents=True, exist_ok=True)
|
|
@@ -1,12 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from functools import cache
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
from uuid import UUID, uuid3
|
|
5
7
|
|
|
6
|
-
import polars as pl
|
|
7
|
-
import xxhash
|
|
8
|
+
import lazy_loader as Lazy
|
|
8
9
|
from diskcache import Cache
|
|
9
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import polars as pl
|
|
13
|
+
import xxhash
|
|
14
|
+
else:
|
|
15
|
+
pl = Lazy.load("polars")
|
|
16
|
+
xxhash = Lazy.load("xxhash")
|
|
17
|
+
|
|
10
18
|
STORE: Path = Path("./.storassert")
|
|
11
19
|
STORE.mkdir(parents=True, exist_ok=True)
|
|
12
20
|
|
|
@@ -363,6 +363,18 @@ wheels = [
|
|
|
363
363
|
{ url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
|
|
364
364
|
]
|
|
365
365
|
|
|
366
|
+
[[package]]
|
|
367
|
+
name = "lazy-loader"
|
|
368
|
+
version = "0.5"
|
|
369
|
+
source = { registry = "https://pypi.org/simple" }
|
|
370
|
+
dependencies = [
|
|
371
|
+
{ name = "packaging" },
|
|
372
|
+
]
|
|
373
|
+
sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" }
|
|
374
|
+
wheels = [
|
|
375
|
+
{ url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" },
|
|
376
|
+
]
|
|
377
|
+
|
|
366
378
|
[[package]]
|
|
367
379
|
name = "lml"
|
|
368
380
|
version = "0.2.0"
|
|
@@ -1649,12 +1661,13 @@ wheels = [
|
|
|
1649
1661
|
|
|
1650
1662
|
[[package]]
|
|
1651
1663
|
name = "tablassert"
|
|
1652
|
-
version = "7.0.2"
|
|
1664
|
+
version = "7.1.0"
|
|
1653
1665
|
source = { editable = "." }
|
|
1654
1666
|
dependencies = [
|
|
1655
1667
|
{ name = "diskcache" },
|
|
1656
1668
|
{ name = "duckdb" },
|
|
1657
1669
|
{ name = "fastexcel" },
|
|
1670
|
+
{ name = "lazy-loader" },
|
|
1658
1671
|
{ name = "loguru" },
|
|
1659
1672
|
{ name = "mkdocs" },
|
|
1660
1673
|
{ name = "onnxruntime" },
|
|
@@ -1693,6 +1706,7 @@ requires-dist = [
|
|
|
1693
1706
|
{ name = "diskcache", specifier = ">=5.6.3" },
|
|
1694
1707
|
{ name = "duckdb", specifier = ">=1.5.0" },
|
|
1695
1708
|
{ name = "fastexcel", specifier = ">=0.19.0" },
|
|
1709
|
+
{ name = "lazy-loader", specifier = ">=0.5" },
|
|
1696
1710
|
{ name = "loguru", specifier = ">=0.7.3" },
|
|
1697
1711
|
{ name = "mkdocs", specifier = ">=1.6.1" },
|
|
1698
1712
|
{ name = "onnxruntime", specifier = ">=1.24.3" },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|