tablassert 7.0.2__tar.gz → 7.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {tablassert-7.0.2 → tablassert-7.1.0}/CHANGELOG.md +6 -0
  2. {tablassert-7.0.2 → tablassert-7.1.0}/PKG-INFO +2 -1
  3. {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/fullmap.md +2 -1
  4. {tablassert-7.0.2 → tablassert-7.1.0}/pyproject.toml +2 -1
  5. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/downloader.py +9 -2
  6. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/enums.py +2 -0
  7. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/fullmap.py +17 -4
  8. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/ingests.py +9 -2
  9. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/lib.py +17 -6
  10. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/log.py +2 -0
  11. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/models.py +2 -0
  12. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/qc.py +17 -6
  13. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/utils.py +11 -3
  14. {tablassert-7.0.2 → tablassert-7.1.0}/uv.lock +15 -1
  15. {tablassert-7.0.2 → tablassert-7.1.0}/.github/workflows/docs.yml +0 -0
  16. {tablassert-7.0.2 → tablassert-7.1.0}/.github/workflows/pipy.yml +0 -0
  17. {tablassert-7.0.2 → tablassert-7.1.0}/.gitignore +0 -0
  18. {tablassert-7.0.2 → tablassert-7.1.0}/.pre-commit-config.yaml +0 -0
  19. {tablassert-7.0.2 → tablassert-7.1.0}/.python-version +0 -0
  20. {tablassert-7.0.2 → tablassert-7.1.0}/.vscode/settings.json +0 -0
  21. {tablassert-7.0.2 → tablassert-7.1.0}/LICENSE +0 -0
  22. {tablassert-7.0.2 → tablassert-7.1.0}/README.md +0 -0
  23. {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/qc.md +0 -0
  24. {tablassert-7.0.2 → tablassert-7.1.0}/docs/api/utils.md +0 -0
  25. {tablassert-7.0.2 → tablassert-7.1.0}/docs/cli.md +0 -0
  26. {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/advanced-example.md +0 -0
  27. {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/graph.md +0 -0
  28. {tablassert-7.0.2 → tablassert-7.1.0}/docs/configuration/table.md +0 -0
  29. {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-data.csv +0 -0
  30. {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-graph.yaml +0 -0
  31. {tablassert-7.0.2 → tablassert-7.1.0}/docs/examples/tutorial-table.yaml +0 -0
  32. {tablassert-7.0.2 → tablassert-7.1.0}/docs/index.md +0 -0
  33. {tablassert-7.0.2 → tablassert-7.1.0}/docs/installation.md +0 -0
  34. {tablassert-7.0.2 → tablassert-7.1.0}/docs/tutorial.md +0 -0
  35. {tablassert-7.0.2 → tablassert-7.1.0}/mkdocs.yml +0 -0
  36. {tablassert-7.0.2 → tablassert-7.1.0}/src/tablassert/__init__.py +0 -0
@@ -2,6 +2,12 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## Unreleased
6
+
7
+ ### Changes
8
+ - Updated `fullmap` ranking to prioritize case-insensitive exact matches between normalized terms and preferred names.
9
+ - Updated `fullmap` term de-duplication to keep first occurrences, improving deterministic output ordering.
10
+
5
11
  ## 7.0.2 - 2026-03-23
6
12
 
7
13
  ### Changes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.0.2
3
+ Version: 7.1.0
4
4
  Summary: Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
@@ -14,6 +14,7 @@ Requires-Python: >=3.13
14
14
  Requires-Dist: diskcache>=5.6.3
15
15
  Requires-Dist: duckdb>=1.5.0
16
16
  Requires-Dist: fastexcel>=0.19.0
17
+ Requires-Dist: lazy-loader>=0.5
17
18
  Requires-Dist: loguru>=0.7.3
18
19
  Requires-Dist: mkdocs>=1.6.1
19
20
  Requires-Dist: onnxruntime>=1.24.3
@@ -103,10 +103,11 @@ Returns a Polars LazyFrame with these columns added:
103
103
 
104
104
  The function executes a SQL query that:
105
105
 
106
- 1. **Builds an in-memory term table** by collecting distinct terms from both NLP levels and registering them in DuckDB as `PARQUET` via `conn.register("PARQUET", df.to_arrow())`.
106
+ 1. **Builds an in-memory term table** by collecting terms from both NLP levels, deduplicating by keeping first occurrences for deterministic ordering, and registering them in DuckDB as `PARQUET` via `conn.register("PARQUET", df.to_arrow())`.
107
107
 
108
108
  2. **Ranks matches** by:
109
109
  - Category priority (if `prioritize` specified)
110
+ - Preferred-name exactness (case-insensitive exact match of normalized term to preferred name)
110
111
  - NLP level (exact case match preferred over normalized)
111
112
  - Category frequency (if `column_context=True`)
112
113
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.0.2"
3
+ version = "7.1.0"
4
4
  description = "Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON."
5
5
  authors = [
6
6
  { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -25,6 +25,7 @@ dependencies = [
25
25
  "diskcache>=5.6.3",
26
26
  "duckdb>=1.5.0",
27
27
  "fastexcel>=0.19.0",
28
+ "lazy-loader>=0.5",
28
29
  "loguru>=0.7.3",
29
30
  "mkdocs>=1.6.1",
30
31
  "onnxruntime>=1.24.3",
@@ -1,10 +1,17 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from time import sleep
3
- from typing import Optional
5
+ from typing import TYPE_CHECKING, Optional
4
6
 
5
- import pyexcel
7
+ import lazy_loader as Lazy
6
8
  from playwright.sync_api import sync_playwright
7
9
 
10
+ if TYPE_CHECKING:
11
+ import pyexcel
12
+ else:
13
+ pyexcel = Lazy.load("pyexcel")
14
+
8
15
 
9
16
  def modernize_xls(p: Path) -> Path:
10
17
  xlsx: Path = p.with_suffix(".xlsx")
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from enum import Enum
2
4
 
3
5
 
@@ -1,11 +1,18 @@
1
+ from __future__ import annotations
2
+
1
3
  from operator import add
2
- from typing import Optional
4
+ from typing import TYPE_CHECKING, Optional
3
5
 
4
- import polars as pl
6
+ import lazy_loader as Lazy
5
7
 
6
8
  from tablassert.enums import Categories
7
9
  from tablassert.log import logger
8
10
 
11
+ if TYPE_CHECKING:
12
+ import polars as pl
13
+ else:
14
+ pl = Lazy.load("polars")
15
+
9
16
 
10
17
  def distinct(lf: pl.LazyFrame, l0: str, l1: str) -> pl.LazyFrame:
11
18
  # ? Extract Unique Terms From Two Text Normalization Columns As LazyFrame
@@ -15,7 +22,7 @@ def distinct(lf: pl.LazyFrame, l0: str, l1: str) -> pl.LazyFrame:
15
22
  t1: pl.LazyFrame = lf.select(pl.col(l1).alias("term")).unique()
16
23
  t1 = t1.with_columns(pl.lit(1).alias("nlp level"))
17
24
 
18
- terms: pl.LazyFrame = pl.concat([t0, t1]).unique(subset=["term"])
25
+ terms: pl.LazyFrame = pl.concat([t0, t1]).unique(subset=["term"], keep="first")
19
26
 
20
27
  bad: str = r"^\d+$|^(none|nan|na|null|unknown)$|^$"
21
28
  return terms.filter(~pl.col("term").str.contains(bad))
@@ -38,6 +45,9 @@ def query_builder(
38
45
  CASE
39
46
  {priority_case}
40
47
  ELSE 50
48
+ END * CASE
49
+ WHEN LOWER(CU.PREFERRED_NAME) = PA.term THEN 1
50
+ ELSE 10
41
51
  END AS PR
42
52
  FROM SYNONYMS SY
43
53
  JOIN SOURCES SO ON SY.SOURCE_ID = SO.SOURCE_ID
@@ -76,13 +86,16 @@ def query_distinct(
76
86
  results: pl.DataFrame = conn.execute(query).pl() # pyright: ignore
77
87
 
78
88
  sort_by: list[str] = ["term", "PR", "NLP_LEVEL"]
89
+ descending: list[bool] = [False, False, False]
79
90
 
80
91
  if column_context:
81
92
  frequency: pl.DataFrame = results.group_by("CATEGORY_NAME").agg(pl.len().alias("FREQUENCY"))
82
93
  results = results.join(frequency, on="CATEGORY_NAME", how="left")
94
+
83
95
  sort_by += ["FREQUENCY"]
96
+ descending += [True]
84
97
 
85
- results = results.sort(sort_by, descending=[False, False, False, True])
98
+ results = results.sort(sort_by, descending=descending)
86
99
  results = results.unique(subset=["term"], keep="first")
87
100
 
88
101
  return results
@@ -1,10 +1,17 @@
1
+ from __future__ import annotations
2
+
1
3
  from copy import deepcopy
2
4
  from pathlib import Path
3
- from typing import Any, Union
5
+ from typing import TYPE_CHECKING, Any, Union
4
6
 
5
- import yaml
7
+ import lazy_loader as Lazy
6
8
  from yaml import CLoader
7
9
 
10
+ if TYPE_CHECKING:
11
+ import yaml
12
+ else:
13
+ yaml = Lazy.load("yaml")
14
+
8
15
 
9
16
  def fastmerge(a: Union[list[Any], dict[str, Any]], b: Union[list[Any], dict[str, Any]]) -> Any:
10
17
  # ? Streamlined (Fast) Implementation Of Deepmerge Config
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import math
2
4
  import operator
3
5
  from functools import reduce
@@ -6,13 +8,9 @@ from multiprocessing import Pool
6
8
  from operator import add, eq, le
7
9
  from os.path import basename
8
10
  from pathlib import Path
9
- from typing import Any, Callable, Literal, Optional, Self, Union
11
+ from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Self, Union
10
12
 
11
- import duckdb
12
- import orjson
13
- import polars as pl
14
- import typer
15
- import xxhash
13
+ import lazy_loader as Lazy
16
14
  from pydantic import Field, NonNegativeInt, PositiveInt
17
15
  from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn, TimeElapsedColumn
18
16
  from sqlite_utils import Database
@@ -26,6 +24,19 @@ from tablassert.models import Encoding, Graph, NodeEncoding, Section
26
24
  from tablassert.qc import fullmap_audit
27
25
  from tablassert.utils import STORE, mkhash, namespace_uuid
28
26
 
27
+ if TYPE_CHECKING:
28
+ import duckdb
29
+ import orjson
30
+ import polars as pl
31
+ import typer
32
+ import xxhash
33
+ else:
34
+ duckdb = Lazy.load("duckdb")
35
+ orjson = Lazy.load("orjson")
36
+ pl = Lazy.load("polars")
37
+ typer = Lazy.load("typer")
38
+ xxhash = Lazy.load("xxhash")
39
+
29
40
  # ? Newline To Make Progress Bar More Readable
30
41
  print("\n")
31
42
 
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
 
3
5
  from loguru import logger
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Literal, Optional, Union
3
5
 
@@ -1,13 +1,22 @@
1
+ from __future__ import annotations
2
+
1
3
  from operator import add, eq, ge
2
4
  from pathlib import Path
3
- from typing import Literal, Optional
5
+ from typing import TYPE_CHECKING, Literal, Optional
4
6
 
5
- import onnxruntime as ort
6
- import polars as pl
7
+ import lazy_loader as Lazy
7
8
  from rapidfuzz import fuzz
8
- from sentence_transformers import SentenceTransformer
9
9
  from sklearn.metrics.pairwise import cosine_similarity
10
10
 
11
+ if TYPE_CHECKING:
12
+ import onnxruntime as ort
13
+ import polars as pl
14
+ import sentence_transformers
15
+ else:
16
+ ort = Lazy.load("onnxruntime")
17
+ sentence_transformers = Lazy.load("sentence_transformers")
18
+ pl = Lazy.load("polars")
19
+
11
20
  from tablassert.log import logger
12
21
  from tablassert.utils import DISKCACHE
13
22
 
@@ -28,9 +37,11 @@ def get_biobert() -> object:
28
37
  if BIOBERT:
29
38
  return BIOBERT
30
39
  elif not BIOBERT and MODEL.exists():
31
- BIOBERT = SentenceTransformer(str(MODEL), backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS) # pyright: ignore
40
+ BIOBERT = sentence_transformers.SentenceTransformer(
41
+ str(MODEL), backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS
42
+ ) # pyright: ignore
32
43
  else:
33
- BIOBERT = SentenceTransformer(
44
+ BIOBERT = sentence_transformers.SentenceTransformer(
34
45
  "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb", backend=MODEL_BACKEND, model_kwargs=MODEL_KWARGS
35
46
  ) # pyright: ignore
36
47
  MODEL.mkdir(parents=True, exist_ok=True)
@@ -1,12 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  from functools import cache
2
4
  from pathlib import Path
3
- from typing import Any
5
+ from typing import TYPE_CHECKING, Any
4
6
  from uuid import UUID, uuid3
5
7
 
6
- import polars as pl
7
- import xxhash
8
+ import lazy_loader as Lazy
8
9
  from diskcache import Cache
9
10
 
11
+ if TYPE_CHECKING:
12
+ import polars as pl
13
+ import xxhash
14
+ else:
15
+ pl = Lazy.load("polars")
16
+ xxhash = Lazy.load("xxhash")
17
+
10
18
  STORE: Path = Path("./.storassert")
11
19
  STORE.mkdir(parents=True, exist_ok=True)
12
20
 
@@ -363,6 +363,18 @@ wheels = [
363
363
  { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
364
364
  ]
365
365
 
366
+ [[package]]
367
+ name = "lazy-loader"
368
+ version = "0.5"
369
+ source = { registry = "https://pypi.org/simple" }
370
+ dependencies = [
371
+ { name = "packaging" },
372
+ ]
373
+ sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" }
374
+ wheels = [
375
+ { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" },
376
+ ]
377
+
366
378
  [[package]]
367
379
  name = "lml"
368
380
  version = "0.2.0"
@@ -1649,12 +1661,13 @@ wheels = [
1649
1661
 
1650
1662
  [[package]]
1651
1663
  name = "tablassert"
1652
- version = "7.0.2"
1664
+ version = "7.1.0"
1653
1665
  source = { editable = "." }
1654
1666
  dependencies = [
1655
1667
  { name = "diskcache" },
1656
1668
  { name = "duckdb" },
1657
1669
  { name = "fastexcel" },
1670
+ { name = "lazy-loader" },
1658
1671
  { name = "loguru" },
1659
1672
  { name = "mkdocs" },
1660
1673
  { name = "onnxruntime" },
@@ -1693,6 +1706,7 @@ requires-dist = [
1693
1706
  { name = "diskcache", specifier = ">=5.6.3" },
1694
1707
  { name = "duckdb", specifier = ">=1.5.0" },
1695
1708
  { name = "fastexcel", specifier = ">=0.19.0" },
1709
+ { name = "lazy-loader", specifier = ">=0.5" },
1696
1710
  { name = "loguru", specifier = ">=0.7.3" },
1697
1711
  { name = "mkdocs", specifier = ">=1.6.1" },
1698
1712
  { name = "onnxruntime", specifier = ">=1.24.3" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes