tablassert 7.3.1.tar.gz → 7.3.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.3.1 → tablassert-7.3.3}/AGENTS.md +1 -1
- {tablassert-7.3.1 → tablassert-7.3.3}/CHANGELOG.md +13 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/CONTRIBUTING.md +3 -6
- {tablassert-7.3.1 → tablassert-7.3.3}/PKG-INFO +3 -3
- {tablassert-7.3.1 → tablassert-7.3.3}/README.md +1 -1
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/api/fullmap.md +5 -5
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/api/lib.md +4 -4
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/configuration/graph.md +4 -1
- tablassert-7.3.3/docs/datassert.md +106 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/docker.md +2 -2
- {tablassert-7.3.1 → tablassert-7.3.3}/llms.txt +1 -1
- {tablassert-7.3.1 → tablassert-7.3.3}/pyproject.toml +2 -2
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/fullmap.py +20 -20
- {tablassert-7.3.1 → tablassert-7.3.3}/uv.lock +2 -2
- tablassert-7.3.1/docs/datassert.md +0 -66
- {tablassert-7.3.1 → tablassert-7.3.3}/.github/workflows/autotag.yml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/.github/workflows/docker.yml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/.github/workflows/docs.yml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/.gitignore +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/.pre-commit-config.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/CITATION.cff +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/Dockerfile +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/LICENSE +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/api/qc.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/api/utils.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/cli.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/configuration/advanced-example.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/configuration/table.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/examples.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/index.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/installation.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/docs/tutorial.md +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/mkdocs.yml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/__init__.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/cli.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/downloader.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/enums.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/ingests.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/lib.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/log.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/models.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/nlp.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/qc.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/src/tablassert/utils.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/__init__.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/conftest.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/fixtures/minimal_section.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_enums.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_fullmap.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_ingests.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_lib.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_models.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_nlp.py +0 -0
- {tablassert-7.3.1 → tablassert-7.3.3}/tests/test_utils.py +0 -0

**AGENTS.md**

````diff
@@ -35,7 +35,7 @@ src/tablassert/
   lib.py      # Core logic: encodings, data loading, Tcode(Section) class
   models.py   # Pydantic v2 models (TablaBase base class)
   enums.py    # str, Enum subclasses (Tokens, Repositories, Comparisons, etc.)
-  fullmap.py  # NER / entity resolution (DuckDB, 16 shards)
+  fullmap.py  # NER / entity resolution (DuckDB, 12 shards)
   qc.py       # Quality control (ONNX/BioBERT, sentence_transformers)
   nlp.py      # Text normalization (level_one: strip+lowercase, level_two: regex)
   ingests.py  # YAML ingestion: from_yaml(), to_sections(), fastmerge()
````

**CHANGELOG.md**

````diff
@@ -2,6 +2,19 @@
 
 All notable changes to this project are documented in this file.
 
+## 7.3.3 - 2026-04-08
+
+### Bug Fixes
+
+- Changed datassert shard count from 16 to 12 (`SHARDS` constant in `fullmap.py`) to correspond to the updated datassert database layout.
+
+### Documentation
+
+- Updated all shard count references across documentation and examples to reflect the new 12-shard datassert layout.
+
+## 7.3.2 - 2026-04-03
+
+### Maintenance
+
+- Updated dependencies. No API changes.
+
 ## 7.3.1 - 2026-04-03
 
 ### Changes
````

**CONTRIBUTING.md**

````diff
@@ -20,15 +20,12 @@ cd Tablassert
 uv sync
 ```
 
-### Optional
+### Optional Extras
 
-
+All ML, web, and Excel dependencies are included in the core install. The only optional extra is a runtime-compatible Polars build for CPUs without required instructions:
 
 ```bash
-uv sync --extra
-uv sync --extra web # playwright
-uv sync --extra pyexcel # pyexcel
-uv sync --extra full # all optional deps
+uv sync --extra rtcompat # polars[rtcompat]
 ```
 
 ## Development Workflow
````

**PKG-INFO**

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tablassert
-Version: 7.3.1
+Version: 7.3.3
 Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
 Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
 Project-URL: Source, https://github.com/SkyeAv/Tablassert
@@ -42,7 +42,7 @@ Requires-Dist: rapidfuzz>=3.14.3
 Requires-Dist: scikit-learn>=1.8.0
 Requires-Dist: sentence-transformers>=5.3.0
 Requires-Dist: sqlite-utils>=3.39
-Requires-Dist: typer>=0.
+Requires-Dist: typer>=0.21.2
 Requires-Dist: xxhash>=3.6.0
 Provides-Extra: rt
 Requires-Dist: polars[rtcompat]>=1.39.0; extra == 'rt'
@@ -99,7 +99,7 @@ docker run --rm \
 # Build a knowledge graph from a YAML configuration
 $ tablassert build-knowledge-graph graph-config.yaml
 ⠋ Loading table configurations...
-⠋ Resolving entities across 16 DuckDB shards...
+⠋ Resolving entities across 12 DuckDB shards...
 ⠋ Compiling subgraphs...
 ⠋ Deduplicating nodes and edges...
 ✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
````

**README.md**

````diff
@@ -47,7 +47,7 @@ docker run --rm \
 # Build a knowledge graph from a YAML configuration
 $ tablassert build-knowledge-graph graph-config.yaml
 ⠋ Loading table configurations...
-⠋ Resolving entities across 16 DuckDB shards...
+⠋ Resolving entities across 12 DuckDB shards...
 ⠋ Compiling subgraphs...
 ⠋ Deduplicating nodes and edges...
 ✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
````

**docs/api/fullmap.md**

````diff
@@ -36,7 +36,7 @@ Column name containing text strings to resolve.
 
 **`conns: list[object]`**
 
-List of 16 DuckDB shard connections to the datassert database.
+List of 12 DuckDB shard connections to the datassert database.
 
 Each shard contains:
 - Synonym mappings (text → CURIE)
@@ -125,11 +125,11 @@ from tablassert.enums import Categories
 import duckdb
 import polars as pl
 
-# Open all 16 shard connections
+# Open all 12 shard connections
 datassert_dir = "/path/to/datassert"
 conns = [
     duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
-    for i in range(16)
+    for i in range(12)
 ]
 
 # LazyFrame with data to resolve
@@ -167,11 +167,11 @@ from tablassert.fullmap import resolve
 from tablassert.nlp import level_one, level_two
 from tablassert.enums import Categories
 
-# Open all 16 shard connections
+# Open all 12 shard connections
 datassert_dir = "/path/to/datassert"
 conns = [
     duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
-    for i in range(16)
+    for i in range(12)
 ]
 
 # Map a list of gene symbols to CURIEs
````
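The fullmap.md examples above import `level_one` and `level_two` from `tablassert.nlp`. Based only on how the docs describe them (strip + lowercase, then non-word cleanup via `\W+`), a standalone sketch of the two normalization levels might look like this; the exact replacement behavior of `\W+` (dropped versus collapsed to a space) is an assumption, and the `_sketch` names are illustrative, not the real API:

```python
import re

def level_one_sketch(s: str) -> str:
    # Level one per the docs: whitespace stripping + lowercasing.
    return s.strip().lower()

def level_two_sketch(s: str) -> str:
    # Level two per the docs: non-word character cleanup via \W+.
    # Collapsing matches to a single space is an assumption here.
    return re.sub(r"\W+", " ", level_one_sketch(s)).strip()

print(level_one_sketch("  Alpha-Linolenic Acid "))  # 'alpha-linolenic acid'
print(level_two_sketch("  Alpha-Linolenic Acid "))  # 'alpha linolenic acid'
```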

**docs/api/lib.md**

````diff
@@ -2,7 +2,7 @@
 
 The `lib` module exposes `resolve_many()`, a high-level convenience function for resolving an iterable of entity strings to CURIEs without requiring manual LazyFrame construction, NLP preprocessing, or DuckDB shard management.
 
-It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 16 DuckDB shard connections, executing entity resolution, and returning results as a plain Python dictionary.
+It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 12 DuckDB shard connections, executing entity resolution, and returning results as a plain Python dictionary.
 
 ## resolve_many()
 
@@ -38,7 +38,7 @@ Examples: `["TP53", "BRCA1", "EGFR"]`, `("aspirin", "ibuprofen")`, or a generato
 
 **`datassert: Path`**
 
-Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 16 DuckDB shard files (`0.duckdb` through `15.duckdb`).
+Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 12 DuckDB shard files (`0.duckdb` through `11.duckdb`).
 
 Each shard contains:
 - Synonym mappings (text → CURIE)
@@ -98,7 +98,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
 
 2. **NLP normalization** — Applies `level_one()` (whitespace stripping + lowercasing) and `level_two()` (non-word character removal via `\W+`) to produce the two normalized columns required by `resolve()`.
 
-3. **DuckDB connection management** — Opens all 16 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
+3. **DuckDB connection management** — Opens all 12 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
 
 4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
 
@@ -225,7 +225,7 @@ Both levels are queried during resolution. Level one (exact case-insensitive mat
 
 - If the `datassert` path does not contain the expected shard files, `duckdb.connect()` will raise an `IOException`.
 - If `entities` is empty, the function returns a dictionary with empty lists for all output columns.
-- The `ExitStack` ensures all 16 DuckDB connections are closed even if resolution raises an exception.
+- The `ExitStack` ensures all 12 DuckDB connections are closed even if resolution raises an exception.
 - Unresolved entities are silently filtered from the output (logged at INFO level by default via `resolve()`).
 
 ## Integration
````
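Step 3 of the lib.md pipeline above describes opening all 12 shard connections inside a `contextlib.ExitStack`. A minimal sketch of that pattern, not the package's actual implementation (`with_shard_connections` is a hypothetical name), would look like:

```python
from contextlib import ExitStack
from pathlib import Path

import duckdb

def with_shard_connections(datassert: Path, shards: int = 12):
    # Open every shard read-only inside an ExitStack so all connections are
    # closed on normal exit *and* if an exception is raised mid-resolution.
    with ExitStack() as stack:
        conns = [
            stack.enter_context(
                duckdb.connect(str(datassert / "data" / f"{i}.duckdb"), read_only=True)
            )
            for i in range(shards)
        ]
        # ... hand `conns` to fullmap.resolve() here ...
        return [c.execute("SELECT 1").fetchone() for c in conns]  # placeholder work
```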

**docs/configuration/graph.md**

````diff
@@ -60,12 +60,14 @@ See [Table Configuration](table.md) for details.
 
 **`datassert: path`**
 
-Path to the datassert directory for entity resolution. Tablassert opens 16 shard files from `datassert/data/{0..15}.duckdb`. This database contains:
+Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens 12 shard files from `datassert/data/{0..11}.duckdb`. This database contains:
 - Synonym mappings (text → CURIE)
 - Biolink categories
 - Taxonomic information
 - Source provenance (which database provided the mapping)
 
+See [Datassert](../datassert.md) for installation, build commands, and database schema.
+
 **`pubmed_db: path`**
 
 Optional path to SQLite database with PubMed metadata:
@@ -165,4 +167,5 @@ This processes a single table configuration (ALAMV6.yaml) into a knowledge graph
 ## Next Steps
 
 - **[Table Configuration](table.md)** - Learn how to define table transformations
+- **[Datassert](../datassert.md)** - Entity-resolution database installation and build
 - **[Tutorial](../tutorial.md)** - Complete example walkthrough
````

**tablassert-7.3.3/docs/datassert.md** (new file)

````diff
@@ -0,0 +1,106 @@
+# Datassert
+
+Datassert is a high-performance CLI for building a DuckDB-backed assertion store from NCATS Translator BABEL export files, with a focus on fast local builds and simple command-driven workflows. It produces the entity-resolution database used by Tablassert, containing biological synonyms, CURIEs, Biolink categories, taxon IDs, and source provenance, enabling `resolve()` to map free-text strings to standardized identifiers.
+
+## Installation
+
+```bash
+# Install CLI from GitHub
+go install github.com/SkyeAv/datassert@latest
+
+# Verify install
+datassert --help
+```
+
+## Build Command
+
+```bash
+# Build a Datassert database (downloads BABEL data automatically)
+datassert build
+```
+
+The build command automatically downloads BABEL exports from RENCI (`https://stars.renci.org/var/babel_outputs`), processes them, and produces sharded DuckDB databases.
+
+### Flags
+
+| Flag | Required | Default | Description |
+|------|----------|---------|-------------|
+| `--skip-downloads` / `-s` | No | `false` | Skip the BABEL download phase (use previously downloaded files) |
+| `--use-existing-parquets` / `-p` | No | `false` | Use existing Parquet files to rebuild DuckDB databases |
+
+### Data Pipeline
+
+1. **Download** — BABEL class and synonym files are downloaded from RENCI and split into LZ4-compressed NDJSON chunks under `./datassert/downloads/`.
+2. **Lookup** — Class files (`*.ndjson.lz4`) are read to build an in-memory equivalent-identifier lookup.
+3. **Parquet Staging** — Synonym files are processed with the lookup, quality-controlled, and written as sharded Parquet files to `./datassert/parquets/`.
+4. **DuckDB Generation** — Parquet files are loaded into 12 sharded DuckDB databases under `./datassert/data/`.
+
+### Examples
+
+```bash
+# Full build (download, process, and generate databases)
+datassert build
+
+# Skip downloads if BABEL files were already fetched
+datassert build --skip-downloads
+
+# Rebuild DuckDB databases from existing Parquet files
+datassert build --use-existing-parquets
+```
+
+### Runtime Behavior
+
+- Displays progress bars for download, class lookup, synonym processing, and DuckDB build phases.
+- Uses 90% of available CPUs for concurrent processing.
+- Downloads are retried up to 3 times on failure with a 10-second backoff.
+- All working files are stored under `./datassert/`.
+
+## Output Artifacts
+
+- 12 sharded DuckDB databases are written to `./datassert/data/{0..11}.duckdb`.
+- Each shard contains `SOURCES`, `CATEGORIES`, `CURIES`, and `SYNONYMS` tables, deduplicated, sorted, and indexed for query performance.
+- Staging Parquet files are written to `./datassert/parquets/{0..11}/`.
+
+Terms are routed to shards deterministically via `xxhash64(term) % 12`, so a given string always hits the same shard.
+
+### Schema
+
+Each shard contains four tables:
+
+| Table | Key Columns | Description |
+|-------|-------------|-------------|
+| `SYNONYMS` | `SYNONYM`, `CURIE_ID`, `SOURCE_ID` | Text synonym → CURIE mapping |
+| `CURIES` | `CURIE_ID`, `CURIE`, `PREFERRED_NAME`, `TAXON_ID`, `CATEGORY_ID` | Canonical identifiers and preferred names |
+| `CATEGORIES` | `CATEGORY_ID`, `CATEGORY_NAME` | Biolink category names |
+| `SOURCES` | `SOURCE_ID`, `SOURCE_NAME`, `SOURCE_VERSION` | Source database and version provenance |
+
+## Usage in Graph Config
+
+The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 12 shards at startup and passes the connections to `resolve()`.
+
+```yaml
+# graph-config.yaml (GC2)
+syntax: GC2
+name: my-graph
+version: "1.0"
+datassert: /path/to/datassert/ # directory containing data/0..11.duckdb
+tables:
+  - ./TABLE/my-table.yaml
+```
+
+## Programmatic Usage
+
+When calling `resolve()` directly, open the shard connections yourself:
+
+```python
+import duckdb
+from tablassert.fullmap import resolve
+
+datassert_dir = "/path/to/datassert"
+conns = [
+    duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
+    for i in range(12)
+]
+```
+
+See [Entity Resolution](api/fullmap.md) for the full `resolve()` API.
````
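The new page pins down the routing rule (`xxhash64(term) % 12`) and the four-table schema. A hedged sketch of how a lookup could work end to end, assuming the shard hash is taken over the raw UTF-8 term bytes with the default seed (not spelled out here) and using an illustrative join over the documented column names:

```python
import duckdb
import xxhash

SHARDS = 12
datassert_dir = "/path/to/datassert"

def shard_for(term: str) -> int:
    # Deterministic routing: xxhash64(term) % 12, per the docs.
    # Hashing the UTF-8 bytes with the default seed is an assumption.
    return xxhash.xxh64(term.encode("utf-8")).intdigest() % SHARDS

term = "aspirin"
con = duckdb.connect(f"{datassert_dir}/data/{shard_for(term)}.duckdb", read_only=True)

# Illustrative join over the documented schema (column names from the table above).
rows = con.execute(
    """
    SELECT c.CURIE, c.PREFERRED_NAME, cat.CATEGORY_NAME
    FROM SYNONYMS s
    JOIN CURIES c ON c.CURIE_ID = s.CURIE_ID
    JOIN CATEGORIES cat ON cat.CATEGORY_ID = c.CATEGORY_ID
    WHERE s.SYNONYM = ?
    """,
    [term],
).fetchall()
con.close()
```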

**docs/docker.md**

````diff
@@ -81,8 +81,8 @@ docker run --rm \
 
 - **Datassert path** — The graph configuration YAML specifies the `datassert` path for the entity-resolution database. Ensure it is accessible inside the container.
 - **Multiprocessing** — `src/tablassert/cli.py:63` uses `multiprocessing.Pool` for parallel table loading and section extraction.
-- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 16 Datassert DuckDB shards concurrently.
-- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 16 DuckDB shards (`SHARDS = 16`) using xxhash64.
+- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 12 Datassert DuckDB shards concurrently.
+- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 12 DuckDB shards (`SHARDS = 12`) using xxhash64.
 - **Text normalization** — `src/tablassert/nlp.py` provides `level_one` (strip + lowercase) and `level_two` (regex-based cleanup).
 
 ## CI/CD Integration
````
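The docker.md notes mention `multiprocessing.Pool` for parallel table loading. A toy sketch of that general pattern, with `load_table` as a hypothetical stand-in rather than the package's actual function:

```python
from multiprocessing import Pool

def load_table(path: str) -> str:
    # Stand-in for per-table loading / section extraction work.
    return f"loaded {path}"

if __name__ == "__main__":
    configs = ["./TABLE/a.yaml", "./TABLE/b.yaml", "./TABLE/c.yaml"]
    with Pool() as pool:
        results = pool.map(load_table, configs)
    print(results)
```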

**llms.txt**

````diff
@@ -4,7 +4,7 @@
 This file is for two audiences: (1) YAML configuration authors and (2) package contributors.
 When source code and prose docs disagree, treat `src/tablassert/models.py` and `src/tablassert/cli.py` as the current authority.
 If you encounter older configurations, migrate `dbssert` to `datassert` (directory path).
-Current CLI behavior opens shard files at `datassert/data/{0..15}.duckdb`.
+Current CLI behavior opens shard files at `datassert/data/{0..11}.duckdb`.
 
 ## Quickstart
 - [README](README.md): high-level overview, install snippets, and one-command graph build.
````

**pyproject.toml**

````diff
@@ -1,6 +1,6 @@
 [project]
 name = "tablassert"
-version = "7.3.1"
+version = "7.3.3"
 description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
 authors = [
     { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -56,7 +56,7 @@ dependencies = [
     "scikit-learn>=1.8.0",
     "sentence-transformers>=5.3.0",
     "sqlite-utils>=3.39",
-    "typer>=0.
+    "typer>=0.21.2",
     "xxhash>=3.6.0",
 ]
 
````

**src/tablassert/fullmap.py**

````diff
@@ -16,7 +16,7 @@ else:
     plh = Lazy.load("polars_hash")
 
 
-SHARDS: int = 16
+SHARDS: int = 12
 
 
 def empty_matches(column_context: bool) -> pl.DataFrame:
@@ -39,15 +39,15 @@ def empty_matches(column_context: bool) -> pl.DataFrame:
     return pl.DataFrame(schema=schema)  # pyright: ignore
 
 
-def distinct(lf: pl.LazyFrame, l0: str, l1: str, col: str = "term") -> pl.LazyFrame:
+def distinct(lf: pl.LazyFrame, l1: str, l2: str, col: str = "term") -> pl.LazyFrame:
     # ? Extract Unique Terms From Two Text Normalization Columns As LazyFrame
-    t0: pl.LazyFrame = lf.select(pl.col(l0).alias(col)).unique()
-    t0 = t0.with_columns(pl.lit(0).alias("nlp level"))
-
     t1: pl.LazyFrame = lf.select(pl.col(l1).alias(col)).unique()
     t1 = t1.with_columns(pl.lit(1).alias("nlp level"))
 
-    terms: pl.LazyFrame = pl.concat([t0, t1]).unique(subset=[col], keep="first")
+    t2: pl.LazyFrame = lf.select(pl.col(l2).alias(col)).unique()
+    t2 = t2.with_columns(pl.lit(2).alias("nlp level"))
+
+    terms: pl.LazyFrame = pl.concat([t1, t2]).unique(subset=[col], keep="first")
 
     bad: str = r"^\d+$|^(none|nan|na|null|unknown)$|^$"
     terms = terms.filter(~pl.col(col).str.contains(bad))
@@ -172,10 +172,10 @@ def resolve(
     tag: str = " two",
 ) -> pl.LazyFrame:
     # ? Case Dependant, Provenance Rich Name Entity Recognition
-    l0: str = col
-    l1: str = add(l0, tag)
+    l1: str = col
+    l2: str = add(l1, tag)
 
-    terms: pl.LazyFrame = distinct(lf, l0, l1)
+    terms: pl.LazyFrame = distinct(lf, l1, l2)
     matches: pl.DataFrame = query_distinct(terms, conns, taxon, prioritize, avoid, column_context)
 
     if log:
@@ -184,45 +184,45 @@ def resolve(
     # ! Collection Point: Join After DuckDB Query, Then Re-Lazy
     df: pl.DataFrame = lf.collect()
     result: pl.DataFrame = df.join(
-        matches.filter(pl.col("NLP_LEVEL").eq(
+        matches.filter(pl.col("NLP_LEVEL").eq(1)), left_on=l1, right_on="term", how="left", suffix=" l1"
     )
 
-
-    result = result.join(
+    l2_matches: pl.DataFrame = matches.filter(pl.col("NLP_LEVEL").eq(2))
+    result = result.join(l2_matches, left_on=l2, right_on="term", how="left", suffix=" l2")
 
     result = result.with_columns(
         [
-            pl.when(pl.col("CURIE").is_not_null()).then(pl.col("CURIE")).otherwise(pl.col("CURIE
+            pl.when(pl.col("CURIE").is_not_null()).then(pl.col("CURIE")).otherwise(pl.col("CURIE l2")).alias(col),
             pl.when(pl.col("PREFERRED_NAME").is_not_null())
             .then(pl.col("PREFERRED_NAME"))
-            .otherwise(pl.col("PREFERRED_NAME
+            .otherwise(pl.col("PREFERRED_NAME l2"))
             .alias(add(col, " name")),
             pl.when(pl.col("CATEGORY_NAME").is_not_null())
             .then(add(pl.lit("biolink:"), pl.col("CATEGORY_NAME")))
-            .otherwise(add(pl.lit("biolink:"), pl.col("CATEGORY_NAME
+            .otherwise(add(pl.lit("biolink:"), pl.col("CATEGORY_NAME l2")))
             .alias(add(col, " category")),
             pl.when(pl.col("TAXON_ID").is_not_null())
             .then(add(pl.lit("NCBITaxon:"), pl.col("TAXON_ID").cast(pl.String)))
-            .otherwise(add(pl.lit("NCBITaxon:"), pl.col("TAXON_ID
+            .otherwise(add(pl.lit("NCBITaxon:"), pl.col("TAXON_ID l2").cast(pl.String)))
             .alias(add(col, " taxon")),
             pl.when(pl.col("SOURCE_NAME").is_not_null())
             .then(pl.col("SOURCE_NAME"))
-            .otherwise(pl.col("SOURCE_NAME
+            .otherwise(pl.col("SOURCE_NAME l2"))
             .alias(add(col, " source")),
             pl.when(pl.col("SOURCE_VERSION").is_not_null())
             .then(pl.col("SOURCE_VERSION"))
-            .otherwise(pl.col("SOURCE_VERSION
+            .otherwise(pl.col("SOURCE_VERSION l2"))
             .alias(add(col, " source version")),
             pl.when(pl.col("NLP_LEVEL").is_not_null())
             .then(pl.col("NLP_LEVEL"))
-            .otherwise(pl.col("NLP_LEVEL
+            .otherwise(pl.col("NLP_LEVEL l2"))
             .alias(add(col, " nlp level")),
         ]
     )
 
     result = result.select(
         pl.exclude(
-            r"^(CURIE|PREFERRED_NAME|CATEGORY_NAME|TAXON_ID|SOURCE_NAME|SOURCE_VERSION|NLP_LEVEL|PR|FREQUENCY)(
+            r"^(CURIE|PREFERRED_NAME|CATEGORY_NAME|TAXON_ID|SOURCE_NAME|SOURCE_VERSION|NLP_LEVEL|PR|FREQUENCY)( l2)?$"
         )
     )
     result = result.select(pl.exclude(add(col, " two")))
````
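The reworked `resolve()` above joins level-1 matches first, then level-2 matches with a `" l2"` suffix, and uses `pl.when(...).otherwise(...)` chains to fall back to the level-2 columns. A minimal self-contained sketch of that fallback-join pattern, with toy data and hypothetical identifiers rather than the package's actual frames:

```python
import polars as pl

# Toy inputs: terms with their level-1 and level-2 normalized forms,
# plus per-level match tables (stand-ins for the DuckDB query results).
df = pl.DataFrame({"term": ["aspirin", "tp53"], "term two": ["aspirin", "tp53"]})
l1 = pl.DataFrame({"term": ["aspirin"], "CURIE": ["CHEBI:15365"]})   # hypothetical IDs
l2 = pl.DataFrame({"term": ["tp53"], "CURIE": ["NCBIGene:7157"]})

out = (
    df.join(l1, on="term", how="left")
      .join(l2, left_on="term two", right_on="term", how="left", suffix=" l2")
      .with_columns(
          # Prefer the level-1 hit; fall back to the level-2 hit, as resolve() does.
          pl.when(pl.col("CURIE").is_not_null())
            .then(pl.col("CURIE"))
            .otherwise(pl.col("CURIE l2"))
            .alias("curie")
      )
      .select(["term", "curie"])
)
print(out)
```

Level one wins whenever both levels match, mirroring the `keep="first"` deduplication in `distinct()`.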

**uv.lock**

````diff
@@ -2211,7 +2211,7 @@ wheels = [
 
 [[package]]
 name = "tablassert"
-version = "7.3.1"
+version = "7.3.3"
 source = { editable = "." }
 dependencies = [
     { name = "duckdb" },
@@ -2276,7 +2276,7 @@ requires-dist = [
     { name = "sentence-transformers", specifier = ">=5.3.0" },
     { name = "sqlite-utils", specifier = ">=3.39" },
     { name = "tablassert", extras = ["rtcompat"], marker = "extra == 'rt'" },
-    { name = "typer", specifier = ">=0.
+    { name = "typer", specifier = ">=0.21.2" },
     { name = "xxhash", specifier = ">=3.6.0" },
 ]
 provides-extras = ["rtcompat", "rt"]
````

**tablassert-7.3.1/docs/datassert.md** (removed)

````diff
@@ -1,66 +0,0 @@
-# Datassert
-
-Datassert is the entity-resolution database used by Tablassert. It contains biological synonyms, CURIEs, Biolink categories, taxon IDs, and source provenance, enabling `resolve()` to map free-text strings to standardized identifiers.
-
-## Installation
-
-```bash
-git clone https://github.com/SkyeAv/datassert
-```
-
-## Structure
-
-Datassert is split into 16 DuckDB shard files for parallel querying:
-
-```
-datassert/
-  data/
-    0.duckdb
-    1.duckdb
-    ...
-    15.duckdb
-```
-
-Terms are routed to shards deterministically via `xxhash64(term) % 16`, so a given string always hits the same shard.
-
-### Schema
-
-Each shard contains four tables:
-
-| Table | Key Columns | Description |
-|-------|-------------|-------------|
-| `SYNONYMS` | `SYNONYM`, `CURIE_ID`, `SOURCE_ID` | Text synonym → CURIE mapping |
-| `CURIES` | `CURIE_ID`, `CURIE`, `PREFERRED_NAME`, `TAXON_ID`, `CATEGORY_ID` | Canonical identifiers and preferred names |
-| `CATEGORIES` | `CATEGORY_ID`, `CATEGORY_NAME` | Biolink category names |
-| `SOURCES` | `SOURCE_ID`, `SOURCE_NAME`, `SOURCE_VERSION` | Source database and version provenance |
-
-## Usage in Graph Config
-
-The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 16 shards at startup and passes the connections to `resolve()`.
-
-```yaml
-# graph-config.yaml (GC2)
-syntax: GC2
-name: my-graph
-version: "1.0"
-datassert: /path/to/datassert/ # directory containing data/0..15.duckdb
-tables:
-  - ./TABLE/my-table.yaml
-```
-
-## Programmatic Usage
-
-When calling `resolve()` directly, open the shard connections yourself:
-
-```python
-import duckdb
-from tablassert.fullmap import resolve
-
-datassert_dir = "/path/to/datassert"
-conns = [
-    duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
-    for i in range(16)
-]
-```
-
-See [Entity Resolution](api/fullmap.md) for the full `resolve()` API.
````