tablassert 7.3.3__tar.gz → 7.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.3.3 → tablassert-7.3.4}/CHANGELOG.md +12 -2
- {tablassert-7.3.3 → tablassert-7.3.4}/PKG-INFO +4 -4
- {tablassert-7.3.3 → tablassert-7.3.4}/README.md +3 -3
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/fullmap.md +6 -6
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/lib.md +9 -9
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/qc.md +3 -3
- tablassert-7.3.4/docs/changelog.md +13 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/advanced-example.md +10 -4
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/graph.md +1 -1
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/table.md +17 -5
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/datassert.md +7 -7
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/docker.md +2 -2
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples.md +11 -7
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/tutorial.md +1 -1
- {tablassert-7.3.3 → tablassert-7.3.4}/mkdocs.yml +1 -1
- {tablassert-7.3.3 → tablassert-7.3.4}/pyproject.toml +1 -1
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/downloader.py +15 -4
- {tablassert-7.3.3 → tablassert-7.3.4}/uv.lock +1 -1
- {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/autotag.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/docker.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/docs.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/.gitignore +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/.pre-commit-config.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/AGENTS.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/CITATION.cff +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/CONTRIBUTING.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/Dockerfile +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/LICENSE +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/utils.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/cli.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/index.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/docs/installation.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/llms.txt +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/__init__.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/cli.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/enums.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/fullmap.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/ingests.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/lib.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/log.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/models.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/nlp.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/qc.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/utils.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/__init__.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/conftest.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/minimal_section.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_enums.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_fullmap.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_ingests.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_lib.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_models.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_nlp.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_utils.py +0 -0
|
@@ -2,13 +2,23 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## 7.3.4 - 2026-04-28
|
|
6
|
+
|
|
7
|
+
### Bug Fixes
|
|
8
|
+
- Fixed `downloader.from_url()` failing on URLs that trigger an immediate download. The Playwright session now opens a browser context with `accept_downloads=True`, wraps `page.goto()` inside `page.expect_download()`, and tolerates the expected `net::ERR_ABORTED` navigation error that fires when the response is a download rather than a page.
|
|
9
|
+
|
|
10
|
+
### Documentation
|
|
11
|
+
- Documented `miscellaneous notes` as a freetext catch-all annotation in the table configuration and advanced-example pages — used for assay caveats, non-standard units, and qualitative observations that don't map cleanly to a structured field. Supports both `method: value` (constant) and `method: column` (per-row).
|
|
12
|
+
- Documented Polars regex constraints for the `regex` and `remove` transforms: patterns are passed to Polars `str.replace_all()` (Rust `regex` crate), so capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported and will raise at parse time. Chain simple substitutions instead, or capture residual context in a `miscellaneous notes` annotation.
|
|
13
|
+
|
|
5
14
|
## 7.3.3 - 2026-04-08
|
|
6
15
|
|
|
7
16
|
### Bug Fixes
|
|
8
|
-
- Changed datassert shard count
|
|
17
|
+
- Changed datassert shard count to 10 (`SHARDS` constant in `fullmap.py`) to correspond to the current datassert database layout.
|
|
9
18
|
|
|
10
19
|
### Documentation
|
|
11
|
-
- Updated
|
|
20
|
+
- Updated shard count references across documentation and examples to reflect the current 10-shard datassert layout.
|
|
21
|
+
- Corrected provenance examples so `repo` carries the namespace prefix and `publication` carries the repository-local identifier.
|
|
12
22
|
|
|
13
23
|
## 7.3.2 - 2026-04-03
|
|
14
24
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.3.
|
|
3
|
+
Version: 7.3.4
|
|
4
4
|
Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
6
|
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
@@ -99,13 +99,13 @@ docker run --rm \
|
|
|
99
99
|
# Build a knowledge graph from a YAML configuration
|
|
100
100
|
$ tablassert build-knowledge-graph graph-config.yaml
|
|
101
101
|
⠋ Loading table configurations...
|
|
102
|
-
⠋ Resolving entities across
|
|
102
|
+
⠋ Resolving entities across 10 DuckDB shards...
|
|
103
103
|
⠋ Compiling subgraphs...
|
|
104
104
|
⠋ Deduplicating nodes and edges...
|
|
105
|
-
✓
|
|
105
|
+
✓ Finished — wrote MY_GRAPH_1.0.0.nodes.ndjson and MY_GRAPH_1.0.0.edges.ndjson
|
|
106
106
|
```
|
|
107
107
|
|
|
108
|
-
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
108
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
|
|
109
109
|
|
|
110
110
|
## Key Features
|
|
111
111
|
|
|
@@ -47,13 +47,13 @@ docker run --rm \
|
|
|
47
47
|
# Build a knowledge graph from a YAML configuration
|
|
48
48
|
$ tablassert build-knowledge-graph graph-config.yaml
|
|
49
49
|
⠋ Loading table configurations...
|
|
50
|
-
⠋ Resolving entities across
|
|
50
|
+
⠋ Resolving entities across 10 DuckDB shards...
|
|
51
51
|
⠋ Compiling subgraphs...
|
|
52
52
|
⠋ Deduplicating nodes and edges...
|
|
53
|
-
✓
|
|
53
|
+
✓ Finished — wrote MY_GRAPH_1.0.0.nodes.ndjson and MY_GRAPH_1.0.0.edges.ndjson
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
-
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
56
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
|
|
57
57
|
|
|
58
58
|
## Key Features
|
|
59
59
|
|
|
@@ -36,7 +36,7 @@ Column name containing text strings to resolve.
|
|
|
36
36
|
|
|
37
37
|
**`conns: list[object]`**
|
|
38
38
|
|
|
39
|
-
List of
|
|
39
|
+
List of 10 DuckDB shard connections to the datassert database.
|
|
40
40
|
|
|
41
41
|
Each shard contains:
|
|
42
42
|
- Synonym mappings (text → CURIE)
|
|
@@ -97,7 +97,7 @@ Returns a Polars LazyFrame with these columns added:
|
|
|
97
97
|
| `{col} taxon` | NCBI Taxon ID | `"NCBITaxon:9606"` |
|
|
98
98
|
| `{col} source` | Source database | `"HGNC"` |
|
|
99
99
|
| `{col} source version` | Database version | `"2025-01"` |
|
|
100
|
-
| `{col} nlp level` | NLP processing level | `
|
|
100
|
+
| `{col} nlp level` | NLP processing level | `1` or `2` |
|
|
101
101
|
|
|
102
102
|
### DuckDB Query
|
|
103
103
|
|
|
@@ -125,11 +125,11 @@ from tablassert.enums import Categories
|
|
|
125
125
|
import duckdb
|
|
126
126
|
import polars as pl
|
|
127
127
|
|
|
128
|
-
# Open all
|
|
128
|
+
# Open all 10 shard connections
|
|
129
129
|
datassert_dir = "/path/to/datassert"
|
|
130
130
|
conns = [
|
|
131
131
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
132
|
-
for i in range(
|
|
132
|
+
for i in range(10)
|
|
133
133
|
]
|
|
134
134
|
|
|
135
135
|
# LazyFrame with data to resolve
|
|
@@ -167,11 +167,11 @@ from tablassert.fullmap import resolve
|
|
|
167
167
|
from tablassert.nlp import level_one, level_two
|
|
168
168
|
from tablassert.enums import Categories
|
|
169
169
|
|
|
170
|
-
# Open all
|
|
170
|
+
# Open all 10 shard connections
|
|
171
171
|
datassert_dir = "/path/to/datassert"
|
|
172
172
|
conns = [
|
|
173
173
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
174
|
-
for i in range(
|
|
174
|
+
for i in range(10)
|
|
175
175
|
]
|
|
176
176
|
|
|
177
177
|
# Map a list of gene symbols to CURIEs
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
The `lib` module exposes `resolve_many()`, a high-level convenience function for resolving an iterable of entity strings to CURIEs without requiring manual LazyFrame construction, NLP preprocessing, or DuckDB shard management.
|
|
4
4
|
|
|
5
|
-
It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all
|
|
5
|
+
It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 10 DuckDB shard connections, executing entity resolution, and returning results as a plain Python list of row dictionaries.
|
|
6
6
|
|
|
7
7
|
## resolve_many()
|
|
8
8
|
|
|
9
|
-
Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a
|
|
9
|
+
Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a list of row dictionaries.
|
|
10
10
|
|
|
11
11
|
### Function Signature
|
|
12
12
|
|
|
@@ -26,9 +26,9 @@ def resolve_many(
|
|
|
26
26
|
|
|
27
27
|
**`col: str`**
|
|
28
28
|
|
|
29
|
-
Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in
|
|
29
|
+
Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in each returned row dictionary.
|
|
30
30
|
|
|
31
|
-
For example, if `col="gene"`,
|
|
31
|
+
For example, if `col="gene"`, each returned row dictionary will contain keys like `"gene"`, `"gene name"`, `"gene category"`, etc.
|
|
32
32
|
|
|
33
33
|
**`entities: Iterable[str]`**
|
|
34
34
|
|
|
@@ -38,7 +38,7 @@ Examples: `["TP53", "BRCA1", "EGFR"]`, `("aspirin", "ibuprofen")`, or a generato
|
|
|
38
38
|
|
|
39
39
|
**`datassert: Path`**
|
|
40
40
|
|
|
41
|
-
Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing
|
|
41
|
+
Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 10 DuckDB shard files (`0.duckdb` through `9.duckdb`).
|
|
42
42
|
|
|
43
43
|
Each shard contains:
|
|
44
44
|
- Synonym mappings (text → CURIE)
|
|
@@ -86,7 +86,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
|
|
|
86
86
|
| `{col} taxon` | NCBI Taxon ID (prefixed) | `"NCBITaxon:9606"` |
|
|
87
87
|
| `{col} source` | Source database | `"HGNC"` |
|
|
88
88
|
| `{col} source version` | Database version | `"2025-01"` |
|
|
89
|
-
| `{col} nlp level` | NLP processing level used for match | `
|
|
89
|
+
| `{col} nlp level` | NLP processing level used for match | `1` or `2` |
|
|
90
90
|
|
|
91
91
|
**Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned list may therefore be shorter than the input iterable.
|
|
92
92
|
|
|
@@ -98,7 +98,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
|
|
|
98
98
|
|
|
99
99
|
2. **NLP normalization** — Applies `level_one()` (whitespace stripping + lowercasing) and `level_two()` (non-word character removal via `\W+`) to produce the two normalized columns required by `resolve()`.
|
|
100
100
|
|
|
101
|
-
3. **DuckDB connection management** — Opens all
|
|
101
|
+
3. **DuckDB connection management** — Opens all 10 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
|
|
102
102
|
|
|
103
103
|
4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
|
|
104
104
|
|
|
@@ -224,8 +224,8 @@ Both levels are queried during resolution. Level one (exact case-insensitive mat
|
|
|
224
224
|
### Error Handling
|
|
225
225
|
|
|
226
226
|
- If the `datassert` path does not contain the expected shard files, `duckdb.connect()` will raise an `IOException`.
|
|
227
|
-
- If `entities` is empty, the function returns
|
|
228
|
-
- The `ExitStack` ensures all
|
|
227
|
+
- If `entities` is empty, the function returns `[]`.
|
|
228
|
+
- The `ExitStack` ensures all 10 DuckDB connections are closed even if resolution raises an exception.
|
|
229
229
|
- Unresolved entities are silently filtered from the output (logged at INFO level by default via `resolve()`).
|
|
230
230
|
|
|
231
231
|
## Integration
|
|
@@ -82,7 +82,7 @@ Two fuzzy matching algorithms:
|
|
|
82
82
|
1. **Ratio:** Overall string similarity
|
|
83
83
|
2. **Partial token sort ratio:** Combined token/subsequence matching
|
|
84
84
|
|
|
85
|
-
**Threshold:**
|
|
85
|
+
**Threshold:** 20% similarity
|
|
86
86
|
|
|
87
87
|
```python
|
|
88
88
|
fuzz.ratio(original, preferred) >= 20
|
|
@@ -125,7 +125,7 @@ return similarity >= 0.2
|
|
|
125
125
|
- Graph optimization level: ALL
|
|
126
126
|
- ONNX session caching
|
|
127
127
|
|
|
128
|
-
Lazy-loaded on first `
|
|
128
|
+
Lazy-loaded on first `fullmap_audit()` call that reaches the embedding stage, then reused for subsequent calls.
|
|
129
129
|
|
|
130
130
|
### Model Caching
|
|
131
131
|
|
|
@@ -135,7 +135,7 @@ BioBERT is lazy-loaded on first use and cached globally for the lifetime of the
|
|
|
135
135
|
# ? Lazy-loads BioBERT once on first batch audit call, then caches globally
|
|
136
136
|
```
|
|
137
137
|
|
|
138
|
-
**Cache location:**
|
|
138
|
+
**Cache location:** Downloaded model files are cached on disk in `.onnxassert/`, and the loaded model object is cached in memory for the lifetime of the process.
|
|
139
139
|
|
|
140
140
|
**Cache strategy:** BioBERT model loaded once on first batch audit, then reused globally
|
|
141
141
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
The canonical release history lives in the repository root at [`CHANGELOG.md`](https://github.com/SkyeAv/Tablassert/blob/main/CHANGELOG.md).
|
|
4
|
+
|
|
5
|
+
## Current Release Notes
|
|
6
|
+
|
|
7
|
+
### 7.3.4 - 2026-04-28
|
|
8
|
+
|
|
9
|
+
- `downloader.from_url()` now handles URLs that respond with an immediate download instead of a navigable page — the Playwright session uses a download-aware browser context and tolerates the expected `net::ERR_ABORTED` navigation error.
|
|
10
|
+
- Table configuration docs now describe `miscellaneous notes` as a freetext catch-all annotation for source context that doesn't map cleanly to a structured field.
|
|
11
|
+
- Regex transform documentation now spells out the Polars `str.replace_all()` constraints — no capturing groups or lookarounds — so authors know to chain simple substitutions or fall back to `miscellaneous notes`.
|
|
12
|
+
|
|
13
|
+
For older releases and the full project history, open the root `CHANGELOG.md` in the repository.
|
|
@@ -66,7 +66,7 @@ template:
|
|
|
66
66
|
# Provenance: Publication and curation info
|
|
67
67
|
provenance:
|
|
68
68
|
repo: PMC
|
|
69
|
-
publication:
|
|
69
|
+
publication: 11708054
|
|
70
70
|
contributors:
|
|
71
71
|
- kind: curation
|
|
72
72
|
name: Skye Lane Goetz
|
|
@@ -103,12 +103,16 @@ template:
|
|
|
103
103
|
method: value
|
|
104
104
|
encoding: Spearman correlation
|
|
105
105
|
|
|
106
|
-
#
|
|
106
|
+
# Freetext catch-all — anything that doesn't map cleanly to a structured
|
|
107
|
+
# annotation (study design caveats, non-standard units, qualitative
|
|
108
|
+
# observations) belongs here rather than being dropped.
|
|
107
109
|
- annotation: miscellaneous notes
|
|
108
110
|
method: value
|
|
109
111
|
encoding: Correlation analysis between microbial composition and 13C-tamoxifen abundance after FDR correction
|
|
110
112
|
```
|
|
111
113
|
|
|
114
|
+
> **`miscellaneous notes` is a freetext escape hatch.** Use it whenever the source carries context you can't otherwise cleanly encode — assay variants, post-hoc qualifiers, "values are log-transformed", etc. It accepts `method: value` for a constant note across the whole table or `method: column` to pull per-row notes from the source.
|
|
115
|
+
|
|
112
116
|
## Key Techniques
|
|
113
117
|
|
|
114
118
|
### Excel Column References
|
|
@@ -143,6 +147,8 @@ The subject field uses three regex transformations in sequence:
|
|
|
143
147
|
```
|
|
144
148
|
`"Lactobacillus sp"` → `"Lactobacillus sp. "`
|
|
145
149
|
|
|
150
|
+
> **Regex constraint:** Each `pattern` is handed to Polars `str.replace_all()` (Rust `regex` crate). **Capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not allowed** and will fail validation. Express transformations as a sequence of simple anchored / character-class substitutions instead — the pipeline above is a deliberate three-step chain because no single capturing-group pattern is permitted. If the transformation can't be expressed without those features, capture the leftover context in a `miscellaneous notes` annotation rather than fighting the regex engine.
|
|
151
|
+
|
|
146
152
|
### Taxonomic Filtering
|
|
147
153
|
|
|
148
154
|
Prevent incorrect entity resolution:
|
|
@@ -297,7 +303,7 @@ template:
|
|
|
297
303
|
|
|
298
304
|
provenance:
|
|
299
305
|
repo: PMC
|
|
300
|
-
publication:
|
|
306
|
+
publication: 12345678
|
|
301
307
|
contributors:
|
|
302
308
|
- kind: curation
|
|
303
309
|
name: Skye Lane Goetz
|
|
@@ -358,7 +364,7 @@ template:
|
|
|
358
364
|
|
|
359
365
|
provenance:
|
|
360
366
|
repo: PMC
|
|
361
|
-
publication:
|
|
367
|
+
publication: 87654321
|
|
362
368
|
contributors:
|
|
363
369
|
- kind: curation
|
|
364
370
|
name: Skye Lane Goetz
|
|
@@ -60,7 +60,7 @@ See [Table Configuration](table.md) for details.
|
|
|
60
60
|
|
|
61
61
|
**`datassert: path`**
|
|
62
62
|
|
|
63
|
-
Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens
|
|
63
|
+
Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens 10 shard files from `datassert/data/{0..9}.duckdb`. This database contains:
|
|
64
64
|
- Synonym mappings (text → CURIE)
|
|
65
65
|
- Biolink categories
|
|
66
66
|
- Taxonomic information
|
|
@@ -100,7 +100,7 @@ template:
|
|
|
100
100
|
```yaml
|
|
101
101
|
template:
|
|
102
102
|
source: {kind: excel, local: data.xlsx}
|
|
103
|
-
provenance: {publication:
|
|
103
|
+
provenance: {repo: PMC, publication: 123}
|
|
104
104
|
|
|
105
105
|
sections:
|
|
106
106
|
- statement: {predicate: treats}
|
|
@@ -111,7 +111,7 @@ sections:
|
|
|
111
111
|
```yaml
|
|
112
112
|
template:
|
|
113
113
|
source: {kind: text, local: data.csv}
|
|
114
|
-
provenance: {publication:
|
|
114
|
+
provenance: {repo: PMID, publication: 456}
|
|
115
115
|
statement:
|
|
116
116
|
subject: {encoding: gene_symbol}
|
|
117
117
|
|
|
@@ -330,6 +330,8 @@ subject:
|
|
|
330
330
|
|
|
331
331
|
Executed in order.
|
|
332
332
|
|
|
333
|
+
> **Regex dialect:** Patterns are passed directly to Polars `str.replace_all()`, which uses the Rust [`regex`](https://docs.rs/regex/) crate. Only features supported by that engine work — in particular, **capturing groups (`(...)`, `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported** and will raise an error at parse time. Stick to character classes, anchors (`^`, `$`), quantifiers, alternation (`a|b`), and non-capturing groups (`(?:...)`) if grouping is needed. If a transformation is too complex to express, prefer chaining several simple substitutions or capturing the residual context in a `miscellaneous notes` annotation instead.
|
|
334
|
+
|
|
333
335
|
**`remove: list[string]`** - Filter out specific strings
|
|
334
336
|
|
|
335
337
|
```yaml
|
|
@@ -339,6 +341,8 @@ subject:
|
|
|
339
341
|
- "^NA " # Remove rows starting with "NA "
|
|
340
342
|
```
|
|
341
343
|
|
|
344
|
+
Same regex constraints apply as the `regex` field — Polars-compatible patterns only, no capturing groups or lookarounds.
|
|
345
|
+
|
|
342
346
|
**`prefix` / `suffix`** - Add text
|
|
343
347
|
|
|
344
348
|
```yaml
|
|
@@ -416,7 +420,7 @@ Required metadata about data source.
|
|
|
416
420
|
| Field | Type | Required | Description |
|
|
417
421
|
|-------|------|----------|-------------|
|
|
418
422
|
| `repo` | String | Yes | Repository: `"PMC"`, `"PMID"` |
|
|
419
|
-
| `publication` | String | Yes |
|
|
423
|
+
| `publication` | String | Yes | Repository-local identifier appended to `repo:` (e.g., `"11708054"`, `"123"`) |
|
|
420
424
|
| `contributors` | List[Contributor] | Yes | Curation information |
|
|
421
425
|
|
|
422
426
|
**Contributor fields:**
|
|
@@ -433,7 +437,7 @@ Required metadata about data source.
|
|
|
433
437
|
```yaml
|
|
434
438
|
provenance:
|
|
435
439
|
repo: PMC
|
|
436
|
-
publication:
|
|
440
|
+
publication: 11708054
|
|
437
441
|
contributors:
|
|
438
442
|
- kind: curation
|
|
439
443
|
name: Skye Lane Goetz
|
|
@@ -467,8 +471,16 @@ annotations:
|
|
|
467
471
|
- annotation: multiple testing correction method
|
|
468
472
|
method: value
|
|
469
473
|
encoding: "Benjamini Hochberg"
|
|
474
|
+
|
|
475
|
+
# Freetext catch-all for context that doesn't fit a structured field —
|
|
476
|
+
# study caveats, units, post-hoc notes, anything you'd otherwise lose.
|
|
477
|
+
- annotation: miscellaneous notes
|
|
478
|
+
method: value
|
|
479
|
+
encoding: "Values are log2 fold-change relative to vehicle control; n=3 biological replicates per arm"
|
|
470
480
|
```
|
|
471
481
|
|
|
482
|
+
> **Tip:** When source data carries information that can't be cleanly mapped to a structured annotation (assay-specific caveats, non-standard units, qualitative observations), add a `miscellaneous notes` annotation rather than forcing it into another field or dropping it. It accepts both `method: value` (one note for the whole table) and `method: column` (per-row notes from the source).
|
|
483
|
+
|
|
472
484
|
## Complete Example
|
|
473
485
|
|
|
474
486
|
Minimal table configuration:
|
|
@@ -498,7 +510,7 @@ template:
|
|
|
498
510
|
|
|
499
511
|
provenance:
|
|
500
512
|
repo: PMID
|
|
501
|
-
publication:
|
|
513
|
+
publication: 12345678
|
|
502
514
|
contributors:
|
|
503
515
|
- kind: curation
|
|
504
516
|
name: Example User
|
|
@@ -33,7 +33,7 @@ The build command automatically downloads BABEL exports from RENCI (`https://sta
|
|
|
33
33
|
1. **Download** — BABEL class and synonym files are downloaded from RENCI and split into LZ4-compressed NDJSON chunks under `./datassert/downloads/`.
|
|
34
34
|
2. **Lookup** — Class files (`*.ndjson.lz4`) are read to build an in-memory equivalent-identifier lookup.
|
|
35
35
|
3. **Parquet Staging** — Synonym files are processed with the lookup, quality-controlled, and written as sharded Parquet files to `./datassert/parquets/`.
|
|
36
|
-
4. **DuckDB Generation** — Parquet files are loaded into
|
|
36
|
+
4. **DuckDB Generation** — Parquet files are loaded into 10 sharded DuckDB databases under `./datassert/data/`.
|
|
37
37
|
|
|
38
38
|
### Examples
|
|
39
39
|
|
|
@@ -57,11 +57,11 @@ datassert build --use-existing-parquets
|
|
|
57
57
|
|
|
58
58
|
## Output Artifacts
|
|
59
59
|
|
|
60
|
-
-
|
|
60
|
+
- 10 sharded DuckDB databases are written to `./datassert/data/{0..9}.duckdb`.
|
|
61
61
|
- Each shard contains `SOURCES`, `CATEGORIES`, `CURIES`, and `SYNONYMS` tables, deduplicated, sorted, and indexed for query performance.
|
|
62
|
-
- Staging Parquet files are written to `./datassert/parquets/{0..
|
|
62
|
+
- Staging Parquet files are written to `./datassert/parquets/{0..9}/`.
|
|
63
63
|
|
|
64
|
-
Terms are routed to shards deterministically via `xxhash64(term) %
|
|
64
|
+
Terms are routed to shards deterministically via `xxhash64(term) % 10`, so a given string always hits the same shard.
|
|
65
65
|
|
|
66
66
|
### Schema
|
|
67
67
|
|
|
@@ -76,14 +76,14 @@ Each shard contains four tables:
|
|
|
76
76
|
|
|
77
77
|
## Usage in Graph Config
|
|
78
78
|
|
|
79
|
-
The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all
|
|
79
|
+
The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 10 shards at startup and passes the connections to `resolve()`.
|
|
80
80
|
|
|
81
81
|
```yaml
|
|
82
82
|
# graph-config.yaml (GC2)
|
|
83
83
|
syntax: GC2
|
|
84
84
|
name: my-graph
|
|
85
85
|
version: "1.0"
|
|
86
|
-
datassert: /path/to/datassert/ # directory containing data/0..
|
|
86
|
+
datassert: /path/to/datassert/ # directory containing data/0..9.duckdb
|
|
87
87
|
tables:
|
|
88
88
|
- ./TABLE/my-table.yaml
|
|
89
89
|
```
|
|
@@ -99,7 +99,7 @@ from tablassert.fullmap import resolve
|
|
|
99
99
|
datassert_dir = "/path/to/datassert"
|
|
100
100
|
conns = [
|
|
101
101
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
102
|
-
for i in range(
|
|
102
|
+
for i in range(10)
|
|
103
103
|
]
|
|
104
104
|
```
|
|
105
105
|
|
|
@@ -81,8 +81,8 @@ docker run --rm \
|
|
|
81
81
|
|
|
82
82
|
- **Datassert path** — The graph configuration YAML specifies the `datassert` path for the entity-resolution database. Ensure it is accessible inside the container.
|
|
83
83
|
- **Multiprocessing** — `src/tablassert/cli.py:63` uses `multiprocessing.Pool` for parallel table loading and section extraction.
|
|
84
|
-
- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all
|
|
85
|
-
- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across
|
|
84
|
+
- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 10 Datassert DuckDB shards concurrently.
|
|
85
|
+
- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 10 DuckDB shards (`SHARDS = 10`) using xxhash64.
|
|
86
86
|
- **Text normalization** — `src/tablassert/nlp.py` provides `level_one` (strip + lowercase) and `level_two` (regex-based cleanup).
|
|
87
87
|
|
|
88
88
|
## CI/CD Integration
|
|
@@ -33,7 +33,7 @@ template:
|
|
|
33
33
|
- Disease
|
|
34
34
|
provenance:
|
|
35
35
|
repo: PMID
|
|
36
|
-
publication:
|
|
36
|
+
publication: 12345678
|
|
37
37
|
contributors:
|
|
38
38
|
- kind: curation
|
|
39
39
|
name: Your Name
|
|
@@ -85,7 +85,7 @@ template:
|
|
|
85
85
|
taxon: 9606
|
|
86
86
|
provenance:
|
|
87
87
|
repo: PMID
|
|
88
|
-
publication:
|
|
88
|
+
publication: 98765432
|
|
89
89
|
contributors:
|
|
90
90
|
- kind: curation
|
|
91
91
|
name: Your Name
|
|
@@ -146,7 +146,7 @@ template:
|
|
|
146
146
|
encoding: CHEBI:41774
|
|
147
147
|
provenance:
|
|
148
148
|
repo: PMC
|
|
149
|
-
publication:
|
|
149
|
+
publication: 11708054
|
|
150
150
|
contributors:
|
|
151
151
|
- kind: curation
|
|
152
152
|
name: Your Name
|
|
@@ -161,11 +161,15 @@ template:
|
|
|
161
161
|
- annotation: assertion method
|
|
162
162
|
method: value
|
|
163
163
|
encoding: "Spearman correlation"
|
|
164
|
+
# Freetext catch-all for context that doesn't fit a structured field.
|
|
165
|
+
- annotation: miscellaneous notes
|
|
166
|
+
method: value
|
|
167
|
+
encoding: "FDR-corrected; samples pooled across two cohorts"
|
|
164
168
|
```
|
|
165
169
|
|
|
166
170
|
**Key techniques:**
|
|
167
171
|
|
|
168
|
-
- **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`)
|
|
172
|
+
- **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`). Patterns must be Polars `str.replace_all()`-compatible — no capturing groups (`(...)` / `\1`) and no lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`). Chain several simple substitutions instead.
|
|
169
173
|
- **Avoid list** (`avoid: [Gene]`) prevents organism names from resolving to gene entities
|
|
170
174
|
- **Fixed-value object** (`method: value`) assigns the same metabolite CURIE to all rows
|
|
171
175
|
- **Excel source** with sheet name and row slicing
|
|
@@ -199,7 +203,7 @@ template:
|
|
|
199
203
|
encoding: PLACEHOLDER
|
|
200
204
|
provenance:
|
|
201
205
|
repo: PMID
|
|
202
|
-
publication:
|
|
206
|
+
publication: 11223344
|
|
203
207
|
contributors:
|
|
204
208
|
- kind: curation
|
|
205
209
|
name: Your Name
|
|
@@ -277,7 +281,7 @@ template:
|
|
|
277
281
|
- Disease
|
|
278
282
|
provenance:
|
|
279
283
|
repo: PMID
|
|
280
|
-
publication:
|
|
284
|
+
publication: 55667788
|
|
281
285
|
contributors:
|
|
282
286
|
- kind: curation
|
|
283
287
|
name: Your Name
|
|
@@ -330,7 +334,7 @@ template:
|
|
|
330
334
|
- ChemicalEntity
|
|
331
335
|
provenance:
|
|
332
336
|
repo: PMID
|
|
333
|
-
publication:
|
|
337
|
+
publication: 99887766
|
|
334
338
|
contributors:
|
|
335
339
|
- kind: curation
|
|
336
340
|
name: Your Name
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.3.
|
|
3
|
+
version = "7.3.4"
|
|
4
4
|
description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
@@ -29,13 +29,24 @@ def from_url(website: str, p: Path, timeout: int = 60_000, retries: int = 3) ->
|
|
|
29
29
|
try:
|
|
30
30
|
with sync_playwright() as pw:
|
|
31
31
|
browser = pw.chromium.launch(headless=True)
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
context = browser.new_context(accept_downloads=True)
|
|
33
|
+
|
|
34
|
+
page = context.new_page()
|
|
34
35
|
with page.expect_download(timeout=timeout) as info:
|
|
35
|
-
|
|
36
|
-
|
|
36
|
+
try:
|
|
37
|
+
page.goto(website, wait_until="load", timeout=timeout)
|
|
38
|
+
except Exception as e:
|
|
39
|
+
if "net::ERR_ABORTED" not in str(e):
|
|
40
|
+
raise
|
|
41
|
+
|
|
42
|
+
download = info.value
|
|
43
|
+
download.save_as(p)
|
|
44
|
+
|
|
45
|
+
context.close()
|
|
37
46
|
browser.close()
|
|
47
|
+
|
|
38
48
|
return p
|
|
49
|
+
|
|
39
50
|
except Exception as e:
|
|
40
51
|
last = e
|
|
41
52
|
if attempt < retries - 1:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|