tablassert 7.3.3__tar.gz → 7.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablassert-7.3.3 → tablassert-7.3.5}/CHANGELOG.md +17 -2
- {tablassert-7.3.3 → tablassert-7.3.5}/PKG-INFO +9 -7
- {tablassert-7.3.3 → tablassert-7.3.5}/README.md +8 -6
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/api/fullmap.md +6 -6
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/api/lib.md +9 -9
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/api/qc.md +3 -3
- tablassert-7.3.5/docs/changelog.md +13 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/configuration/advanced-example.md +10 -4
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/configuration/graph.md +1 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/configuration/table.md +70 -58
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/datassert.md +7 -7
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/docker.md +2 -2
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/examples.md +11 -7
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/tutorial.md +1 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/mkdocs.yml +1 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/pyproject.toml +1 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/downloader.py +15 -4
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/models.py +3 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/uv.lock +1 -1
- {tablassert-7.3.3 → tablassert-7.3.5}/.github/workflows/autotag.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/.github/workflows/docker.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/.github/workflows/docs.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/.gitignore +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/.pre-commit-config.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/AGENTS.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/CITATION.cff +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/CONTRIBUTING.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/Dockerfile +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/LICENSE +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/api/utils.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/cli.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/index.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/docs/installation.md +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/llms.txt +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/__init__.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/cli.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/enums.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/fullmap.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/ingests.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/lib.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/log.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/nlp.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/qc.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/src/tablassert/utils.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/__init__.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/conftest.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/fixtures/minimal_section.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_enums.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_fullmap.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_ingests.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_lib.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_models.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_nlp.py +0 -0
- {tablassert-7.3.3 → tablassert-7.3.5}/tests/test_utils.py +0 -0
|
@@ -2,13 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## 7.3.5 - 2026-04-29
|
|
6
|
+
|
|
7
|
+
### Documentation
|
|
8
|
+
- Tightened the table-configuration reference so field requirements, defaults, accepted enum values, row indexing, and column-reference examples match the strict `Section` schema and section-merging behavior implemented in `models.py`, `ingests.py`, and the runtime loader.
|
|
9
|
+
|
|
10
|
+
## 7.3.4 - 2026-04-28
|
|
11
|
+
|
|
12
|
+
### Bug Fixes
|
|
13
|
+
- Fixed `downloader.from_url()` failing on URLs that trigger an immediate download. The Playwright session now opens a browser context with `accept_downloads=True`, wraps `page.goto()` inside `page.expect_download()`, and tolerates the expected `net::ERR_ABORTED` navigation error that fires when the response is a download rather than a page.
|
|
14
|
+
|
|
15
|
+
### Documentation
|
|
16
|
+
- Documented `miscellaneous notes` as a freetext catch-all annotation in the table configuration and advanced-example pages — used for assay caveats, non-standard units, and qualitative observations that don't map cleanly to a structured field. Supports both `method: value` (constant) and `method: column` (per-row).
|
|
17
|
+
- Documented Polars regex constraints for the `regex` and `remove` transforms: patterns are passed to Polars `str.replace_all()` (Rust `regex` crate), so capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported and will raise at parse time. Chain simple substitutions instead, or capture residual context in a `miscellaneous notes` annotation.
|
|
18
|
+
|
|
5
19
|
## 7.3.3 - 2026-04-08
|
|
6
20
|
|
|
7
21
|
### Bug Fixes
|
|
8
|
-
- Changed datassert shard count
|
|
22
|
+
- Changed datassert shard count to 10 (`SHARDS` constant in `fullmap.py`) to correspond to the current datassert database layout.
|
|
9
23
|
|
|
10
24
|
### Documentation
|
|
11
|
-
- Updated
|
|
25
|
+
- Updated shard count references across documentation and examples to reflect the current 10-shard datassert layout.
|
|
26
|
+
- Corrected provenance examples so `repo` carries the namespace prefix and `publication` carries the repository-local identifier.
|
|
12
27
|
|
|
13
28
|
## 7.3.2 - 2026-04-03
|
|
14
29
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.3.
|
|
3
|
+
Version: 7.3.5
|
|
4
4
|
Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
|
|
5
5
|
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
6
|
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
@@ -98,14 +98,16 @@ docker run --rm \
|
|
|
98
98
|
```bash
|
|
99
99
|
# Build a knowledge graph from a YAML configuration
|
|
100
100
|
$ tablassert build-knowledge-graph graph-config.yaml
|
|
101
|
-
⠋ Loading
|
|
102
|
-
⠋
|
|
103
|
-
⠋
|
|
104
|
-
⠋
|
|
105
|
-
|
|
101
|
+
⠋ Loading Tables...
|
|
102
|
+
⠋ Extracting Sections...
|
|
103
|
+
⠋ Building TCode...
|
|
104
|
+
⠋ Collecting Instructions...
|
|
105
|
+
⠋ Building Subgraphs...
|
|
106
|
+
⠋ Compiling Graph...
|
|
107
|
+
✓ Finished!
|
|
106
108
|
```
|
|
107
109
|
|
|
108
|
-
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
110
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
|
|
109
111
|
|
|
110
112
|
## Key Features
|
|
111
113
|
|
|
@@ -46,14 +46,16 @@ docker run --rm \
|
|
|
46
46
|
```bash
|
|
47
47
|
# Build a knowledge graph from a YAML configuration
|
|
48
48
|
$ tablassert build-knowledge-graph graph-config.yaml
|
|
49
|
-
⠋ Loading
|
|
50
|
-
⠋
|
|
51
|
-
⠋
|
|
52
|
-
⠋
|
|
53
|
-
|
|
49
|
+
⠋ Loading Tables...
|
|
50
|
+
⠋ Extracting Sections...
|
|
51
|
+
⠋ Building TCode...
|
|
52
|
+
⠋ Collecting Instructions...
|
|
53
|
+
⠋ Building Subgraphs...
|
|
54
|
+
⠋ Compiling Graph...
|
|
55
|
+
✓ Finished!
|
|
54
56
|
```
|
|
55
57
|
|
|
56
|
-
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
|
|
58
|
+
Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
|
|
57
59
|
|
|
58
60
|
## Key Features
|
|
59
61
|
|
|
@@ -36,7 +36,7 @@ Column name containing text strings to resolve.
|
|
|
36
36
|
|
|
37
37
|
**`conns: list[object]`**
|
|
38
38
|
|
|
39
|
-
List of
|
|
39
|
+
List of 10 DuckDB shard connections to the datassert database.
|
|
40
40
|
|
|
41
41
|
Each shard contains:
|
|
42
42
|
- Synonym mappings (text → CURIE)
|
|
@@ -97,7 +97,7 @@ Returns a Polars LazyFrame with these columns added:
|
|
|
97
97
|
| `{col} taxon` | NCBI Taxon ID | `"NCBITaxon:9606"` |
|
|
98
98
|
| `{col} source` | Source database | `"HGNC"` |
|
|
99
99
|
| `{col} source version` | Database version | `"2025-01"` |
|
|
100
|
-
| `{col} nlp level` | NLP processing level | `
|
|
100
|
+
| `{col} nlp level` | NLP processing level | `1` or `2` |
|
|
101
101
|
|
|
102
102
|
### DuckDB Query
|
|
103
103
|
|
|
@@ -125,11 +125,11 @@ from tablassert.enums import Categories
|
|
|
125
125
|
import duckdb
|
|
126
126
|
import polars as pl
|
|
127
127
|
|
|
128
|
-
# Open all
|
|
128
|
+
# Open all 10 shard connections
|
|
129
129
|
datassert_dir = "/path/to/datassert"
|
|
130
130
|
conns = [
|
|
131
131
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
132
|
-
for i in range(
|
|
132
|
+
for i in range(10)
|
|
133
133
|
]
|
|
134
134
|
|
|
135
135
|
# LazyFrame with data to resolve
|
|
@@ -167,11 +167,11 @@ from tablassert.fullmap import resolve
|
|
|
167
167
|
from tablassert.nlp import level_one, level_two
|
|
168
168
|
from tablassert.enums import Categories
|
|
169
169
|
|
|
170
|
-
# Open all
|
|
170
|
+
# Open all 10 shard connections
|
|
171
171
|
datassert_dir = "/path/to/datassert"
|
|
172
172
|
conns = [
|
|
173
173
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
174
|
-
for i in range(
|
|
174
|
+
for i in range(10)
|
|
175
175
|
]
|
|
176
176
|
|
|
177
177
|
# Map a list of gene symbols to CURIEs
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
The `lib` module exposes `resolve_many()`, a high-level convenience function for resolving an iterable of entity strings to CURIEs without requiring manual LazyFrame construction, NLP preprocessing, or DuckDB shard management.
|
|
4
4
|
|
|
5
|
-
It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all
|
|
5
|
+
It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 10 DuckDB shard connections, executing entity resolution, and returning results as a plain Python list of row dictionaries.
|
|
6
6
|
|
|
7
7
|
## resolve_many()
|
|
8
8
|
|
|
9
|
-
Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a
|
|
9
|
+
Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a list of row dictionaries.
|
|
10
10
|
|
|
11
11
|
### Function Signature
|
|
12
12
|
|
|
@@ -26,9 +26,9 @@ def resolve_many(
|
|
|
26
26
|
|
|
27
27
|
**`col: str`**
|
|
28
28
|
|
|
29
|
-
Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in
|
|
29
|
+
Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in each returned row dictionary.
|
|
30
30
|
|
|
31
|
-
For example, if `col="gene"`,
|
|
31
|
+
For example, if `col="gene"`, each returned row dictionary will contain keys like `"gene"`, `"gene name"`, `"gene category"`, etc.
|
|
32
32
|
|
|
33
33
|
**`entities: Iterable[str]`**
|
|
34
34
|
|
|
@@ -38,7 +38,7 @@ Examples: `["TP53", "BRCA1", "EGFR"]`, `("aspirin", "ibuprofen")`, or a generato
|
|
|
38
38
|
|
|
39
39
|
**`datassert: Path`**
|
|
40
40
|
|
|
41
|
-
Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing
|
|
41
|
+
Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 10 DuckDB shard files (`0.duckdb` through `9.duckdb`).
|
|
42
42
|
|
|
43
43
|
Each shard contains:
|
|
44
44
|
- Synonym mappings (text → CURIE)
|
|
@@ -86,7 +86,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
|
|
|
86
86
|
| `{col} taxon` | NCBI Taxon ID (prefixed) | `"NCBITaxon:9606"` |
|
|
87
87
|
| `{col} source` | Source database | `"HGNC"` |
|
|
88
88
|
| `{col} source version` | Database version | `"2025-01"` |
|
|
89
|
-
| `{col} nlp level` | NLP processing level used for match | `
|
|
89
|
+
| `{col} nlp level` | NLP processing level used for match | `1` or `2` |
|
|
90
90
|
|
|
91
91
|
**Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned list may therefore be shorter than the input iterable.
|
|
92
92
|
|
|
@@ -98,7 +98,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
|
|
|
98
98
|
|
|
99
99
|
2. **NLP normalization** — Applies `level_one()` (whitespace stripping + lowercasing) and `level_two()` (non-word character removal via `\W+`) to produce the two normalized columns required by `resolve()`.
|
|
100
100
|
|
|
101
|
-
3. **DuckDB connection management** — Opens all
|
|
101
|
+
3. **DuckDB connection management** — Opens all 10 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
|
|
102
102
|
|
|
103
103
|
4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
|
|
104
104
|
|
|
@@ -224,8 +224,8 @@ Both levels are queried during resolution. Level one (exact case-insensitive mat
|
|
|
224
224
|
### Error Handling
|
|
225
225
|
|
|
226
226
|
- If the `datassert` path does not contain the expected shard files, `duckdb.connect()` will raise an `IOException`.
|
|
227
|
-
- If `entities` is empty, the function returns
|
|
228
|
-
- The `ExitStack` ensures all
|
|
227
|
+
- If `entities` is empty, the function returns `[]`.
|
|
228
|
+
- The `ExitStack` ensures all 10 DuckDB connections are closed even if resolution raises an exception.
|
|
229
229
|
- Unresolved entities are silently filtered from the output (logged at INFO level by default via `resolve()`).
|
|
230
230
|
|
|
231
231
|
## Integration
|
|
@@ -82,7 +82,7 @@ Two fuzzy matching algorithms:
|
|
|
82
82
|
1. **Ratio:** Overall string similarity
|
|
83
83
|
2. **Partial token sort ratio:** Combined token/subsequence matching
|
|
84
84
|
|
|
85
|
-
**Threshold:**
|
|
85
|
+
**Threshold:** 20% similarity
|
|
86
86
|
|
|
87
87
|
```python
|
|
88
88
|
fuzz.ratio(original, preferred) >= 20
|
|
@@ -125,7 +125,7 @@ return similarity >= 0.2
|
|
|
125
125
|
- Graph optimization level: ALL
|
|
126
126
|
- ONNX session caching
|
|
127
127
|
|
|
128
|
-
Lazy-loaded on first `
|
|
128
|
+
Lazy-loaded on first `fullmap_audit()` call that reaches the embedding stage, then reused for subsequent calls.
|
|
129
129
|
|
|
130
130
|
### Model Caching
|
|
131
131
|
|
|
@@ -135,7 +135,7 @@ BioBERT is lazy-loaded on first use and cached globally for the lifetime of the
|
|
|
135
135
|
# ? Lazy-loads BioBERT once on first batch audit call, then caches globally
|
|
136
136
|
```
|
|
137
137
|
|
|
138
|
-
**Cache location:**
|
|
138
|
+
**Cache location:** Downloaded model files are cached on disk in `.onnxassert/`, and the loaded model object is cached in memory for the lifetime of the process.
|
|
139
139
|
|
|
140
140
|
**Cache strategy:** BioBERT model loaded once on first batch audit, then reused globally
|
|
141
141
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
The canonical release history lives in the repository root at [`CHANGELOG.md`](https://github.com/SkyeAv/Tablassert/blob/main/CHANGELOG.md).
|
|
4
|
+
|
|
5
|
+
## Current Release Notes
|
|
6
|
+
|
|
7
|
+
## 7.3.5 - 2026-04-29
|
|
8
|
+
|
|
9
|
+
### Documentation
|
|
10
|
+
|
|
11
|
+
- The table-configuration reference now matches the strict runtime schema and merge behavior, including field defaults, requiredness, accepted enum values, zero-based row indexing, and valid column-reference examples.
|
|
12
|
+
|
|
13
|
+
For older releases and the full project history, open the root `CHANGELOG.md` in the repository.
|
|
@@ -66,7 +66,7 @@ template:
|
|
|
66
66
|
# Provenance: Publication and curation info
|
|
67
67
|
provenance:
|
|
68
68
|
repo: PMC
|
|
69
|
-
publication:
|
|
69
|
+
publication: 11708054
|
|
70
70
|
contributors:
|
|
71
71
|
- kind: curation
|
|
72
72
|
name: Skye Lane Goetz
|
|
@@ -103,12 +103,16 @@ template:
|
|
|
103
103
|
method: value
|
|
104
104
|
encoding: Spearman correlation
|
|
105
105
|
|
|
106
|
-
#
|
|
106
|
+
# Freetext catch-all — anything that doesn't map cleanly to a structured
|
|
107
|
+
# annotation (study design caveats, non-standard units, qualitative
|
|
108
|
+
# observations) belongs here rather than being dropped.
|
|
107
109
|
- annotation: miscellaneous notes
|
|
108
110
|
method: value
|
|
109
111
|
encoding: Correlation analysis between microbial composition and 13C-tamoxifen abundance after FDR correction
|
|
110
112
|
```
|
|
111
113
|
|
|
114
|
+
> **`miscellaneous notes` is a freetext escape hatch.** Use it whenever the source carries context you can't otherwise cleanly encode — assay variants, post-hoc qualifiers, "values are log-transformed", etc. It accepts `method: value` for a constant note across the whole table or `method: column` to pull per-row notes from the source.
|
|
115
|
+
|
|
112
116
|
## Key Techniques
|
|
113
117
|
|
|
114
118
|
### Excel Column References
|
|
@@ -143,6 +147,8 @@ The subject field uses three regex transformations in sequence:
|
|
|
143
147
|
```
|
|
144
148
|
`"Lactobacillus sp"` → `"Lactobacillus sp. "`
|
|
145
149
|
|
|
150
|
+
> **Regex constraint:** Each `pattern` is handed to Polars `str.replace_all()` (Rust `regex` crate). **Capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not allowed** and will fail validation. Express transformations as a sequence of simple anchored / character-class substitutions instead — the pipeline above is a deliberate three-step chain because no single capturing-group pattern is permitted. If the transformation can't be expressed without those features, capture the leftover context in a `miscellaneous notes` annotation rather than fighting the regex engine.
|
|
151
|
+
|
|
146
152
|
### Taxonomic Filtering
|
|
147
153
|
|
|
148
154
|
Prevent incorrect entity resolution:
|
|
@@ -297,7 +303,7 @@ template:
|
|
|
297
303
|
|
|
298
304
|
provenance:
|
|
299
305
|
repo: PMC
|
|
300
|
-
publication:
|
|
306
|
+
publication: 12345678
|
|
301
307
|
contributors:
|
|
302
308
|
- kind: curation
|
|
303
309
|
name: Skye Lane Goetz
|
|
@@ -358,7 +364,7 @@ template:
|
|
|
358
364
|
|
|
359
365
|
provenance:
|
|
360
366
|
repo: PMC
|
|
361
|
-
publication:
|
|
367
|
+
publication: 87654321
|
|
362
368
|
contributors:
|
|
363
369
|
- kind: curation
|
|
364
370
|
name: Skye Lane Goetz
|
|
@@ -60,7 +60,7 @@ See [Table Configuration](table.md) for details.
|
|
|
60
60
|
|
|
61
61
|
**`datassert: path`**
|
|
62
62
|
|
|
63
|
-
Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens
|
|
63
|
+
Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens 10 shard files from `datassert/data/{0..9}.duckdb`. This database contains:
|
|
64
64
|
- Synonym mappings (text → CURIE)
|
|
65
65
|
- Biolink categories
|
|
66
66
|
- Taxonomic information
|
|
@@ -39,14 +39,14 @@ template:
|
|
|
39
39
|
|
|
40
40
|
sections:
|
|
41
41
|
- statement: # Section 1: Gene-Disease
|
|
42
|
-
subject: {encoding:
|
|
42
|
+
subject: {method: column, encoding: A}
|
|
43
43
|
predicate: associated_with
|
|
44
|
-
object: {encoding:
|
|
44
|
+
object: {method: column, encoding: B}
|
|
45
45
|
|
|
46
46
|
- statement: # Section 2: Gene-Pathway
|
|
47
|
-
subject: {encoding:
|
|
47
|
+
subject: {method: column, encoding: A}
|
|
48
48
|
predicate: participates_in
|
|
49
|
-
object: {encoding:
|
|
49
|
+
object: {method: column, encoding: C}
|
|
50
50
|
```
|
|
51
51
|
|
|
52
52
|
### Merge Behavior (fastmerge)
|
|
@@ -92,15 +92,15 @@ sections:
|
|
|
92
92
|
**Single output:** Template only
|
|
93
93
|
```yaml
|
|
94
94
|
template:
|
|
95
|
-
source: {kind: text, local: data.csv}
|
|
95
|
+
source: {kind: text, local: data.csv, url: https://example.com/data.csv}
|
|
96
96
|
statement: {...}
|
|
97
97
|
```
|
|
98
98
|
|
|
99
99
|
**Multiple predicates, same source:**
|
|
100
100
|
```yaml
|
|
101
101
|
template:
|
|
102
|
-
source: {kind: excel, local: data.xlsx}
|
|
103
|
-
provenance: {publication:
|
|
102
|
+
source: {kind: excel, local: data.xlsx, url: https://example.com/data.xlsx}
|
|
103
|
+
provenance: {repo: PMC, publication: 123, contributors: [{name: Example User, date: 27 JAN 2026}]}
|
|
104
104
|
|
|
105
105
|
sections:
|
|
106
106
|
- statement: {predicate: treats}
|
|
@@ -110,14 +110,14 @@ sections:
|
|
|
110
110
|
**Multiple columns, shared provenance:**
|
|
111
111
|
```yaml
|
|
112
112
|
template:
|
|
113
|
-
source: {kind: text, local: data.csv}
|
|
114
|
-
provenance: {publication:
|
|
113
|
+
source: {kind: text, local: data.csv, url: https://example.com/data.csv}
|
|
114
|
+
provenance: {repo: PMID, publication: 456, contributors: [{name: Example User, date: 27 JAN 2026}]}
|
|
115
115
|
statement:
|
|
116
|
-
subject: {encoding:
|
|
116
|
+
subject: {method: column, encoding: A}
|
|
117
117
|
|
|
118
118
|
sections:
|
|
119
|
-
- statement: {object: {encoding:
|
|
120
|
-
- statement: {object: {encoding:
|
|
119
|
+
- statement: {object: {method: column, encoding: B}}
|
|
120
|
+
- statement: {object: {method: column, encoding: C}}
|
|
121
121
|
```
|
|
122
122
|
|
|
123
123
|
## Configuration Schema
|
|
@@ -126,8 +126,8 @@ sections:
|
|
|
126
126
|
|
|
127
127
|
| Field | Type | Required | Description |
|
|
128
128
|
|-------|------|----------|-------------|
|
|
129
|
-
| `syntax` | String |
|
|
130
|
-
| `status` | String | No | Development status
|
|
129
|
+
| `syntax` | String | No | Configuration version. Defaults to `"TC3"`. |
|
|
130
|
+
| `status` | String | No | Development status. Defaults to `"alpha"`; allowed values are `"alpha"`, `"beta"`, `"primetime"`. |
|
|
131
131
|
|
|
132
132
|
### Source
|
|
133
133
|
|
|
@@ -137,12 +137,12 @@ Defines the data file location and format.
|
|
|
137
137
|
|
|
138
138
|
| Field | Type | Required | Description |
|
|
139
139
|
|-------|------|----------|-------------|
|
|
140
|
-
| `kind` | String |
|
|
140
|
+
| `kind` | String | No | Source kind. Model default is `"excel"`, but specify it explicitly in configs. |
|
|
141
141
|
| `local` | Path | Yes | Local file path for caching |
|
|
142
142
|
| `url` | URL | Yes | Download URL (HTTP/HTTPS) |
|
|
143
|
-
| `sheet` | String | No | Sheet name
|
|
144
|
-
| `row_slice` | List[Int\|"auto"] | No |
|
|
145
|
-
| `rows` | List[Int] | No |
|
|
143
|
+
| `sheet` | String | No | Sheet name. Defaults to `"Sheet1"`. |
|
|
144
|
+
| `row_slice` | List[Int\|"auto"] | No | Two-value zero-based crop bounds: `[start, stop]`. Each value may be an integer or `"auto"`. |
|
|
145
|
+
| `rows` | List[Int] | No | Zero-based row indices to keep after any `row_slice` crop. |
|
|
146
146
|
| `reindex` | List[Reindex] | No | Conditional row filtering |
|
|
147
147
|
|
|
148
148
|
**Example:**
|
|
@@ -153,7 +153,7 @@ source:
|
|
|
153
153
|
url: https://example.com/data.xlsx
|
|
154
154
|
sheet: "Sheet1"
|
|
155
155
|
row_slice:
|
|
156
|
-
-
|
|
156
|
+
- 1 # Start at the second physical row
|
|
157
157
|
- auto # Read to end
|
|
158
158
|
```
|
|
159
159
|
|
|
@@ -161,12 +161,12 @@ source:
|
|
|
161
161
|
|
|
162
162
|
| Field | Type | Required | Description |
|
|
163
163
|
|-------|------|----------|-------------|
|
|
164
|
-
| `kind` | String |
|
|
164
|
+
| `kind` | String | No | Source kind. Model default is `"text"`, but specify it explicitly in configs. |
|
|
165
165
|
| `local` | Path | Yes | Local file path for caching |
|
|
166
166
|
| `url` | URL | Yes | Download URL |
|
|
167
|
-
| `delimiter` | String | No |
|
|
168
|
-
| `row_slice` | List[Int\|"auto"] | No |
|
|
169
|
-
| `rows` | List[Int] | No |
|
|
167
|
+
| `delimiter` | String | No | Field delimiter. Defaults to `","`. |
|
|
168
|
+
| `row_slice` | List[Int\|"auto"] | No | Two-value zero-based crop bounds: `[start, stop]`. Each value may be an integer or `"auto"`. |
|
|
169
|
+
| `rows` | List[Int] | No | Zero-based row indices to keep after any `row_slice` crop. |
|
|
170
170
|
| `reindex` | List[Reindex] | No | Conditional filtering |
|
|
171
171
|
|
|
172
172
|
**Example:**
|
|
@@ -187,16 +187,16 @@ Filter rows based on column values.
|
|
|
187
187
|
|
|
188
188
|
| Field | Type | Description |
|
|
189
189
|
|-------|------|-------------|
|
|
190
|
-
| `column` | String |
|
|
191
|
-
| `comparison` | String | Operator
|
|
190
|
+
| `column` | String | Source column letters to evaluate (`A`-`ZZZ`) |
|
|
191
|
+
| `comparison` | String | Operator. Defaults to `"ne"`; allowed values are `"eq"`, `"ne"`, `"lt"`, `"le"`, `"gt"`, `"ge"`. |
|
|
192
192
|
| `comparator` | String\|Int\|Float | Value to compare against |
|
|
193
193
|
|
|
194
194
|
**Example:**
|
|
195
195
|
```yaml
|
|
196
196
|
reindex:
|
|
197
|
-
- column:
|
|
197
|
+
- column: C
|
|
198
198
|
comparison: lt
|
|
199
|
-
comparator: 0.05 # Keep rows where
|
|
199
|
+
comparator: 0.05 # Keep rows where column C < 0.05
|
|
200
200
|
```
|
|
201
201
|
|
|
202
202
|
### Statement (Triple Definition)
|
|
@@ -206,7 +206,7 @@ Defines subject-predicate-object relationships.
|
|
|
206
206
|
| Field | Type | Required | Description |
|
|
207
207
|
|-------|------|----------|-------------|
|
|
208
208
|
| `subject` | NodeEncoding | Yes | Subject entity configuration |
|
|
209
|
-
| `predicate` | String |
|
|
209
|
+
| `predicate` | String | No | Biolink predicate. Defaults to `"related_to"`. |
|
|
210
210
|
| `object` | NodeEncoding | Yes | Object entity configuration |
|
|
211
211
|
| `qualifiers` | List[Qualifier] | No | Edge qualifiers (context) |
|
|
212
212
|
|
|
@@ -215,12 +215,12 @@ Defines subject-predicate-object relationships.
|
|
|
215
215
|
statement:
|
|
216
216
|
subject:
|
|
217
217
|
method: column
|
|
218
|
-
encoding:
|
|
218
|
+
encoding: A
|
|
219
219
|
prioritize: [Gene]
|
|
220
220
|
predicate: treats
|
|
221
221
|
object:
|
|
222
222
|
method: column
|
|
223
|
-
encoding:
|
|
223
|
+
encoding: B
|
|
224
224
|
prioritize: [Disease]
|
|
225
225
|
```
|
|
226
226
|
|
|
@@ -230,11 +230,11 @@ Defines how to extract and resolve entities.
|
|
|
230
230
|
|
|
231
231
|
| Field | Type | Required | Description |
|
|
232
232
|
|-------|------|----------|-------------|
|
|
233
|
-
| `method` | String |
|
|
234
|
-
| `encoding` | String\|Int\|Float | Yes | Literal value or column
|
|
233
|
+
| `method` | String | No | `"value"` (literal) or `"column"` (source column letters). Defaults to `"value"`. |
|
|
234
|
+
| `encoding` | String\|Int\|Float | Yes | Literal value or source column letters, depending on `method` |
|
|
235
235
|
| `taxon` | Int | No | NCBI Taxon ID for filtering (e.g., `9606` for human) |
|
|
236
|
-
| `prioritize` | List[String] | No | Preferred Biolink categories |
|
|
237
|
-
| `avoid` | List[String] | No | Excluded Biolink categories |
|
|
236
|
+
| `prioritize` | List[String] | No | Preferred Biolink categories (must be valid `Categories` enum values such as `Gene`, `Protein`) |
|
|
237
|
+
| `avoid` | List[String] | No | Excluded Biolink categories (must be valid `Categories` enum values) |
|
|
238
238
|
| `regex` | List[Regex] | No | Pattern replacements |
|
|
239
239
|
| `fill` | String | No | Null-filling strategy: `"forward"`, `"backward"`, `"min"`, `"max"`, `"mean"`, `"zero"`, `"one"` |
|
|
240
240
|
| `remove` | List[String] | No | Strings to filter out |
|
|
@@ -253,11 +253,12 @@ subject:
|
|
|
253
253
|
encoding: CHEBI:41774 # All rows get this CURIE
|
|
254
254
|
```
|
|
255
255
|
|
|
256
|
-
**`method: column`** - Reference a column
|
|
256
|
+
**`method: column`** - Reference a source column
|
|
257
257
|
|
|
258
|
-
|
|
259
|
-
- Column A
|
|
260
|
-
- Column B
|
|
258
|
+
Source files are read without headers, so column references are Excel-style letters:
|
|
259
|
+
- Column A -> `"A"`
|
|
260
|
+
- Column B -> `"B"`
|
|
261
|
+
- Column AA -> `"AA"`
|
|
261
262
|
|
|
262
263
|
```yaml
|
|
263
264
|
subject:
|
|
@@ -265,12 +266,7 @@ subject:
|
|
|
265
266
|
encoding: A # Read from column A
|
|
266
267
|
```
|
|
267
268
|
|
|
268
|
-
|
|
269
|
-
```yaml
|
|
270
|
-
subject:
|
|
271
|
-
method: column
|
|
272
|
-
encoding: gene_symbol # Read from "gene_symbol" column
|
|
273
|
-
```
|
|
269
|
+
At runtime those letters are converted internally to Polars column names such as `column_1`, but those internal names are not valid configuration values.
|
|
274
270
|
|
|
275
271
|
#### Taxonomic Filtering
|
|
276
272
|
|
|
@@ -278,7 +274,8 @@ subject:
|
|
|
278
274
|
|
|
279
275
|
```yaml
|
|
280
276
|
subject:
|
|
281
|
-
|
|
277
|
+
method: column
|
|
278
|
+
encoding: A
|
|
282
279
|
taxon: 9606 # Only human genes (Homo sapiens)
|
|
283
280
|
```
|
|
284
281
|
|
|
@@ -305,7 +302,8 @@ If "TP53" maps to both Gene and Protein, prefer Gene.
|
|
|
305
302
|
|
|
306
303
|
```yaml
|
|
307
304
|
subject:
|
|
308
|
-
|
|
305
|
+
method: column
|
|
306
|
+
encoding: A
|
|
309
307
|
prioritize:
|
|
310
308
|
- OrganismTaxon
|
|
311
309
|
avoid:
|
|
@@ -330,6 +328,8 @@ subject:
|
|
|
330
328
|
|
|
331
329
|
Executed in order.
|
|
332
330
|
|
|
331
|
+
> **Regex dialect:** Patterns are passed directly to Polars `str.replace_all()`, which uses the Rust [`regex`](https://docs.rs/regex/) crate. Only features supported by that engine work — in particular, **backreferences (`\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported** and will raise an error at parse time. Stick to character classes, anchors (`^`, `$`), quantifiers, alternation (`a|b`), and groups (`(...)`, or non-capturing `(?:...)`) if grouping is needed. If a transformation is too complex to express, prefer chaining several simple substitutions or capturing the residual context in a `miscellaneous notes` annotation instead.
|
|
332
|
+
|
|
333
333
|
**`remove: list[string]`** - Filter out specific strings
|
|
334
334
|
|
|
335
335
|
```yaml
|
|
@@ -339,6 +339,8 @@ subject:
|
|
|
339
339
|
- "^NA " # Remove rows starting with "NA "
|
|
340
340
|
```
|
|
341
341
|
|
|
342
|
+
Same regex constraints apply as the `regex` field — Polars-compatible patterns only, no backreferences or lookarounds.
|
|
343
|
+
|
|
342
344
|
**`prefix` / `suffix`** - Add text
|
|
343
345
|
|
|
344
346
|
```yaml
|
|
@@ -362,7 +364,8 @@ Available strategies:
|
|
|
362
364
|
|
|
363
365
|
```yaml
|
|
364
366
|
subject:
|
|
365
|
-
|
|
367
|
+
method: column
|
|
368
|
+
encoding: A
|
|
366
369
|
fill: forward # Propagate values down through null rows
|
|
367
370
|
```
|
|
368
371
|
|
|
@@ -370,7 +373,7 @@ subject:
|
|
|
370
373
|
annotations:
|
|
371
374
|
- annotation: expression_level
|
|
372
375
|
method: column
|
|
373
|
-
encoding:
|
|
376
|
+
encoding: C
|
|
374
377
|
fill: mean # Replace nulls with column average
|
|
375
378
|
```
|
|
376
379
|
|
|
@@ -380,7 +383,8 @@ annotations:
|
|
|
380
383
|
|
|
381
384
|
```yaml
|
|
382
385
|
object:
|
|
383
|
-
|
|
386
|
+
method: column
|
|
387
|
+
encoding: B
|
|
384
388
|
explode_by: ";" # "P1;P2;P3" → 3 separate edges
|
|
385
389
|
```
|
|
386
390
|
|
|
@@ -398,7 +402,7 @@ Add context to edges (anatomical location, species, etc.).
|
|
|
398
402
|
|
|
399
403
|
| Field | Type | Description |
|
|
400
404
|
|-------|------|-------------|
|
|
401
|
-
| `qualifier` | String | Biolink qualifier (e.g., `"species_context_qualifier"`) |
|
|
405
|
+
| `qualifier` | String | Biolink qualifier from the `Qualifiers` enum (e.g., `"species_context_qualifier"`) |
|
|
402
406
|
| (inherits NodeEncoding) | | All NodeEncoding fields available |
|
|
403
407
|
|
|
404
408
|
**Example:**
|
|
@@ -415,15 +419,15 @@ Required metadata about data source.
|
|
|
415
419
|
|
|
416
420
|
| Field | Type | Required | Description |
|
|
417
421
|
|-------|------|----------|-------------|
|
|
418
|
-
| `repo` | String |
|
|
419
|
-
| `publication` | String | Yes |
|
|
422
|
+
| `repo` | String | No | Repository. Defaults to `"PMC"`; allowed values are `"PMC"`, `"PMID"`. |
|
|
423
|
+
| `publication` | String | Yes | Repository-local identifier appended to `repo:` (e.g., `"11708054"`, `"123"`) |
|
|
420
424
|
| `contributors` | List[Contributor] | Yes | Curation information |
|
|
421
425
|
|
|
422
426
|
**Contributor fields:**
|
|
423
427
|
|
|
424
428
|
| Field | Type | Required | Description |
|
|
425
429
|
|-------|------|----------|-------------|
|
|
426
|
-
| `kind` | String |
|
|
430
|
+
| `kind` | String | No | Contributor role. Defaults to `"curation"`; allowed values are `"curation"`, `"validation"`, `"tool"`. |
|
|
427
431
|
| `name` | String | Yes | Contributor name |
|
|
428
432
|
| `date` | String | Yes | Date (free format) |
|
|
429
433
|
| `organizations` | List[String] | No | Affiliations |
|
|
@@ -433,7 +437,7 @@ Required metadata about data source.
|
|
|
433
437
|
```yaml
|
|
434
438
|
provenance:
|
|
435
439
|
repo: PMC
|
|
436
|
-
publication:
|
|
440
|
+
publication: 11708054
|
|
437
441
|
contributors:
|
|
438
442
|
- kind: curation
|
|
439
443
|
name: Skye Lane Goetz
|
|
@@ -467,8 +471,16 @@ annotations:
|
|
|
467
471
|
- annotation: multiple testing correction method
|
|
468
472
|
method: value
|
|
469
473
|
encoding: "Benjamini Hochberg"
|
|
474
|
+
|
|
475
|
+
# Freetext catch-all for context that doesn't fit a structured field —
|
|
476
|
+
# study caveats, units, post-hoc notes, anything you'd otherwise lose.
|
|
477
|
+
- annotation: miscellaneous notes
|
|
478
|
+
method: value
|
|
479
|
+
encoding: "Values are log2 fold-change relative to vehicle control; n=3 biological replicates per arm"
|
|
470
480
|
```
|
|
471
481
|
|
|
482
|
+
> **Tip:** When source data carries information that can't be cleanly mapped to a structured annotation (assay-specific caveats, non-standard units, qualitative observations), add a `miscellaneous notes` annotation rather than forcing it into another field or dropping it. It accepts both `method: value` (one note for the whole table) and `method: column` (per-row notes from the source).
|
|
483
|
+
|
|
472
484
|
## Complete Example
|
|
473
485
|
|
|
474
486
|
Minimal table configuration:
|
|
@@ -488,17 +500,17 @@ template:
|
|
|
488
500
|
statement:
|
|
489
501
|
subject:
|
|
490
502
|
method: column
|
|
491
|
-
encoding:
|
|
503
|
+
encoding: A
|
|
492
504
|
prioritize: [Gene]
|
|
493
505
|
predicate: associated_with
|
|
494
506
|
object:
|
|
495
507
|
method: column
|
|
496
|
-
encoding:
|
|
508
|
+
encoding: B
|
|
497
509
|
prioritize: [Disease]
|
|
498
510
|
|
|
499
511
|
provenance:
|
|
500
512
|
repo: PMID
|
|
501
|
-
publication:
|
|
513
|
+
publication: 12345678
|
|
502
514
|
contributors:
|
|
503
515
|
- kind: curation
|
|
504
516
|
name: Example User
|
|
@@ -507,7 +519,7 @@ template:
|
|
|
507
519
|
annotations:
|
|
508
520
|
- annotation: p value
|
|
509
521
|
method: column
|
|
510
|
-
encoding:
|
|
522
|
+
encoding: C
|
|
511
523
|
```
|
|
512
524
|
|
|
513
525
|
## Next Steps
|
|
@@ -33,7 +33,7 @@ The build command automatically downloads BABEL exports from RENCI (`https://sta
|
|
|
33
33
|
1. **Download** — BABEL class and synonym files are downloaded from RENCI and split into LZ4-compressed NDJSON chunks under `./datassert/downloads/`.
|
|
34
34
|
2. **Lookup** — Class files (`*.ndjson.lz4`) are read to build an in-memory equivalent-identifier lookup.
|
|
35
35
|
3. **Parquet Staging** — Synonym files are processed with the lookup, quality-controlled, and written as sharded Parquet files to `./datassert/parquets/`.
|
|
36
|
-
4. **DuckDB Generation** — Parquet files are loaded into
|
|
36
|
+
4. **DuckDB Generation** — Parquet files are loaded into 10 sharded DuckDB databases under `./datassert/data/`.
|
|
37
37
|
|
|
38
38
|
### Examples
|
|
39
39
|
|
|
@@ -57,11 +57,11 @@ datassert build --use-existing-parquets
|
|
|
57
57
|
|
|
58
58
|
## Output Artifacts
|
|
59
59
|
|
|
60
|
-
-
|
|
60
|
+
- 10 sharded DuckDB databases are written to `./datassert/data/{0..9}.duckdb`.
|
|
61
61
|
- Each shard contains `SOURCES`, `CATEGORIES`, `CURIES`, and `SYNONYMS` tables, deduplicated, sorted, and indexed for query performance.
|
|
62
|
-
- Staging Parquet files are written to `./datassert/parquets/{0..
|
|
62
|
+
- Staging Parquet files are written to `./datassert/parquets/{0..9}/`.
|
|
63
63
|
|
|
64
|
-
Terms are routed to shards deterministically via `xxhash64(term) %
|
|
64
|
+
Terms are routed to shards deterministically via `xxhash64(term) % 10`, so a given string always hits the same shard.
|
|
65
65
|
|
|
66
66
|
### Schema
|
|
67
67
|
|
|
@@ -76,14 +76,14 @@ Each shard contains four tables:
|
|
|
76
76
|
|
|
77
77
|
## Usage in Graph Config
|
|
78
78
|
|
|
79
|
-
The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all
|
|
79
|
+
The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 10 shards at startup and passes the connections to `resolve()`.
|
|
80
80
|
|
|
81
81
|
```yaml
|
|
82
82
|
# graph-config.yaml (GC2)
|
|
83
83
|
syntax: GC2
|
|
84
84
|
name: my-graph
|
|
85
85
|
version: "1.0"
|
|
86
|
-
datassert: /path/to/datassert/ # directory containing data/0..
|
|
86
|
+
datassert: /path/to/datassert/ # directory containing data/0..9.duckdb
|
|
87
87
|
tables:
|
|
88
88
|
- ./TABLE/my-table.yaml
|
|
89
89
|
```
|
|
@@ -99,7 +99,7 @@ from tablassert.fullmap import resolve
|
|
|
99
99
|
datassert_dir = "/path/to/datassert"
|
|
100
100
|
conns = [
|
|
101
101
|
duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
|
|
102
|
-
for i in range(
|
|
102
|
+
for i in range(10)
|
|
103
103
|
]
|
|
104
104
|
```
|
|
105
105
|
|
|
@@ -81,8 +81,8 @@ docker run --rm \
|
|
|
81
81
|
|
|
82
82
|
- **Datassert path** — The graph configuration YAML specifies the `datassert` path for the entity-resolution database. Ensure it is accessible inside the container.
|
|
83
83
|
- **Multiprocessing** — `src/tablassert/cli.py:63` uses `multiprocessing.Pool` for parallel table loading and section extraction.
|
|
84
|
-
- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all
|
|
85
|
-
- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across
|
|
84
|
+
- **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 10 Datassert DuckDB shards concurrently.
|
|
85
|
+
- **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 10 DuckDB shards (`SHARDS = 10`) using xxhash64.
|
|
86
86
|
- **Text normalization** — `src/tablassert/nlp.py` provides `level_one` (strip + lowercase) and `level_two` (regex-based cleanup).
|
|
87
87
|
|
|
88
88
|
## CI/CD Integration
|
|
@@ -33,7 +33,7 @@ template:
|
|
|
33
33
|
- Disease
|
|
34
34
|
provenance:
|
|
35
35
|
repo: PMID
|
|
36
|
-
publication:
|
|
36
|
+
publication: 12345678
|
|
37
37
|
contributors:
|
|
38
38
|
- kind: curation
|
|
39
39
|
name: Your Name
|
|
@@ -85,7 +85,7 @@ template:
|
|
|
85
85
|
taxon: 9606
|
|
86
86
|
provenance:
|
|
87
87
|
repo: PMID
|
|
88
|
-
publication:
|
|
88
|
+
publication: 98765432
|
|
89
89
|
contributors:
|
|
90
90
|
- kind: curation
|
|
91
91
|
name: Your Name
|
|
@@ -146,7 +146,7 @@ template:
|
|
|
146
146
|
encoding: CHEBI:41774
|
|
147
147
|
provenance:
|
|
148
148
|
repo: PMC
|
|
149
|
-
publication:
|
|
149
|
+
publication: 11708054
|
|
150
150
|
contributors:
|
|
151
151
|
- kind: curation
|
|
152
152
|
name: Your Name
|
|
@@ -161,11 +161,15 @@ template:
|
|
|
161
161
|
- annotation: assertion method
|
|
162
162
|
method: value
|
|
163
163
|
encoding: "Spearman correlation"
|
|
164
|
+
# Freetext catch-all for context that doesn't fit a structured field.
|
|
165
|
+
- annotation: miscellaneous notes
|
|
166
|
+
method: value
|
|
167
|
+
encoding: "FDR-corrected; samples pooled across two cohorts"
|
|
164
168
|
```
|
|
165
169
|
|
|
166
170
|
**Key techniques:**
|
|
167
171
|
|
|
168
|
-
- **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`)
|
|
172
|
+
- **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`). Patterns must be Polars `str.replace_all()`-compatible — no backreferences (`\1`) and no lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`). Chain several simple substitutions instead.
|
|
169
173
|
- **Avoid list** (`avoid: [Gene]`) prevents organism names from resolving to gene entities
|
|
170
174
|
- **Fixed-value object** (`method: value`) assigns the same metabolite CURIE to all rows
|
|
171
175
|
- **Excel source** with sheet name and row slicing
|
|
@@ -199,7 +203,7 @@ template:
|
|
|
199
203
|
encoding: PLACEHOLDER
|
|
200
204
|
provenance:
|
|
201
205
|
repo: PMID
|
|
202
|
-
publication:
|
|
206
|
+
publication: 11223344
|
|
203
207
|
contributors:
|
|
204
208
|
- kind: curation
|
|
205
209
|
name: Your Name
|
|
@@ -277,7 +281,7 @@ template:
|
|
|
277
281
|
- Disease
|
|
278
282
|
provenance:
|
|
279
283
|
repo: PMID
|
|
280
|
-
publication:
|
|
284
|
+
publication: 55667788
|
|
281
285
|
contributors:
|
|
282
286
|
- kind: curation
|
|
283
287
|
name: Your Name
|
|
@@ -330,7 +334,7 @@ template:
|
|
|
330
334
|
- ChemicalEntity
|
|
331
335
|
provenance:
|
|
332
336
|
repo: PMID
|
|
333
|
-
publication:
|
|
337
|
+
publication: 99887766
|
|
334
338
|
contributors:
|
|
335
339
|
- kind: curation
|
|
336
340
|
name: Your Name
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.3.
|
|
3
|
+
version = "7.3.5"
|
|
4
4
|
description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
@@ -29,13 +29,24 @@ def from_url(website: str, p: Path, timeout: int = 60_000, retries: int = 3) ->
|
|
|
29
29
|
try:
|
|
30
30
|
with sync_playwright() as pw:
|
|
31
31
|
browser = pw.chromium.launch(headless=True)
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
context = browser.new_context(accept_downloads=True)
|
|
33
|
+
|
|
34
|
+
page = context.new_page()
|
|
34
35
|
with page.expect_download(timeout=timeout) as info:
|
|
35
|
-
|
|
36
|
-
|
|
36
|
+
try:
|
|
37
|
+
page.goto(website, wait_until="load", timeout=timeout)
|
|
38
|
+
except Exception as e:
|
|
39
|
+
if "net::ERR_ABORTED" not in str(e):
|
|
40
|
+
raise
|
|
41
|
+
|
|
42
|
+
download = info.value
|
|
43
|
+
download.save_as(p)
|
|
44
|
+
|
|
45
|
+
context.close()
|
|
37
46
|
browser.close()
|
|
47
|
+
|
|
38
48
|
return p
|
|
49
|
+
|
|
39
50
|
except Exception as e:
|
|
40
51
|
last = e
|
|
41
52
|
if attempt < retries - 1:
|
|
@@ -33,7 +33,9 @@ class TablaBase(BaseModel):
|
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
class Reindex(TablaBase):
|
|
36
|
-
column: str = Field(
|
|
36
|
+
column: str = Field(
|
|
37
|
+
..., pattern=r"^[A-Z]{1,3}$", description="Source column letters used for row filtering.", examples=["A", "AA"]
|
|
38
|
+
)
|
|
37
39
|
comparison: Comparisons = Field(
|
|
38
40
|
Comparisons.NE,
|
|
39
41
|
description="Comparison operator used in reindex filtering.",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|