tablassert 7.3.3__tar.gz → 7.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {tablassert-7.3.3 → tablassert-7.3.4}/CHANGELOG.md +12 -2
  2. {tablassert-7.3.3 → tablassert-7.3.4}/PKG-INFO +4 -4
  3. {tablassert-7.3.3 → tablassert-7.3.4}/README.md +3 -3
  4. {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/fullmap.md +6 -6
  5. {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/lib.md +9 -9
  6. {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/qc.md +3 -3
  7. tablassert-7.3.4/docs/changelog.md +13 -0
  8. {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/advanced-example.md +10 -4
  9. {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/graph.md +1 -1
  10. {tablassert-7.3.3 → tablassert-7.3.4}/docs/configuration/table.md +17 -5
  11. {tablassert-7.3.3 → tablassert-7.3.4}/docs/datassert.md +7 -7
  12. {tablassert-7.3.3 → tablassert-7.3.4}/docs/docker.md +2 -2
  13. {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples.md +11 -7
  14. {tablassert-7.3.3 → tablassert-7.3.4}/docs/tutorial.md +1 -1
  15. {tablassert-7.3.3 → tablassert-7.3.4}/mkdocs.yml +1 -1
  16. {tablassert-7.3.3 → tablassert-7.3.4}/pyproject.toml +1 -1
  17. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/downloader.py +15 -4
  18. {tablassert-7.3.3 → tablassert-7.3.4}/uv.lock +1 -1
  19. {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/autotag.yml +0 -0
  20. {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/docker.yml +0 -0
  21. {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/docs.yml +0 -0
  22. {tablassert-7.3.3 → tablassert-7.3.4}/.github/workflows/pipy.yml +0 -0
  23. {tablassert-7.3.3 → tablassert-7.3.4}/.gitignore +0 -0
  24. {tablassert-7.3.3 → tablassert-7.3.4}/.pre-commit-config.yaml +0 -0
  25. {tablassert-7.3.3 → tablassert-7.3.4}/AGENTS.md +0 -0
  26. {tablassert-7.3.3 → tablassert-7.3.4}/CITATION.cff +0 -0
  27. {tablassert-7.3.3 → tablassert-7.3.4}/CONTRIBUTING.md +0 -0
  28. {tablassert-7.3.3 → tablassert-7.3.4}/Dockerfile +0 -0
  29. {tablassert-7.3.3 → tablassert-7.3.4}/LICENSE +0 -0
  30. {tablassert-7.3.3 → tablassert-7.3.4}/docs/api/utils.md +0 -0
  31. {tablassert-7.3.3 → tablassert-7.3.4}/docs/cli.md +0 -0
  32. {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-data.csv +0 -0
  33. {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-graph.yaml +0 -0
  34. {tablassert-7.3.3 → tablassert-7.3.4}/docs/examples/tutorial-table.yaml +0 -0
  35. {tablassert-7.3.3 → tablassert-7.3.4}/docs/index.md +0 -0
  36. {tablassert-7.3.3 → tablassert-7.3.4}/docs/installation.md +0 -0
  37. {tablassert-7.3.3 → tablassert-7.3.4}/llms.txt +0 -0
  38. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/__init__.py +0 -0
  39. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/cli.py +0 -0
  40. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/enums.py +0 -0
  41. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/fullmap.py +0 -0
  42. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/ingests.py +0 -0
  43. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/lib.py +0 -0
  44. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/log.py +0 -0
  45. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/models.py +0 -0
  46. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/nlp.py +0 -0
  47. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/qc.py +0 -0
  48. {tablassert-7.3.3 → tablassert-7.3.4}/src/tablassert/utils.py +0 -0
  49. {tablassert-7.3.3 → tablassert-7.3.4}/tests/__init__.py +0 -0
  50. {tablassert-7.3.3 → tablassert-7.3.4}/tests/conftest.py +0 -0
  51. {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
  52. {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/minimal_section.yaml +0 -0
  53. {tablassert-7.3.3 → tablassert-7.3.4}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
  54. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_enums.py +0 -0
  55. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_fullmap.py +0 -0
  56. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_ingests.py +0 -0
  57. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_lib.py +0 -0
  58. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_models.py +0 -0
  59. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_nlp.py +0 -0
  60. {tablassert-7.3.3 → tablassert-7.3.4}/tests/test_utils.py +0 -0
@@ -2,13 +2,23 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.3.4 - 2026-04-28
6
+
7
+ ### Bug Fixes
8
+ - Fixed `downloader.from_url()` failing on URLs that trigger an immediate download. The Playwright session now opens a browser context with `accept_downloads=True`, wraps `page.goto()` inside `page.expect_download()`, and tolerates the expected `net::ERR_ABORTED` navigation error that fires when the response is a download rather than a page.
9
+
10
+ ### Documentation
11
+ - Documented `miscellaneous notes` as a freetext catch-all annotation in the table configuration and advanced-example pages — used for assay caveats, non-standard units, and qualitative observations that don't map cleanly to a structured field. Supports both `method: value` (constant) and `method: column` (per-row).
12
+ - Documented Polars regex constraints for the `regex` and `remove` transforms: patterns are passed to Polars `str.replace_all()` (Rust `regex` crate), so capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported and will raise an error at parse time. Chain simple substitutions instead, or capture residual context in a `miscellaneous notes` annotation.
13
+
5
14
  ## 7.3.3 - 2026-04-08
6
15
 
7
16
  ### Bug Fixes
8
- - Changed datassert shard count from 16 to 12 (`SHARDS` constant in `fullmap.py`) to correspond to the updated datassert database layout.
17
+ - Changed datassert shard count to 10 (`SHARDS` constant in `fullmap.py`) to correspond to the current datassert database layout.
9
18
 
10
19
  ### Documentation
11
- - Updated all shard count references across documentation and examples to reflect the new 12-shard datassert layout.
20
+ - Updated shard count references across documentation and examples to reflect the current 10-shard datassert layout.
21
+ - Corrected provenance examples so `repo` carries the namespace prefix and `publication` carries the repository-local identifier.
12
22
 
13
23
  ## 7.3.2 - 2026-04-03
14
24
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.3.3
3
+ Version: 7.3.4
4
4
  Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
@@ -99,13 +99,13 @@ docker run --rm \
99
99
  # Build a knowledge graph from a YAML configuration
100
100
  $ tablassert build-knowledge-graph graph-config.yaml
101
101
  ⠋ Loading table configurations...
102
- ⠋ Resolving entities across 12 DuckDB shards...
102
+ ⠋ Resolving entities across 10 DuckDB shards...
103
103
  ⠋ Compiling subgraphs...
104
104
  ⠋ Deduplicating nodes and edges...
105
- Done — wrote nodes.ndjson and edges.ndjson to .storassert/
105
+ Finished — wrote MY_GRAPH_1.0.0.nodes.ndjson and MY_GRAPH_1.0.0.edges.ndjson
106
106
  ```
107
107
 
108
- Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
108
+ Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
109
109
 
110
110
  ## Key Features
111
111
 
@@ -47,13 +47,13 @@ docker run --rm \
47
47
  # Build a knowledge graph from a YAML configuration
48
48
  $ tablassert build-knowledge-graph graph-config.yaml
49
49
  ⠋ Loading table configurations...
50
- ⠋ Resolving entities across 12 DuckDB shards...
50
+ ⠋ Resolving entities across 10 DuckDB shards...
51
51
  ⠋ Compiling subgraphs...
52
52
  ⠋ Deduplicating nodes and edges...
53
- Done — wrote nodes.ndjson and edges.ndjson to .storassert/
53
+ Finished — wrote MY_GRAPH_1.0.0.nodes.ndjson and MY_GRAPH_1.0.0.edges.ndjson
54
54
  ```
55
55
 
56
- Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
56
+ Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required. Intermediate section artifacts are staged in `.storassert/` during the build.
57
57
 
58
58
  ## Key Features
59
59
 
@@ -36,7 +36,7 @@ Column name containing text strings to resolve.
36
36
 
37
37
  **`conns: list[object]`**
38
38
 
39
- List of 12 DuckDB shard connections to the datassert database.
39
+ List of 10 DuckDB shard connections to the datassert database.
40
40
 
41
41
  Each shard contains:
42
42
  - Synonym mappings (text → CURIE)
@@ -97,7 +97,7 @@ Returns a Polars LazyFrame with these columns added:
97
97
  | `{col} taxon` | NCBI Taxon ID | `"NCBITaxon:9606"` |
98
98
  | `{col} source` | Source database | `"HGNC"` |
99
99
  | `{col} source version` | Database version | `"2025-01"` |
100
- | `{col} nlp level` | NLP processing level | `0` or `1` |
100
+ | `{col} nlp level` | NLP processing level | `1` or `2` |
101
101
 
102
102
  ### DuckDB Query
103
103
 
@@ -125,11 +125,11 @@ from tablassert.enums import Categories
125
125
  import duckdb
126
126
  import polars as pl
127
127
 
128
- # Open all 12 shard connections
128
+ # Open all 10 shard connections
129
129
  datassert_dir = "/path/to/datassert"
130
130
  conns = [
131
131
  duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
132
- for i in range(12)
132
+ for i in range(10)
133
133
  ]
134
134
 
135
135
  # LazyFrame with data to resolve
@@ -167,11 +167,11 @@ from tablassert.fullmap import resolve
167
167
  from tablassert.nlp import level_one, level_two
168
168
  from tablassert.enums import Categories
169
169
 
170
- # Open all 12 shard connections
170
+ # Open all 10 shard connections
171
171
  datassert_dir = "/path/to/datassert"
172
172
  conns = [
173
173
  duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
174
- for i in range(12)
174
+ for i in range(10)
175
175
  ]
176
176
 
177
177
  # Map a list of gene symbols to CURIEs
@@ -2,11 +2,11 @@
2
2
 
3
3
  The `lib` module exposes `resolve_many()`, a high-level convenience function for resolving an iterable of entity strings to CURIEs without requiring manual LazyFrame construction, NLP preprocessing, or DuckDB shard management.
4
4
 
5
- It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 12 DuckDB shard connections, executing entity resolution, and returning results as a plain Python dictionary.
5
+ It wraps the lower-level [`resolve()`](fullmap.md) pipeline — applying `level_one` and `level_two` normalization, opening all 10 DuckDB shard connections, executing entity resolution, and returning results as a plain Python list of row dictionaries.
6
6
 
7
7
  ## resolve_many()
8
8
 
9
- Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a dictionary of lists.
9
+ Standalone batch entity resolution function. Accepts a column name, an iterable of text strings, and a path to the datassert database, then returns resolved CURIEs and metadata as a list of row dictionaries.
10
10
 
11
11
  ### Function Signature
12
12
 
@@ -26,9 +26,9 @@ def resolve_many(
26
26
 
27
27
  **`col: str`**
28
28
 
29
- Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in the returned dictionary.
29
+ Column name used internally to label the Polars Series and DataFrame columns during resolution. This name propagates through the NLP and resolution pipeline and determines the keys in each returned row dictionary.
30
30
 
31
- For example, if `col="gene"`, the returned dictionary will contain keys like `"gene"`, `"gene name"`, `"gene category"`, etc.
31
+ For example, if `col="gene"`, each returned row dictionary will contain keys like `"gene"`, `"gene name"`, `"gene category"`, etc.
32
32
 
33
33
  **`entities: Iterable[str]`**
34
34
 
@@ -38,7 +38,7 @@ Examples: `["TP53", "BRCA1", "EGFR"]`, `("aspirin", "ibuprofen")`, or a generato
38
38
 
39
39
  **`datassert: Path`**
40
40
 
41
- Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 12 DuckDB shard files (`0.duckdb` through `11.duckdb`).
41
+ Filesystem path to the root of the datassert database directory. The function expects a `data/` subdirectory containing 10 DuckDB shard files (`0.duckdb` through `9.duckdb`).
42
42
 
43
43
  Each shard contains:
44
44
  - Synonym mappings (text → CURIE)
@@ -86,7 +86,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
86
86
  | `{col} taxon` | NCBI Taxon ID (prefixed) | `"NCBITaxon:9606"` |
87
87
  | `{col} source` | Source database | `"HGNC"` |
88
88
  | `{col} source version` | Database version | `"2025-01"` |
89
- | `{col} nlp level` | NLP processing level used for match | `0` or `1` |
89
+ | `{col} nlp level` | NLP processing level used for match | `1` or `2` |
90
90
 
91
91
  **Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned list may therefore be shorter than the input iterable.
92
92
 
@@ -98,7 +98,7 @@ Each dictionary contains the following keys (where `{col}` is the value of the `
98
98
 
99
99
  2. **NLP normalization** — Applies `level_one()` (whitespace stripping + lowercasing) and `level_two()` (non-word character removal via `\W+`) to produce the two normalized columns required by `resolve()`.
100
100
 
101
- 3. **DuckDB connection management** — Opens all 12 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
101
+ 3. **DuckDB connection management** — Opens all 10 shard connections inside a `contextlib.ExitStack`, ensuring every connection is properly closed when resolution completes or if an error occurs.
102
102
 
103
103
  4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
104
104
 
@@ -224,8 +224,8 @@ Both levels are queried during resolution. Level one (exact case-insensitive mat
224
224
  ### Error Handling
225
225
 
226
226
  - If the `datassert` path does not contain the expected shard files, `duckdb.connect()` will raise an `IOException`.
227
- - If `entities` is empty, the function returns a dictionary with empty lists for all output columns.
228
- - The `ExitStack` ensures all 12 DuckDB connections are closed even if resolution raises an exception.
227
+ - If `entities` is empty, the function returns `[]`.
228
+ - The `ExitStack` ensures all 10 DuckDB connections are closed even if resolution raises an exception.
229
229
  - Unresolved entities are silently filtered from the output (logged at INFO level by default via `resolve()`).
230
230
 
231
231
  ## Integration
@@ -82,7 +82,7 @@ Two fuzzy matching algorithms:
82
82
  1. **Ratio:** Overall string similarity
83
83
  2. **Partial token sort ratio:** Combined token/subsequence matching
84
84
 
85
- **Threshold:** Default 20% similarity (configurable)
85
+ **Threshold:** 20% similarity
86
86
 
87
87
  ```python
88
88
  fuzz.ratio(original, preferred) >= 20
@@ -125,7 +125,7 @@ return similarity >= 0.2
125
125
  - Graph optimization level: ALL
126
126
  - ONNX session caching
127
127
 
128
- Lazy-loaded on first `BERT_audit()` call, then reused for subsequent calls.
128
+ Lazy-loaded on first `fullmap_audit()` call that reaches the embedding stage, then reused for subsequent calls.
129
129
 
130
130
  ### Model Caching
131
131
 
@@ -135,7 +135,7 @@ BioBERT is lazy-loaded on first use and cached globally for the lifetime of the
135
135
  # ? Lazy-loads BioBERT once on first batch audit call, then caches globally
136
136
  ```
137
137
 
138
- **Cache location:** In-memory (global model cache)
138
+ **Cache location:** Downloaded model files are cached on disk in `.onnxassert/`, and the loaded model object is cached in memory for the lifetime of the process.
139
139
 
140
140
  **Cache strategy:** BioBERT model loaded once on first batch audit, then reused globally
141
141
 
@@ -0,0 +1,13 @@
1
+ # Changelog
2
+
3
+ The canonical release history lives in the repository root at [`CHANGELOG.md`](https://github.com/SkyeAv/Tablassert/blob/main/CHANGELOG.md).
4
+
5
+ ## Current Release Notes
6
+
7
+ ### 7.3.4 - 2026-04-28
8
+
9
+ - `downloader.from_url()` now handles URLs that respond with an immediate download instead of a navigable page — the Playwright session uses a download-aware browser context and tolerates the expected `net::ERR_ABORTED` navigation error.
10
+ - Table configuration docs now describe `miscellaneous notes` as a freetext catch-all annotation for source context that doesn't map cleanly to a structured field.
11
+ - Regex transform documentation now spells out the Polars `str.replace_all()` constraints — no capturing groups or lookarounds — so authors know to chain simple substitutions or fall back to `miscellaneous notes`.
12
+
13
+ For older releases and the full project history, open the root `CHANGELOG.md` in the repository.
@@ -66,7 +66,7 @@ template:
66
66
  # Provenance: Publication and curation info
67
67
  provenance:
68
68
  repo: PMC
69
- publication: PMC11708054
69
+ publication: 11708054
70
70
  contributors:
71
71
  - kind: curation
72
72
  name: Skye Lane Goetz
@@ -103,12 +103,16 @@ template:
103
103
  method: value
104
104
  encoding: Spearman correlation
105
105
 
106
- # Descriptive note
106
+ # Freetext catch-all — anything that doesn't map cleanly to a structured
107
+ # annotation (study design caveats, non-standard units, qualitative
108
+ # observations) belongs here rather than being dropped.
107
109
  - annotation: miscellaneous notes
108
110
  method: value
109
111
  encoding: Correlation analysis between microbial composition and 13C-tamoxifen abundance after FDR correction
110
112
  ```
111
113
 
114
+ > **`miscellaneous notes` is a freetext escape hatch.** Use it whenever the source carries context you can't otherwise cleanly encode — assay variants, post-hoc qualifiers, "values are log-transformed", etc. It accepts `method: value` for a constant note across the whole table or `method: column` to pull per-row notes from the source.
115
+
112
116
  ## Key Techniques
113
117
 
114
118
  ### Excel Column References
@@ -143,6 +147,8 @@ The subject field uses three regex transformations in sequence:
143
147
  ```
144
148
  `"Lactobacillus sp"` → `"Lactobacillus sp. "`
145
149
 
150
+ > **Regex constraint:** Each `pattern` is handed to Polars `str.replace_all()` (Rust `regex` crate). **Capturing groups (`(...)` / `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not allowed** and will fail validation. Express transformations as a sequence of simple anchored / character-class substitutions instead — the pipeline above is a deliberate three-step chain because no single capturing-group pattern is permitted. If the transformation can't be expressed without those features, capture the leftover context in a `miscellaneous notes` annotation rather than fighting the regex engine.
151
+
146
152
  ### Taxonomic Filtering
147
153
 
148
154
  Prevent incorrect entity resolution:
@@ -297,7 +303,7 @@ template:
297
303
 
298
304
  provenance:
299
305
  repo: PMC
300
- publication: PMC12345678
306
+ publication: 12345678
301
307
  contributors:
302
308
  - kind: curation
303
309
  name: Skye Lane Goetz
@@ -358,7 +364,7 @@ template:
358
364
 
359
365
  provenance:
360
366
  repo: PMC
361
- publication: PMC87654321
367
+ publication: 87654321
362
368
  contributors:
363
369
  - kind: curation
364
370
  name: Skye Lane Goetz
@@ -60,7 +60,7 @@ See [Table Configuration](table.md) for details.
60
60
 
61
61
  **`datassert: path`**
62
62
 
63
- Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens 12 shard files from `datassert/data/{0..11}.duckdb`. This database contains:
63
+ Path to the [datassert](../datassert.md) directory for entity resolution. Tablassert opens 10 shard files from `datassert/data/{0..9}.duckdb`. This database contains:
64
64
  - Synonym mappings (text → CURIE)
65
65
  - Biolink categories
66
66
  - Taxonomic information
@@ -100,7 +100,7 @@ template:
100
100
  ```yaml
101
101
  template:
102
102
  source: {kind: excel, local: data.xlsx}
103
- provenance: {publication: PMC123}
103
+ provenance: {repo: PMC, publication: 123}
104
104
 
105
105
  sections:
106
106
  - statement: {predicate: treats}
@@ -111,7 +111,7 @@ sections:
111
111
  ```yaml
112
112
  template:
113
113
  source: {kind: text, local: data.csv}
114
- provenance: {publication: PMID456}
114
+ provenance: {repo: PMID, publication: 456}
115
115
  statement:
116
116
  subject: {encoding: gene_symbol}
117
117
 
@@ -330,6 +330,8 @@ subject:
330
330
 
331
331
  Executed in order.
332
332
 
333
+ > **Regex dialect:** Patterns are passed directly to Polars `str.replace_all()`, which uses the Rust [`regex`](https://docs.rs/regex/) crate. Only features supported by that engine work — in particular, **capturing groups (`(...)`, `\1`) and lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`) are not supported** and will raise an error at parse time. Stick to character classes, anchors (`^`, `$`), quantifiers, alternation (`a|b`), and non-capturing groups (`(?:...)`) if grouping is needed. If a transformation is too complex to express, prefer chaining several simple substitutions or capturing the residual context in a `miscellaneous notes` annotation instead.
334
+
333
335
  **`remove: list[string]`** - Filter out specific strings
334
336
 
335
337
  ```yaml
@@ -339,6 +341,8 @@ subject:
339
341
  - "^NA " # Remove rows starting with "NA "
340
342
  ```
341
343
 
344
The same regex constraints as the `regex` field apply — Polars-compatible patterns only, no capturing groups or lookarounds.
345
+
342
346
  **`prefix` / `suffix`** - Add text
343
347
 
344
348
  ```yaml
@@ -416,7 +420,7 @@ Required metadata about data source.
416
420
  | Field | Type | Required | Description |
417
421
  |-------|------|----------|-------------|
418
422
  | `repo` | String | Yes | Repository: `"PMC"`, `"PMID"` |
419
- | `publication` | String | Yes | Identifier (e.g., `"PMC11708054"`, `"PMID123"`) |
423
+ | `publication` | String | Yes | Repository-local identifier appended to `repo:` (e.g., `"11708054"`, `"123"`) |
420
424
  | `contributors` | List[Contributor] | Yes | Curation information |
421
425
 
422
426
  **Contributor fields:**
@@ -433,7 +437,7 @@ Required metadata about data source.
433
437
  ```yaml
434
438
  provenance:
435
439
  repo: PMC
436
- publication: PMC11708054
440
+ publication: 11708054
437
441
  contributors:
438
442
  - kind: curation
439
443
  name: Skye Lane Goetz
@@ -467,8 +471,16 @@ annotations:
467
471
  - annotation: multiple testing correction method
468
472
  method: value
469
473
  encoding: "Benjamini Hochberg"
474
+
475
+ # Freetext catch-all for context that doesn't fit a structured field —
476
+ # study caveats, units, post-hoc notes, anything you'd otherwise lose.
477
+ - annotation: miscellaneous notes
478
+ method: value
479
+ encoding: "Values are log2 fold-change relative to vehicle control; n=3 biological replicates per arm"
470
480
  ```
471
481
 
482
+ > **Tip:** When source data carries information that can't be cleanly mapped to a structured annotation (assay-specific caveats, non-standard units, qualitative observations), add a `miscellaneous notes` annotation rather than forcing it into another field or dropping it. It accepts both `method: value` (one note for the whole table) and `method: column` (per-row notes from the source).
483
+
472
484
  ## Complete Example
473
485
 
474
486
  Minimal table configuration:
@@ -498,7 +510,7 @@ template:
498
510
 
499
511
  provenance:
500
512
  repo: PMID
501
- publication: PMID12345678
513
+ publication: 12345678
502
514
  contributors:
503
515
  - kind: curation
504
516
  name: Example User
@@ -33,7 +33,7 @@ The build command automatically downloads BABEL exports from RENCI (`https://sta
33
33
  1. **Download** — BABEL class and synonym files are downloaded from RENCI and split into LZ4-compressed NDJSON chunks under `./datassert/downloads/`.
34
34
  2. **Lookup** — Class files (`*.ndjson.lz4`) are read to build an in-memory equivalent-identifier lookup.
35
35
  3. **Parquet Staging** — Synonym files are processed with the lookup, quality-controlled, and written as sharded Parquet files to `./datassert/parquets/`.
36
- 4. **DuckDB Generation** — Parquet files are loaded into 12 sharded DuckDB databases under `./datassert/data/`.
36
+ 4. **DuckDB Generation** — Parquet files are loaded into 10 sharded DuckDB databases under `./datassert/data/`.
37
37
 
38
38
  ### Examples
39
39
 
@@ -57,11 +57,11 @@ datassert build --use-existing-parquets
57
57
 
58
58
  ## Output Artifacts
59
59
 
60
- - 12 sharded DuckDB databases are written to `./datassert/data/{0..11}.duckdb`.
60
+ - 10 sharded DuckDB databases are written to `./datassert/data/{0..9}.duckdb`.
61
61
  - Each shard contains `SOURCES`, `CATEGORIES`, `CURIES`, and `SYNONYMS` tables, deduplicated, sorted, and indexed for query performance.
62
- - Staging Parquet files are written to `./datassert/parquets/{0..11}/`.
62
+ - Staging Parquet files are written to `./datassert/parquets/{0..9}/`.
63
63
 
64
- Terms are routed to shards deterministically via `xxhash64(term) % 12`, so a given string always hits the same shard.
64
+ Terms are routed to shards deterministically via `xxhash64(term) % 10`, so a given string always hits the same shard.
65
65
 
66
66
  ### Schema
67
67
 
@@ -76,14 +76,14 @@ Each shard contains four tables:
76
76
 
77
77
  ## Usage in Graph Config
78
78
 
79
- The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 12 shards at startup and passes the connections to `resolve()`.
79
+ The `datassert:` field in a GC2 graph configuration points to the directory containing the shards. Tablassert opens all 10 shards at startup and passes the connections to `resolve()`.
80
80
 
81
81
  ```yaml
82
82
  # graph-config.yaml (GC2)
83
83
  syntax: GC2
84
84
  name: my-graph
85
85
  version: "1.0"
86
- datassert: /path/to/datassert/ # directory containing data/0..11.duckdb
86
+ datassert: /path/to/datassert/ # directory containing data/0..9.duckdb
87
87
  tables:
88
88
  - ./TABLE/my-table.yaml
89
89
  ```
@@ -99,7 +99,7 @@ from tablassert.fullmap import resolve
99
99
  datassert_dir = "/path/to/datassert"
100
100
  conns = [
101
101
  duckdb.connect(f"{datassert_dir}/data/{i}.duckdb", read_only=True)
102
- for i in range(12)
102
+ for i in range(10)
103
103
  ]
104
104
  ```
105
105
 
@@ -81,8 +81,8 @@ docker run --rm \
81
81
 
82
82
  - **Datassert path** — The graph configuration YAML specifies the `datassert` path for the entity-resolution database. Ensure it is accessible inside the container.
83
83
  - **Multiprocessing** — `src/tablassert/cli.py:63` uses `multiprocessing.Pool` for parallel table loading and section extraction.
84
- - **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 12 Datassert DuckDB shards concurrently.
85
- - **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 12 DuckDB shards (`SHARDS = 12`) using xxhash64.
84
+ - **DuckDB connections** — An `ExitStack` at `src/tablassert/cli.py:81` opens read-only connections to all 10 Datassert DuckDB shards concurrently.
85
+ - **Entity resolution** — The `fullmap` module (`src/tablassert/fullmap.py`) shards terms across 10 DuckDB shards (`SHARDS = 10`) using xxhash64.
86
86
  - **Text normalization** — `src/tablassert/nlp.py` provides `level_one` (strip + lowercase) and `level_two` (regex-based cleanup).
87
87
 
88
88
  ## CI/CD Integration
@@ -33,7 +33,7 @@ template:
33
33
  - Disease
34
34
  provenance:
35
35
  repo: PMID
36
- publication: PMID12345678
36
+ publication: 12345678
37
37
  contributors:
38
38
  - kind: curation
39
39
  name: Your Name
@@ -85,7 +85,7 @@ template:
85
85
  taxon: 9606
86
86
  provenance:
87
87
  repo: PMID
88
- publication: PMID98765432
88
+ publication: 98765432
89
89
  contributors:
90
90
  - kind: curation
91
91
  name: Your Name
@@ -146,7 +146,7 @@ template:
146
146
  encoding: CHEBI:41774
147
147
  provenance:
148
148
  repo: PMC
149
- publication: PMC11708054
149
+ publication: 11708054
150
150
  contributors:
151
151
  - kind: curation
152
152
  name: Your Name
@@ -161,11 +161,15 @@ template:
161
161
  - annotation: assertion method
162
162
  method: value
163
163
  encoding: "Spearman correlation"
164
+ # Freetext catch-all for context that doesn't fit a structured field.
165
+ - annotation: miscellaneous notes
166
+ method: value
167
+ encoding: "FDR-corrected; samples pooled across two cohorts"
164
168
  ```
165
169
 
166
170
  **Key techniques:**
167
171
 
168
- - **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`)
172
+ - **Regex pipeline** cleans raw taxonomic strings (e.g., `d__Bacteria;p__Firmicutes;g__Lactobacillus` → `Lactobacillus`). Patterns must be Polars `str.replace_all()`-compatible — no capturing groups (`(...)` / `\1`) and no lookarounds (`(?=...)`, `(?<=...)`, `(?!...)`, `(?<!...)`). Chain several simple substitutions instead.
169
173
  - **Avoid list** (`avoid: [Gene]`) prevents organism names from resolving to gene entities
170
174
  - **Fixed-value object** (`method: value`) assigns the same metabolite CURIE to all rows
171
175
  - **Excel source** with sheet name and row slicing
@@ -199,7 +203,7 @@ template:
199
203
  encoding: PLACEHOLDER
200
204
  provenance:
201
205
  repo: PMID
202
- publication: PMID11223344
206
+ publication: 11223344
203
207
  contributors:
204
208
  - kind: curation
205
209
  name: Your Name
@@ -277,7 +281,7 @@ template:
277
281
  - Disease
278
282
  provenance:
279
283
  repo: PMID
280
- publication: PMID55667788
284
+ publication: 55667788
281
285
  contributors:
282
286
  - kind: curation
283
287
  name: Your Name
@@ -330,7 +334,7 @@ template:
330
334
  - ChemicalEntity
331
335
  provenance:
332
336
  repo: PMID
333
- publication: PMID99887766
337
+ publication: 99887766
334
338
  contributors:
335
339
  - kind: curation
336
340
  name: Your Name
@@ -68,7 +68,7 @@ template:
68
68
  - Disease
69
69
  provenance:
70
70
  repo: PMID
71
- publication: PMID12345678
71
+ publication: 12345678
72
72
  contributors:
73
73
  - kind: curation
74
74
  name: Tutorial Example
@@ -17,4 +17,4 @@ nav:
17
17
  - Batch Resolution: api/lib.md
18
18
  - Quality Control: api/qc.md
19
19
  - Utilities: api/utils.md
20
- - Changelog: ../CHANGELOG.md
20
+ - Changelog: changelog.md
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.3.3"
3
+ version = "7.3.4"
4
4
  description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
5
5
  authors = [
6
6
  { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -29,13 +29,24 @@ def from_url(website: str, p: Path, timeout: int = 60_000, retries: int = 3) ->
29
29
  try:
30
30
  with sync_playwright() as pw:
31
31
  browser = pw.chromium.launch(headless=True)
32
- page = browser.new_page()
33
- page.goto(website, wait_until="networkidle", timeout=timeout)
32
+ context = browser.new_context(accept_downloads=True)
33
+
34
+ page = context.new_page()
34
35
  with page.expect_download(timeout=timeout) as info:
35
- download = info.value
36
- download.save_as(p)
36
+ try:
37
+ page.goto(website, wait_until="load", timeout=timeout)
38
+ except Exception as e:
39
+ if "net::ERR_ABORTED" not in str(e):
40
+ raise
41
+
42
+ download = info.value
43
+ download.save_as(p)
44
+
45
+ context.close()
37
46
  browser.close()
47
+
38
48
  return p
49
+
39
50
  except Exception as e:
40
51
  last = e
41
52
  if attempt < retries - 1:
@@ -2211,7 +2211,7 @@ wheels = [
2211
2211
 
2212
2212
  [[package]]
2213
2213
  name = "tablassert"
2214
- version = "7.3.3"
2214
+ version = "7.3.4"
2215
2215
  source = { editable = "." }
2216
2216
  dependencies = [
2217
2217
  { name = "duckdb" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes