tablassert 7.3.0__tar.gz → 7.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {tablassert-7.3.0 → tablassert-7.3.1}/CHANGELOG.md +9 -0
  2. {tablassert-7.3.0 → tablassert-7.3.1}/Dockerfile +2 -0
  3. {tablassert-7.3.0 → tablassert-7.3.1}/PKG-INFO +15 -1
  4. {tablassert-7.3.0 → tablassert-7.3.1}/README.md +14 -0
  5. {tablassert-7.3.0 → tablassert-7.3.1}/docs/api/lib.md +22 -19
  6. {tablassert-7.3.0 → tablassert-7.3.1}/pyproject.toml +1 -1
  7. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/lib.py +3 -2
  8. {tablassert-7.3.0 → tablassert-7.3.1}/.github/workflows/autotag.yml +0 -0
  9. {tablassert-7.3.0 → tablassert-7.3.1}/.github/workflows/docker.yml +0 -0
  10. {tablassert-7.3.0 → tablassert-7.3.1}/.github/workflows/docs.yml +0 -0
  11. {tablassert-7.3.0 → tablassert-7.3.1}/.github/workflows/pipy.yml +0 -0
  12. {tablassert-7.3.0 → tablassert-7.3.1}/.gitignore +0 -0
  13. {tablassert-7.3.0 → tablassert-7.3.1}/.pre-commit-config.yaml +0 -0
  14. {tablassert-7.3.0 → tablassert-7.3.1}/AGENTS.md +0 -0
  15. {tablassert-7.3.0 → tablassert-7.3.1}/CITATION.cff +0 -0
  16. {tablassert-7.3.0 → tablassert-7.3.1}/CONTRIBUTING.md +0 -0
  17. {tablassert-7.3.0 → tablassert-7.3.1}/LICENSE +0 -0
  18. {tablassert-7.3.0 → tablassert-7.3.1}/docs/api/fullmap.md +0 -0
  19. {tablassert-7.3.0 → tablassert-7.3.1}/docs/api/qc.md +0 -0
  20. {tablassert-7.3.0 → tablassert-7.3.1}/docs/api/utils.md +0 -0
  21. {tablassert-7.3.0 → tablassert-7.3.1}/docs/cli.md +0 -0
  22. {tablassert-7.3.0 → tablassert-7.3.1}/docs/configuration/advanced-example.md +0 -0
  23. {tablassert-7.3.0 → tablassert-7.3.1}/docs/configuration/graph.md +0 -0
  24. {tablassert-7.3.0 → tablassert-7.3.1}/docs/configuration/table.md +0 -0
  25. {tablassert-7.3.0 → tablassert-7.3.1}/docs/datassert.md +0 -0
  26. {tablassert-7.3.0 → tablassert-7.3.1}/docs/docker.md +0 -0
  27. {tablassert-7.3.0 → tablassert-7.3.1}/docs/examples/tutorial-data.csv +0 -0
  28. {tablassert-7.3.0 → tablassert-7.3.1}/docs/examples/tutorial-graph.yaml +0 -0
  29. {tablassert-7.3.0 → tablassert-7.3.1}/docs/examples/tutorial-table.yaml +0 -0
  30. {tablassert-7.3.0 → tablassert-7.3.1}/docs/examples.md +0 -0
  31. {tablassert-7.3.0 → tablassert-7.3.1}/docs/index.md +0 -0
  32. {tablassert-7.3.0 → tablassert-7.3.1}/docs/installation.md +0 -0
  33. {tablassert-7.3.0 → tablassert-7.3.1}/docs/tutorial.md +0 -0
  34. {tablassert-7.3.0 → tablassert-7.3.1}/llms.txt +0 -0
  35. {tablassert-7.3.0 → tablassert-7.3.1}/mkdocs.yml +0 -0
  36. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/__init__.py +0 -0
  37. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/cli.py +0 -0
  38. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/downloader.py +0 -0
  39. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/enums.py +0 -0
  40. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/fullmap.py +0 -0
  41. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/ingests.py +0 -0
  42. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/log.py +0 -0
  43. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/models.py +0 -0
  44. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/nlp.py +0 -0
  45. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/qc.py +0 -0
  46. {tablassert-7.3.0 → tablassert-7.3.1}/src/tablassert/utils.py +0 -0
  47. {tablassert-7.3.0 → tablassert-7.3.1}/tests/__init__.py +0 -0
  48. {tablassert-7.3.0 → tablassert-7.3.1}/tests/conftest.py +0 -0
  49. {tablassert-7.3.0 → tablassert-7.3.1}/tests/fixtures/invalid_section_missing_source.yaml +0 -0
  50. {tablassert-7.3.0 → tablassert-7.3.1}/tests/fixtures/minimal_section.yaml +0 -0
  51. {tablassert-7.3.0 → tablassert-7.3.1}/tests/fixtures/minimal_section_with_sections.yaml +0 -0
  52. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_enums.py +0 -0
  53. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_fullmap.py +0 -0
  54. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_ingests.py +0 -0
  55. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_lib.py +0 -0
  56. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_models.py +0 -0
  57. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_nlp.py +0 -0
  58. {tablassert-7.3.0 → tablassert-7.3.1}/tests/test_utils.py +0 -0
  59. {tablassert-7.3.0 → tablassert-7.3.1}/uv.lock +0 -0
@@ -2,6 +2,15 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.3.1 - 2026-04-03
6
+
7
+ ### Changes
8
+ - Changed `resolve_many()` return type from `dict[str, list[str]]` to `list[dict[str, Any]]` — each resolved entity is now a row dictionary, produced via `to_dicts()`.
9
+ - `resolve_many()` now preserves the original input text in an `original {col}` key on each result row.
10
+
11
+ ### Documentation
12
+ - Updated `resolve_many()` API reference to match the current function signature, return type, and output format.
13
+
5
14
  ## 7.3.0 - 2026-04-03
6
15
 
7
16
  ### New Features
@@ -2,5 +2,7 @@ FROM python:3.14-slim
2
2
 
3
3
  RUN pip install --no-cache-dir "tablassert"
4
4
 
5
+ VOLUME ["/data", "/datassert"]
6
+
5
7
  ENTRYPOINT ["tablassert"]
6
8
  CMD ["--help"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tablassert
3
- Version: 7.3.0
3
+ Version: 7.3.1
4
4
  Summary: Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in.
5
5
  Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
6
6
  Project-URL: Source, https://github.com/SkyeAv/Tablassert
@@ -93,6 +93,20 @@ docker run --rm \
93
93
 
94
94
  </details>
95
95
 
96
+ ## Quick Demo
97
+
98
+ ```bash
99
+ # Build a knowledge graph from a YAML configuration
100
+ $ tablassert build-knowledge-graph graph-config.yaml
101
+ ⠋ Loading table configurations...
102
+ ⠋ Resolving entities across 16 DuckDB shards...
103
+ ⠋ Compiling subgraphs...
104
+ ⠋ Deduplicating nodes and edges...
105
+ ✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
106
+ ```
107
+
108
+ Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
109
+
96
110
  ## Key Features
97
111
 
98
112
  - **Declarative Configuration** — YAML-based, no code required
@@ -41,6 +41,20 @@ docker run --rm \
41
41
 
42
42
  </details>
43
43
 
44
+ ## Quick Demo
45
+
46
+ ```bash
47
+ # Build a knowledge graph from a YAML configuration
48
+ $ tablassert build-knowledge-graph graph-config.yaml
49
+ ⠋ Loading table configurations...
50
+ ⠋ Resolving entities across 16 DuckDB shards...
51
+ ⠋ Compiling subgraphs...
52
+ ⠋ Deduplicating nodes and edges...
53
+ ✓ Done — wrote nodes.ndjson and edges.ndjson to .storassert/
54
+ ```
55
+
56
+ Define your entities and relationships in YAML, point tablassert at your data, and get NCATS Translator-compliant KGX NDJSON out the other side — no code required.
57
+
44
58
  ## Key Features
45
59
 
46
60
  - **Declarative Configuration** — YAML-based, no code required
@@ -19,7 +19,7 @@ def resolve_many(
19
19
  prioritize: Optional[list[Categories]] = None,
20
20
  avoid: Optional[list[Categories]] = None,
21
21
  column_context: bool = True,
22
- ) -> dict[str, list[str]]
22
+ ) -> list[dict[str, Any]]
23
23
  ```
24
24
 
25
25
  ### Parameters
@@ -73,12 +73,13 @@ This is useful when resolving a column of related entities (e.g., all genes) —
73
73
 
74
74
  ### Return Value
75
75
 
76
- Returns a `dict[str, list[str]]` where each key is a column name and each value is a list of resolved values. The dictionary is produced by calling `polars.DataFrame.to_dict(as_series=False)` on the collected resolution output.
76
+ Returns a `list[dict[str, Any]]` containing one dictionary per resolved entity. The list is produced by calling `polars.DataFrame.to_dicts()` on the collected resolution output.
77
77
 
78
- The returned dictionary contains the following keys (where `{col}` is the value of the `col` parameter):
78
+ Each dictionary contains the following keys (where `{col}` is the value of the `col` parameter):
79
79
 
80
80
  | Key | Description | Example Value |
81
81
  |-----|-------------|---------------|
82
+ | `original {col}` | Original input text before normalization | `"TP53"` |
82
83
  | `{col}` | CURIE identifier | `"HGNC:11998"` |
83
84
  | `{col} name` | Preferred entity name | `"TP53"` |
84
85
  | `{col} category` | Biolink category (prefixed) | `"biolink:Gene"` |
@@ -87,7 +88,7 @@ The returned dictionary contains the following keys (where `{col}` is the value
87
88
  | `{col} source version` | Database version | `"2025-01"` |
88
89
  | `{col} nlp level` | NLP processing level used for match | `0` or `1` |
89
90
 
90
- **Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned lists may therefore be shorter than the input iterable.
91
+ **Important:** Only entities that successfully resolve to a CURIE are included in the output. Unresolved entities are filtered out by `resolve()`. The returned list may therefore be shorter than the input iterable.
91
92
 
92
93
  ### Pipeline Internals
93
94
 
@@ -101,7 +102,7 @@ The returned dictionary contains the following keys (where `{col}` is the value
101
102
 
102
103
  4. **Entity resolution** — Delegates to `fullmap.resolve()` which queries the sharded DuckDB database, ranks matches by category priority, preferred-name exactness, NLP level, and category frequency, then deduplicates to one CURIE per input string.
103
104
 
104
- 5. **Collection and conversion** — Collects the lazy result into an eager `pl.DataFrame` and converts to a Python dictionary via `to_dict(as_series=False)`.
105
+ 5. **Collection and conversion** — Collects the lazy result into an eager `pl.DataFrame` and converts to a list of row dictionaries via `to_dicts()`.
105
106
 
106
107
  ### Example Usage
107
108
 
@@ -109,12 +110,13 @@ The returned dictionary contains the following keys (where `{col}` is the value
109
110
 
110
111
  ```python
111
112
  from pathlib import Path
113
+ from typing import Any
112
114
  from tablassert.lib import resolve_many
113
115
  from tablassert.enums import Categories
114
116
 
115
117
  datassert: Path = Path("/path/to/datassert")
116
118
 
117
- result: dict[str, list[str]] = resolve_many(
119
+ result: list[dict[str, Any]] = resolve_many(
118
120
  col="gene",
119
121
  entities=["TP53", "BRCA1", "EGFR", "KRAS"],
120
122
  datassert=datassert,
@@ -122,41 +124,41 @@ result: dict[str, list[str]] = resolve_many(
122
124
  prioritize=[Categories.Gene],
123
125
  )
124
126
 
125
- # result["gene"] → ["HGNC:11998", "HGNC:1100", ...]
126
- # result["gene name"] → ["TP53", "BRCA1", ...]
127
- # result["gene category"] → ["biolink:Gene", "biolink:Gene", ...]
127
+ # result[0] → {"original gene": "TP53", "gene": "HGNC:11998", "gene name": "TP53", ...}
128
+ # result[1] → {"original gene": "BRCA1", "gene": "HGNC:1100", "gene name": "BRCA1", ...}
128
129
  ```
129
130
 
130
131
  #### Disease Resolution With Category Avoidance
131
132
 
132
133
  ```python
133
134
  from pathlib import Path
135
+ from typing import Any
134
136
  from tablassert.lib import resolve_many
135
137
  from tablassert.enums import Categories
136
138
 
137
139
  datassert: Path = Path("/path/to/datassert")
138
140
 
139
- result: dict[str, list[str]] = resolve_many(
141
+ result: list[dict[str, Any]] = resolve_many(
140
142
  col="disease",
141
143
  entities=["diabetes mellitus", "breast cancer", "alzheimer disease"],
142
144
  datassert=datassert,
143
145
  avoid=[Categories.Gene, Categories.Protein],
144
146
  )
145
147
 
146
- # result["disease"] → ["MONDO:0005015", ...]
147
- # result["disease name"] → ["diabetes mellitus", ...]
148
- # result["disease category"] → ["biolink:Disease", ...]
148
+ # result[0] → {"original disease": "diabetes mellitus", "disease": "MONDO:0005015", ...}
149
+ # result[1] → {"original disease": "breast cancer", "disease": "MONDO:0007254", "disease name": "breast cancer", ...}
149
150
  ```
150
151
 
151
152
  #### Chemical Resolution Without Column Context
152
153
 
153
154
  ```python
154
155
  from pathlib import Path
156
+ from typing import Any
155
157
  from tablassert.lib import resolve_many
156
158
 
157
159
  datassert: Path = Path("/path/to/datassert")
158
160
 
159
- result: dict[str, list[str]] = resolve_many(
161
+ result: list[dict[str, Any]] = resolve_many(
160
162
  col="chemical",
161
163
  entities=["aspirin", "metformin", "ibuprofen"],
162
164
  datassert=datassert,
@@ -169,11 +171,12 @@ result: dict[str, list[str]] = resolve_many(
169
171
  ```python
170
172
  import polars as pl
171
173
  from pathlib import Path
174
+ from typing import Any
172
175
  from tablassert.lib import resolve_many
173
176
 
174
177
  datassert: Path = Path("/path/to/datassert")
175
178
 
176
- result: dict[str, list[str]] = resolve_many(
179
+ result: list[dict[str, Any]] = resolve_many(
177
180
  col="gene",
178
181
  entities=["TP53", "BRCA1"],
179
182
  datassert=datassert,
@@ -183,9 +186,9 @@ result: dict[str, list[str]] = resolve_many(
183
186
  # Convert back to a Polars DataFrame
184
187
  df: pl.DataFrame = pl.DataFrame(result)
185
188
 
186
- # Or iterate over resolved pairs
187
- for curie, name in zip(result["gene"], result["gene name"]):
188
- print(f"{name} → {curie}")
189
+ # Or iterate over resolved rows
190
+ for row in result:
191
+ print(f"{row['gene name']} → {row['gene']}")
189
192
  ```
190
193
 
191
194
  ### Comparison With resolve()
@@ -196,7 +199,7 @@ for curie, name in zip(result["gene"], result["gene name"]):
196
199
  | **Input** | Plain iterable of strings | Pre-normalized `pl.LazyFrame` |
197
200
  | **NLP** | Applied automatically | Must be applied upstream |
198
201
  | **Connections** | Managed internally via `ExitStack` | Must be opened externally |
199
- | **Output** | `dict[str, list[str]]` | `pl.LazyFrame` |
202
+ | **Output** | `list[dict[str, Any]]` | `pl.LazyFrame` |
200
203
  | **Logging** | Uses default (`log=True`) | Configurable |
201
204
  | **Context params** | Not exposed (`section_hash`, `config_file`, `tag`) | Fully configurable |
202
205
  | **Use case** | Standalone batch lookups, scripting, notebooks | Internal pipeline integration |
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "tablassert"
3
- version = "7.3.0"
3
+ version = "7.3.1"
4
4
  description = "Extract knowledge assertions from tabular data into NCATS Translator-compliant KGX NDJSON — declaratively, with entity resolution and quality control built in."
5
5
  authors = [
6
6
  { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
@@ -487,10 +487,11 @@ def resolve_many(
487
487
  prioritize: Optional[list[Categories]] = None,
488
488
  avoid: Optional[list[Categories]] = None,
489
489
  column_context: bool = True,
490
- ) -> dict[str, list[str]]:
490
+ ) -> list[dict[str, Any]]:
491
491
  series: pl.Series = pl.Series(col, entities)
492
492
  lf: pl.LazyFrame = series.to_frame().lazy()
493
493
 
494
+ lf = column(lf, add("original ", col), col)
494
495
  lf = level_one(lf, col)
495
496
  lf = level_two(lf, col)
496
497
 
@@ -503,4 +504,4 @@ def resolve_many(
503
504
  lf = resolve(lf, col, conns, taxon=taxon, prioritize=prioritize, avoid=avoid, column_context=column_context)
504
505
 
505
506
  df: pl.DataFrame = lf.collect()
506
- return df.to_dict(as_series=False)
507
+ return df.to_dicts()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes