tablassert 7.0.0__tar.gz → 7.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablassert-7.0.2/.pre-commit-config.yaml +15 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/CHANGELOG.md +20 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/PKG-INFO +28 -4
- {tablassert-7.0.0 → tablassert-7.0.2}/README.md +17 -2
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/api/fullmap.md +28 -15
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/api/qc.md +7 -9
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/cli.md +1 -1
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/index.md +10 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/installation.md +20 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/pyproject.toml +30 -3
- tablassert-7.0.2/src/tablassert/downloader.py +37 -0
- tablassert-7.0.2/src/tablassert/enums.py +521 -0
- tablassert-7.0.2/src/tablassert/fullmap.py +177 -0
- tablassert-7.0.2/src/tablassert/ingests.py +43 -0
- tablassert-7.0.2/src/tablassert/lib.py +586 -0
- tablassert-7.0.2/src/tablassert/log.py +16 -0
- tablassert-7.0.2/src/tablassert/models.py +132 -0
- tablassert-7.0.2/src/tablassert/qc.py +129 -0
- tablassert-7.0.2/src/tablassert/utils.py +43 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/uv.lock +29 -1
- tablassert-7.0.0/.planning/PROJECT.md +0 -47
- tablassert-7.0.0/.planning/REQUIREMENTS.md +0 -73
- tablassert-7.0.0/.planning/ROADMAP.md +0 -66
- tablassert-7.0.0/.planning/STATE.md +0 -79
- tablassert-7.0.0/.planning/config.json +0 -15
- tablassert-7.0.0/.planning/quick/1-please-add-a-github-action-that-runs-uv-/1-PLAN.md +0 -90
- tablassert-7.0.0/.planning/quick/1-please-add-a-github-action-that-runs-uv-/1-SUMMARY.md +0 -80
- tablassert-7.0.0/.planning/research/ARCHITECTURE.md +0 -220
- tablassert-7.0.0/.planning/research/FEATURES.md +0 -134
- tablassert-7.0.0/.planning/research/PITFALLS.md +0 -219
- tablassert-7.0.0/.planning/research/STACK.md +0 -140
- tablassert-7.0.0/.planning/research/SUMMARY.md +0 -146
- tablassert-7.0.0/.pre-commit-config.yaml +0 -15
- tablassert-7.0.0/src/tablassert/downloader.py +0 -35
- tablassert-7.0.0/src/tablassert/enums.py +0 -521
- tablassert-7.0.0/src/tablassert/fullmap.py +0 -167
- tablassert-7.0.0/src/tablassert/ingests.py +0 -43
- tablassert-7.0.0/src/tablassert/lib.py +0 -602
- tablassert-7.0.0/src/tablassert/log.py +0 -15
- tablassert-7.0.0/src/tablassert/models.py +0 -131
- tablassert-7.0.0/src/tablassert/qc.py +0 -124
- tablassert-7.0.0/src/tablassert/utils.py +0 -43
- {tablassert-7.0.0 → tablassert-7.0.2}/.github/workflows/docs.yml +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/.github/workflows/pipy.yml +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/.gitignore +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/.python-version +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/.vscode/settings.json +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/LICENSE +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/api/utils.md +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/configuration/advanced-example.md +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/configuration/graph.md +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/configuration/table.md +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/examples/tutorial-graph.yaml +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/docs/tutorial.md +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/mkdocs.yml +0 -0
- {tablassert-7.0.0 → tablassert-7.0.2}/src/tablassert/__init__.py +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.9.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: local
|
|
9
|
+
hooks:
|
|
10
|
+
- id: pyright
|
|
11
|
+
name: pyright
|
|
12
|
+
entry: uv run pyright
|
|
13
|
+
language: system
|
|
14
|
+
types: [python]
|
|
15
|
+
pass_filenames: false
|
|
@@ -2,6 +2,26 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
+
## 7.0.2 - 2026-03-23
|
|
6
|
+
|
|
7
|
+
### Changes
|
|
8
|
+
- Updated package metadata for the 7.0.2 release.
|
|
9
|
+
- Added optional `log` and `column_context` controls to `fullmap.version4()` for more configurable entity-resolution behavior.
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
- Reworked entity-resolution querying to register terms directly in DuckDB instead of writing temporary parquet files, removing tempfile lifecycle issues in `fullmap` query execution.
|
|
13
|
+
- Isolated unmatched-entity logging into a dedicated helper and gated it behind an explicit logging flag.
|
|
14
|
+
|
|
15
|
+
### Documentation
|
|
16
|
+
- Updated API reference docs to match the current `version4()` function signature and behavior.
|
|
17
|
+
- Corrected QC documentation to reflect the implemented fuzzy/BERT validation pipeline.
|
|
18
|
+
- Fixed documentation path typos for cache/store artifact directories.
|
|
19
|
+
|
|
20
|
+
## 7.0.1 - 2026-03-17
|
|
21
|
+
|
|
22
|
+
### Documentation
|
|
23
|
+
- Updated installation docs to reflect the `pyproject.toml` extras and added `tablassert[rtcompat]` guidance for systems whose CPUs lack the instructions required by default Polars builds.
|
|
24
|
+
|
|
5
25
|
## 7.0.0 - 2026-03-17
|
|
6
26
|
|
|
7
27
|
### New Features
|
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tablassert
|
|
3
|
-
Version: 7.0.0
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 7.0.2
|
|
4
|
+
Summary: Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
|
|
5
|
+
Project-URL: Homepage, https://github.com/SkyeAv/Tablassert
|
|
6
|
+
Project-URL: Source, https://github.com/SkyeAv/Tablassert
|
|
7
|
+
Project-URL: Documentation, https://skyeav.github.io/Tablassert/
|
|
8
|
+
Author-email: Skye Lane Goetz <sgoetz@isbscience.org>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
5
10
|
License-File: LICENSE
|
|
11
|
+
Keywords: declarative pipeline,knowledge graph,natural language processing,ncats translator,ner,tablassert,table mining,yaml configuration
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
6
13
|
Requires-Python: >=3.13
|
|
7
14
|
Requires-Dist: diskcache>=5.6.3
|
|
8
15
|
Requires-Dist: duckdb>=1.5.0
|
|
@@ -24,6 +31,8 @@ Requires-Dist: sentence-transformers>=5.3.0
|
|
|
24
31
|
Requires-Dist: sqlite-utils>=3.39
|
|
25
32
|
Requires-Dist: typer>=0.24.1
|
|
26
33
|
Requires-Dist: xxhash>=3.6.0
|
|
34
|
+
Provides-Extra: rtcompat
|
|
35
|
+
Requires-Dist: polars[rtcompat]>=1.39.0; extra == 'rtcompat'
|
|
27
36
|
Description-Content-Type: text/markdown
|
|
28
37
|
|
|
29
38
|
# Tablassert
|
|
@@ -61,6 +70,12 @@ uv tool install tablassert
|
|
|
61
70
|
# Option B: pip install
|
|
62
71
|
pip install tablassert
|
|
63
72
|
|
|
73
|
+
# Option C: runtime-compatible Polars build
|
|
74
|
+
# (for CPUs without the required Polars instructions)
|
|
75
|
+
uv tool install "tablassert[rtcompat]"
|
|
76
|
+
# or
|
|
77
|
+
pip install "tablassert[rtcompat]"
|
|
78
|
+
|
|
64
79
|
tablassert --help
|
|
65
80
|
```
|
|
66
81
|
|
|
@@ -90,12 +105,18 @@ uv run tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
|
90
105
|
Recommended for most users.
|
|
91
106
|
|
|
92
107
|
```bash
|
|
93
|
-
# Option A:
|
|
108
|
+
# Option A: standard install (UV)
|
|
94
109
|
uv tool install tablassert
|
|
95
110
|
|
|
96
|
-
# Option B:
|
|
111
|
+
# Option B: standard install (pip)
|
|
97
112
|
pip install tablassert
|
|
98
113
|
|
|
114
|
+
# Option C: runtime-compatible Polars build
|
|
115
|
+
# (for CPUs without the required Polars instructions)
|
|
116
|
+
uv tool install "tablassert[rtcompat]"
|
|
117
|
+
# or
|
|
118
|
+
pip install "tablassert[rtcompat]"
|
|
119
|
+
|
|
99
120
|
tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
100
121
|
```
|
|
101
122
|
|
|
@@ -108,6 +129,9 @@ uv tool install git+https://github.com/SkyeAv/Tablassert.git@main
|
|
|
108
129
|
tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
109
130
|
```
|
|
110
131
|
|
|
132
|
+
If your CPU does not support the instructions required by default Polars builds,
|
|
133
|
+
use **Method 2** with `tablassert[rtcompat]`.
|
|
134
|
+
|
|
111
135
|
### Method 4: Local source install
|
|
112
136
|
|
|
113
137
|
For contributors testing local changes.
|
|
@@ -33,6 +33,12 @@ uv tool install tablassert
|
|
|
33
33
|
# Option B: pip install
|
|
34
34
|
pip install tablassert
|
|
35
35
|
|
|
36
|
+
# Option C: runtime-compatible Polars build
|
|
37
|
+
# (for CPUs without the required Polars instructions)
|
|
38
|
+
uv tool install "tablassert[rtcompat]"
|
|
39
|
+
# or
|
|
40
|
+
pip install "tablassert[rtcompat]"
|
|
41
|
+
|
|
36
42
|
tablassert --help
|
|
37
43
|
```
|
|
38
44
|
|
|
@@ -62,12 +68,18 @@ uv run tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
|
62
68
|
Recommended for most users.
|
|
63
69
|
|
|
64
70
|
```bash
|
|
65
|
-
# Option A:
|
|
71
|
+
# Option A: standard install (UV)
|
|
66
72
|
uv tool install tablassert
|
|
67
73
|
|
|
68
|
-
# Option B:
|
|
74
|
+
# Option B: standard install (pip)
|
|
69
75
|
pip install tablassert
|
|
70
76
|
|
|
77
|
+
# Option C: runtime-compatible Polars build
|
|
78
|
+
# (for CPUs without the required Polars instructions)
|
|
79
|
+
uv tool install "tablassert[rtcompat]"
|
|
80
|
+
# or
|
|
81
|
+
pip install "tablassert[rtcompat]"
|
|
82
|
+
|
|
71
83
|
tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
72
84
|
```
|
|
73
85
|
|
|
@@ -80,6 +92,9 @@ uv tool install git+https://github.com/SkyeAv/Tablassert.git@main
|
|
|
80
92
|
tablassert build-knowledge-graph /path/to/graph-config.yaml
|
|
81
93
|
```
|
|
82
94
|
|
|
95
|
+
If your CPU does not support the instructions required by default Polars builds,
|
|
96
|
+
use **Method 2** with `tablassert[rtcompat]`.
|
|
97
|
+
|
|
83
98
|
### Method 4: Local source install
|
|
84
99
|
|
|
85
100
|
For contributors testing local changes.
|
|
@@ -13,11 +13,13 @@ def version4(
|
|
|
13
13
|
lf: pl.LazyFrame,
|
|
14
14
|
col: str,
|
|
15
15
|
conn: object,
|
|
16
|
-
taxon: Optional[str],
|
|
17
|
-
prioritize: Optional[list[Categories]],
|
|
18
|
-
avoid: Optional[list[Categories]],
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
taxon: Optional[str] = None,
|
|
17
|
+
prioritize: Optional[list[Categories]] = None,
|
|
18
|
+
avoid: Optional[list[Categories]] = None,
|
|
19
|
+
log: bool = True,
|
|
20
|
+
section_hash: Optional[str] = None,
|
|
21
|
+
config_file: Optional[str] = None,
|
|
22
|
+
column_context: bool = True,
|
|
21
23
|
tag: str = " one"
|
|
22
24
|
) -> pl.LazyFrame
|
|
23
25
|
```
|
|
@@ -61,6 +63,18 @@ Optional list of Biolink categories to exclude from results.
|
|
|
61
63
|
|
|
62
64
|
Example: `[Categories.Gene]` prevents gene mappings.
|
|
63
65
|
|
|
66
|
+
**`log: bool` (default: `True`)**
|
|
67
|
+
|
|
68
|
+
Controls unmatched-value logging. When enabled, unresolved terms are logged with section/config/column context.
|
|
69
|
+
|
|
70
|
+
**`section_hash: Optional[str]` / `config_file: Optional[str]`**
|
|
71
|
+
|
|
72
|
+
Optional context fields used for operational logging when unmatched values are encountered.
|
|
73
|
+
|
|
74
|
+
**`column_context: bool` (default: `True`)**
|
|
75
|
+
|
|
76
|
+
Controls category-frequency tie-breaking when multiple matches exist for a term. When `True`, the query result adds a category frequency score and prefers more frequent category hits.
|
|
77
|
+
|
|
64
78
|
**`tag: str` (default: `" one"`)**
|
|
65
79
|
|
|
66
80
|
Suffix for NLP processing level column.
|
|
@@ -71,10 +85,6 @@ The function looks for both:
|
|
|
71
85
|
|
|
72
86
|
Default `" one"` means it uses level-one text processing (lowercase, stripped).
|
|
73
87
|
|
|
74
|
-
**`section_hash: str` / `config_file: str`**
|
|
75
|
-
|
|
76
|
-
Context fields used for operational logging when unmatched values are encountered.
|
|
77
|
-
|
|
78
88
|
### Return Value
|
|
79
89
|
|
|
80
90
|
Returns a Polars LazyFrame with these columns added:
|
|
@@ -91,25 +101,26 @@ Returns a Polars LazyFrame with these columns added:
|
|
|
91
101
|
|
|
92
102
|
### DuckDB Query
|
|
93
103
|
|
|
94
|
-
The function executes a
|
|
104
|
+
The function executes a SQL query that:
|
|
105
|
+
|
|
106
|
+
1. **Builds an in-memory term table** by collecting distinct terms from both NLP levels and registering them in DuckDB as `PARQUET` via `conn.register("PARQUET", df.to_arrow())`.
|
|
95
107
|
|
|
96
|
-
|
|
108
|
+
2. **Ranks matches** by:
|
|
97
109
|
- Category priority (if `prioritize` specified)
|
|
98
110
|
- NLP level (exact case match preferred over normalized)
|
|
99
|
-
-
|
|
111
|
+
- Category frequency (if `column_context=True`)
|
|
100
112
|
|
|
101
|
-
|
|
113
|
+
3. **Filters by:**
|
|
102
114
|
- Taxon ID (if specified)
|
|
103
115
|
- Category avoidance (if specified)
|
|
104
116
|
|
|
105
|
-
|
|
117
|
+
4. **Deduplicates** to one CURIE per input string
|
|
106
118
|
|
|
107
119
|
### Example Usage
|
|
108
120
|
|
|
109
121
|
```python
|
|
110
122
|
from tablassert.fullmap import version4
|
|
111
123
|
from tablassert.enums import Categories
|
|
112
|
-
from pathlib import Path
|
|
113
124
|
import duckdb
|
|
114
125
|
import polars as pl
|
|
115
126
|
|
|
@@ -127,8 +138,10 @@ result = version4(
|
|
|
127
138
|
taxon="9606", # Human only
|
|
128
139
|
prioritize=[Categories.Gene],
|
|
129
140
|
avoid=[Categories.Protein],
|
|
141
|
+
log=True,
|
|
130
142
|
section_hash="tutorial-section",
|
|
131
143
|
config_file="tutorial-table.yaml",
|
|
144
|
+
column_context=True,
|
|
132
145
|
tag=" one"
|
|
133
146
|
)
|
|
134
147
|
|
|
@@ -72,28 +72,26 @@ original == preferred_name
|
|
|
72
72
|
|
|
73
73
|
**Performance:** O(1) string comparison
|
|
74
74
|
|
|
75
|
+
Before fuzzy matching, the function also applies rule-based pass-through checks for known safe patterns (for example CHEBI/PR/UniProtKB CURIE families and selected exception prefixes).
|
|
76
|
+
|
|
75
77
|
#### Stage 2: Fuzzy Matching
|
|
76
78
|
|
|
77
79
|
**Medium confidence using RapidFuzz.**
|
|
78
80
|
|
|
79
|
-
|
|
81
|
+
Two fuzzy matching algorithms:
|
|
80
82
|
1. **Ratio:** Overall string similarity
|
|
81
|
-
2. **Partial ratio:**
|
|
82
|
-
3. **Token sort ratio:** Order-independent word matching
|
|
83
|
-
4. **Partial token sort ratio:** Combined approach
|
|
83
|
+
2. **Partial token sort ratio:** Combined token/subsequence matching
|
|
84
84
|
|
|
85
85
|
**Threshold:** Default 20% similarity (configurable)
|
|
86
86
|
|
|
87
87
|
```python
|
|
88
88
|
fuzz.ratio(original, preferred) >= 20
|
|
89
|
-
or fuzz.ratio(original, curie) >= 20
|
|
90
89
|
or fuzz.partial_token_sort_ratio(original, preferred) >= 20
|
|
91
|
-
or fuzz.partial_token_sort_ratio(original, curie) >= 20
|
|
92
90
|
```
|
|
93
91
|
|
|
94
92
|
**Example passes:**
|
|
95
93
|
- Original: `"breast ca"` → Preferred: `"breast cancer"` ✓
|
|
96
|
-
- Original: `"T53"` →
|
|
94
|
+
- Original: `"T53"` → Preferred: `"tumor protein p53"` ✗ (goes to Stage 3)
|
|
97
95
|
|
|
98
96
|
**Performance:** O(n) string operations, cached via `@DISKCACHE.memoize()`
|
|
99
97
|
|
|
@@ -128,7 +126,7 @@ return similarity >= 0.2
|
|
|
128
126
|
- ONNX session caching
|
|
129
127
|
- Disk cache for embeddings (~100MB LRU)
|
|
130
128
|
|
|
131
|
-
|
|
129
|
+
Lazy-loaded on first `BERT_audit()` call, then reused for subsequent calls.
|
|
132
130
|
|
|
133
131
|
### Disk Caching
|
|
134
132
|
|
|
@@ -142,7 +140,7 @@ def fuzz_audit(...): ...
|
|
|
142
140
|
def BERT_audit(...): ...
|
|
143
141
|
```
|
|
144
142
|
|
|
145
|
-
**Cache location:** `
|
|
143
|
+
**Cache location:** `./.cachassert` directory
|
|
146
144
|
|
|
147
145
|
**Cache strategy:** LRU eviction when size exceeds limit
|
|
148
146
|
|
|
@@ -40,7 +40,7 @@ Final output files are written to the current working directory as:
|
|
|
40
40
|
- `{name}_{version}.nodes.ndjson` - Node file (entities)
|
|
41
41
|
- `{name}_{version}.edges.ndjson` - Edge file (relationships)
|
|
42
42
|
|
|
43
|
-
Intermediate parquet artifacts are written to
|
|
43
|
+
Intermediate parquet artifacts are written to `.storassert/` during section processing.
|
|
44
44
|
|
|
45
45
|
See [Graph Configuration](configuration/graph.md) for details on the YAML schema.
|
|
46
46
|
|
|
@@ -30,11 +30,21 @@ tablassert --help
|
|
|
30
30
|
pip install tablassert
|
|
31
31
|
tablassert --help
|
|
32
32
|
|
|
33
|
+
# Install runtime-compatible Polars build
|
|
34
|
+
# (for CPUs without the required Polars instructions)
|
|
35
|
+
uv tool install "tablassert[rtcompat]"
|
|
36
|
+
# or
|
|
37
|
+
pip install "tablassert[rtcompat]"
|
|
38
|
+
tablassert --help
|
|
39
|
+
|
|
33
40
|
# Or install latest from GitHub main
|
|
34
41
|
uv tool install git+https://github.com/SkyeAv/Tablassert.git@main
|
|
35
42
|
tablassert --help
|
|
36
43
|
```
|
|
37
44
|
|
|
45
|
+
`tablassert[rtcompat]` is defined in `pyproject.toml` and installs a runtime-compatible
|
|
46
|
+
Polars dependency for systems whose CPUs lack the instructions required by default Polars builds.
|
|
47
|
+
|
|
38
48
|
For development from source:
|
|
39
49
|
|
|
40
50
|
```bash
|
|
@@ -45,6 +45,8 @@ This creates a virtual environment in `.venv/` and installs all dependencies. Th
|
|
|
45
45
|
### Method 2: Install from PyPI
|
|
46
46
|
|
|
47
47
|
Recommended for most users who just need the CLI.
|
|
48
|
+
`pyproject.toml` also defines `tablassert[rtcompat]`, which installs runtime-compatible
|
|
49
|
+
Polars for systems whose CPUs lack the instructions required by default Polars builds.
|
|
48
50
|
|
|
49
51
|
```bash
|
|
50
52
|
# Option A: Install from PyPI with UV
|
|
@@ -53,6 +55,12 @@ uv tool install tablassert
|
|
|
53
55
|
# Option B: Install from PyPI with pip
|
|
54
56
|
pip install tablassert
|
|
55
57
|
|
|
58
|
+
# Option C: Install runtime-compatible Polars build
|
|
59
|
+
# (for CPUs without the required Polars instructions)
|
|
60
|
+
uv tool install "tablassert[rtcompat]"
|
|
61
|
+
# or
|
|
62
|
+
pip install "tablassert[rtcompat]"
|
|
63
|
+
|
|
56
64
|
# Tablassert CLI is now available
|
|
57
65
|
tablassert --help
|
|
58
66
|
```
|
|
@@ -156,3 +164,15 @@ If you encounter dependency installation issues, try:
|
|
|
156
164
|
uv cache clean
|
|
157
165
|
uv sync --reinstall
|
|
158
166
|
```
|
|
167
|
+
|
|
168
|
+
### Polars CPU Instruction Issues
|
|
169
|
+
|
|
170
|
+
If your machine does not support the CPU instructions required by default Polars
|
|
171
|
+
builds, install Tablassert with the runtime-compat package extra from
|
|
172
|
+
`pyproject.toml`:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
uv tool install "tablassert[rtcompat]"
|
|
176
|
+
# or
|
|
177
|
+
pip install "tablassert[rtcompat]"
|
|
178
|
+
```
|
|
@@ -1,8 +1,25 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "tablassert"
|
|
3
|
-
version = "7.0.0"
|
|
4
|
-
description = "
|
|
3
|
+
version = "7.0.2"
|
|
4
|
+
description = "Tablassert is a highly performant declarative knowledge graph backend designed to extract knowledge assertions from tabular data while exporting NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON."
|
|
5
|
+
authors = [
|
|
6
|
+
{ name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" }
|
|
7
|
+
]
|
|
8
|
+
keywords = [
|
|
9
|
+
"knowledge graph",
|
|
10
|
+
"natural language processing",
|
|
11
|
+
"ner",
|
|
12
|
+
"ncats translator",
|
|
13
|
+
"yaml configuration",
|
|
14
|
+
"table mining",
|
|
15
|
+
"declarative pipeline",
|
|
16
|
+
"tablassert"
|
|
17
|
+
]
|
|
5
18
|
readme = "README.md"
|
|
19
|
+
license = "Apache-2.0"
|
|
20
|
+
classifiers = [
|
|
21
|
+
"License :: OSI Approved :: Apache Software License",
|
|
22
|
+
]
|
|
6
23
|
requires-python = ">=3.13"
|
|
7
24
|
dependencies = [
|
|
8
25
|
"diskcache>=5.6.3",
|
|
@@ -27,6 +44,11 @@ dependencies = [
|
|
|
27
44
|
"xxhash>=3.6.0",
|
|
28
45
|
]
|
|
29
46
|
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/SkyeAv/Tablassert"
|
|
49
|
+
Source = "https://github.com/SkyeAv/Tablassert"
|
|
50
|
+
Documentation = "https://skyeav.github.io/Tablassert/"
|
|
51
|
+
|
|
30
52
|
[build-system]
|
|
31
53
|
requires = ["hatchling"]
|
|
32
54
|
build-backend = "hatchling.build"
|
|
@@ -37,6 +59,11 @@ packages = ["./src/tablassert"]
|
|
|
37
59
|
[project.scripts]
|
|
38
60
|
tablassert = "tablassert.lib:CLI"
|
|
39
61
|
|
|
62
|
+
[project.optional-dependencies]
|
|
63
|
+
rtcompat = [
|
|
64
|
+
"polars[rtcompat]>=1.39.0",
|
|
65
|
+
]
|
|
66
|
+
|
|
40
67
|
[dependency-groups]
|
|
41
68
|
dev = [
|
|
42
69
|
"mkdocs>=1.6.1",
|
|
@@ -48,7 +75,7 @@ dev = [
|
|
|
48
75
|
|
|
49
76
|
[tool.ruff]
|
|
50
77
|
line-length = 120
|
|
51
|
-
indent-width =
|
|
78
|
+
indent-width = 4
|
|
52
79
|
target-version = "py313"
|
|
53
80
|
|
|
54
81
|
[tool.ruff.format]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from time import sleep
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import pyexcel
|
|
6
|
+
from playwright.sync_api import sync_playwright
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def modernize_xls(p: Path) -> Path:
    """Save the workbook at ``p`` as a sibling ``.xlsx`` file and return its path.

    The destination keeps the directory and stem of ``p`` and only swaps the
    suffix; the source file itself is not modified or removed by this function.
    """
    target = p.with_suffix(".xlsx")
    source_name, target_name = str(p), str(target)
    pyexcel.save_book_as(file_name=source_name, dest_file_name=target_name)
    return target
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def from_url(website: str, p: Path, timeout: int = 60_000, retries: int = 3) -> Path:
    """Download a file through headless Chromium and store it at ``p``.

    If ``p`` already exists as a file it is returned untouched and no browser is
    started. Otherwise the page at ``website`` is opened with Playwright, a
    download event is awaited, and the downloaded file is saved to ``p``.

    Args:
        website: URL to navigate to.
        p: Destination path; missing parent directories are created.
        timeout: Timeout handed to both the navigation and the download wait
            (Playwright interprets these values as milliseconds).
        retries: Maximum number of attempts. Failed attempts are separated by a
            back-off of ``2**attempt`` seconds; no sleep follows the last one.

    Returns:
        ``p`` once the download has been saved (or if it already existed).

    Raises:
        RuntimeError: If every attempt fails. The last underlying exception is
            attached as ``__cause__`` so its traceback is not lost.
    """
    p.parent.mkdir(parents=True, exist_ok=True)
    if p.is_file():
        return p

    last: Optional[Exception] = None
    for attempt in range(retries):
        try:
            with sync_playwright() as pw:
                browser = pw.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    page.goto(website, wait_until="networkidle", timeout=timeout)
                    with page.expect_download(timeout=timeout) as info:
                        download = info.value
                    download.save_as(p)
                finally:
                    # Close the browser on failure too, instead of relying on
                    # the Playwright context teardown to reap it.
                    browser.close()
                return p
        except Exception as e:
            # Deliberately broad: any failure of an attempt is retried.
            last = e
            if attempt < retries - 1:
                sleep(2**attempt)

    # Chain the last failure so callers can inspect the root cause.
    raise RuntimeError(f"01 | Download Failed After {retries} Attempts: {last}") from last
|