tablassert 7.1.0__tar.gz → 7.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablassert-7.2.0/.github/workflows/autotag.yml +24 -0
- tablassert-7.2.0/.github/workflows/docker.yml +27 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/.github/workflows/pipy.yml +3 -3
- {tablassert-7.1.0 → tablassert-7.2.0}/.gitignore +3 -3
- tablassert-7.2.0/.pre-commit-config.yaml +21 -0
- tablassert-7.2.0/AGENTS.md +170 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/CHANGELOG.md +24 -4
- tablassert-7.2.0/CITATION.cff +34 -0
- tablassert-7.2.0/CONTRIBUTING.md +266 -0
- tablassert-7.2.0/Dockerfile +6 -0
- tablassert-7.2.0/PKG-INFO +113 -0
- tablassert-7.2.0/README.md +66 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/api/fullmap.md +73 -33
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/api/qc.md +7 -12
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/cli.md +31 -3
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/configuration/advanced-example.md +164 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/configuration/graph.md +8 -8
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/configuration/table.md +2 -2
- tablassert-7.2.0/docs/datassert.md +66 -0
- tablassert-7.2.0/docs/docker.md +90 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/examples/tutorial-graph.yaml +1 -1
- tablassert-7.2.0/docs/examples.md +352 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/index.md +7 -7
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/installation.md +41 -13
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/tutorial.md +3 -3
- tablassert-7.2.0/llms.txt +48 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/mkdocs.yml +3 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/pyproject.toml +26 -9
- tablassert-7.2.0/src/tablassert/cli.py +127 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/fullmap.py +75 -33
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/lib.py +12 -132
- tablassert-7.2.0/src/tablassert/models.py +216 -0
- tablassert-7.2.0/src/tablassert/nlp.py +28 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/qc.py +27 -35
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/utils.py +0 -7
- tablassert-7.2.0/tests/__init__.py +0 -0
- tablassert-7.2.0/tests/conftest.py +10 -0
- tablassert-7.2.0/tests/fixtures/invalid_section_missing_source.yaml +15 -0
- tablassert-7.2.0/tests/fixtures/minimal_section.yaml +20 -0
- tablassert-7.2.0/tests/fixtures/minimal_section_with_sections.yaml +41 -0
- tablassert-7.2.0/tests/test_enums.py +179 -0
- tablassert-7.2.0/tests/test_fullmap.py +59 -0
- tablassert-7.2.0/tests/test_ingests.py +111 -0
- tablassert-7.2.0/tests/test_lib.py +118 -0
- tablassert-7.2.0/tests/test_models.py +265 -0
- tablassert-7.2.0/tests/test_nlp.py +64 -0
- tablassert-7.2.0/tests/test_utils.py +80 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/uv.lock +922 -317
- tablassert-7.1.0/.pre-commit-config.yaml +0 -15
- tablassert-7.1.0/.python-version +0 -1
- tablassert-7.1.0/.vscode/settings.json +0 -23
- tablassert-7.1.0/PKG-INFO +0 -166
- tablassert-7.1.0/README.md +0 -128
- tablassert-7.1.0/src/tablassert/models.py +0 -134
- {tablassert-7.1.0 → tablassert-7.2.0}/.github/workflows/docs.yml +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/LICENSE +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/api/utils.md +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/examples/tutorial-data.csv +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/docs/examples/tutorial-table.yaml +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/__init__.py +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/downloader.py +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/enums.py +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/ingests.py +0 -0
- {tablassert-7.1.0 → tablassert-7.2.0}/src/tablassert/log.py +0 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
name: Auto Tag Versions
|
|
2
|
+
on:
|
|
3
|
+
workflow_run:
|
|
4
|
+
workflows:
|
|
5
|
+
- "Deploy to PyPI"
|
|
6
|
+
types:
|
|
7
|
+
- completed
|
|
8
|
+
branches:
|
|
9
|
+
- main
|
|
10
|
+
permissions:
|
|
11
|
+
contents: write
|
|
12
|
+
jobs:
|
|
13
|
+
autotag:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
with:
|
|
18
|
+
fetch-depth: 0
|
|
19
|
+
- name: Autotag using pyproject.toml
|
|
20
|
+
uses: butlerlogic/action-autotag@1.0.1
|
|
21
|
+
with:
|
|
22
|
+
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
|
|
23
|
+
tag_prefix: "v"
|
|
24
|
+
strategy: python
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Publish Docker Image
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
tags:
|
|
5
|
+
- "v*"
|
|
6
|
+
jobs:
|
|
7
|
+
publish:
|
|
8
|
+
runs-on: ubuntu-latest
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
packages: write
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: docker/setup-buildx-action@v3
|
|
15
|
+
- uses: docker/login-action@v3
|
|
16
|
+
with:
|
|
17
|
+
registry: ghcr.io
|
|
18
|
+
username: ${{ github.actor }}
|
|
19
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
20
|
+
- uses: docker/build-push-action@v6
|
|
21
|
+
with:
|
|
22
|
+
context: .
|
|
23
|
+
file: ./Dockerfile
|
|
24
|
+
push: true
|
|
25
|
+
tags: |
|
|
26
|
+
ghcr.io/${{ github.repository_owner }}/tablassert:latest
|
|
27
|
+
ghcr.io/${{ github.repository_owner }}/tablassert:${{ github.ref_name }}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
name: Deploy to PyPI
|
|
2
2
|
on:
|
|
3
3
|
push:
|
|
4
|
-
branches:
|
|
5
|
-
|
|
4
|
+
branches:
|
|
5
|
+
- main
|
|
6
6
|
jobs:
|
|
7
7
|
publish:
|
|
8
8
|
runs-on: ubuntu-latest
|
|
@@ -16,4 +16,4 @@ jobs:
|
|
|
16
16
|
- name: Build package
|
|
17
17
|
run: uv build
|
|
18
18
|
- name: Publish to PyPI
|
|
19
|
-
uses: pypa/gh-action-pypi-publish@release/v1
|
|
19
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
*.egg-info
|
|
2
|
+
*.python-version
|
|
2
3
|
*__pycache__/
|
|
3
4
|
*.logs/
|
|
4
5
|
*.opencode/
|
|
5
6
|
*.ruff_cache/
|
|
6
7
|
*.pytest_cache/
|
|
7
8
|
*plans/
|
|
8
|
-
*
|
|
9
|
-
*.claude/
|
|
10
|
-
*.cachassert/
|
|
9
|
+
*specs/
|
|
11
10
|
*.storassert/
|
|
12
11
|
*.logassert/
|
|
13
12
|
*DATALAKE/
|
|
14
13
|
*.onnxassert/
|
|
14
|
+
*.claude/
|
|
15
15
|
*.envrc
|
|
16
16
|
*venv/
|
|
17
17
|
*.log
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.9.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: local
|
|
9
|
+
hooks:
|
|
10
|
+
- id: pyright
|
|
11
|
+
name: pyright
|
|
12
|
+
entry: uv run pyright
|
|
13
|
+
language: system
|
|
14
|
+
types: [python]
|
|
15
|
+
pass_filenames: false
|
|
16
|
+
- id: pytest
|
|
17
|
+
name: pytest
|
|
18
|
+
entry: uv run pytest
|
|
19
|
+
language: system
|
|
20
|
+
types: [python]
|
|
21
|
+
pass_filenames: false
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# AGENTS.md — Tablassert
|
|
2
|
+
|
|
3
|
+
Guidance for AI coding agents working in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
Tablassert is a Python package (>=3.11) for tabular data assertion, normalization, and quality control. It builds declarative knowledge graphs from tabular data, exporting NCATS Translator-compliant KGX NDJSON. Uses **Polars** DataFrames, **DuckDB** for entity resolution, and **ONNX/BioBERT** for quality control. CLI built with **Typer**. Models built with **Pydantic v2**.
|
|
8
|
+
|
|
9
|
+
## Quick Reference
|
|
10
|
+
|
|
11
|
+
| Task | Command |
|
|
12
|
+
|---|---|
|
|
13
|
+
| Install | `uv sync` |
|
|
14
|
+
| Run CLI | `uv run tablassert` |
|
|
15
|
+
| Lint | `uv run ruff check .` |
|
|
16
|
+
| Lint (fix) | `uv run ruff check --fix .` |
|
|
17
|
+
| Format | `uv run ruff format .` |
|
|
18
|
+
| Format check | `uv run ruff format --check .` |
|
|
19
|
+
| Type check | `uv run pyright` |
|
|
20
|
+
| All checks | `uv run pre-commit run --all-files` |
|
|
21
|
+
| Run all tests | `uv run pytest` |
|
|
22
|
+
| Run single test | `uv run pytest tests/test_foo.py::test_name` |
|
|
23
|
+
| Run by keyword | `uv run pytest -k "test_pattern"` |
|
|
24
|
+
| Run with print | `uv run pytest -s tests/test_foo.py` |
|
|
25
|
+
| Build | `uv build` |
|
|
26
|
+
| Build docs | `uv run --group dev mkdocs build` |
|
|
27
|
+
| Add dependency | `uv add <package>` |
|
|
28
|
+
| Add dev dependency | `uv add --group dev <package>` |
|
|
29
|
+
|
|
30
|
+
## Repository Structure
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
src/tablassert/
|
|
34
|
+
cli.py # Typer CLI (entry point: tablassert.cli:CLI)
|
|
35
|
+
lib.py # Core logic: encodings, data loading, Tcode(Section) class
|
|
36
|
+
models.py # Pydantic v2 models (TablaBase base class)
|
|
37
|
+
enums.py # str, Enum subclasses (Tokens, Repositories, Comparisons, etc.)
|
|
38
|
+
fullmap.py # NER / entity resolution (DuckDB, 16 shards)
|
|
39
|
+
qc.py # Quality control (ONNX/BioBERT, sentence_transformers)
|
|
40
|
+
nlp.py # Text normalization (level_one: strip+lowercase, level_two: regex)
|
|
41
|
+
ingests.py # YAML ingestion: from_yaml(), to_sections(), fastmerge()
|
|
42
|
+
downloader.py # Playwright-based file downloads with retries
|
|
43
|
+
utils.py # Hashing (xxhash), STORE path, namespace UUIDs
|
|
44
|
+
log.py # loguru logger → .logassert/logassert.log
|
|
45
|
+
__init__.py # Empty file (lazy loading is per-module, not here)
|
|
46
|
+
docs/ # MkDocs documentation source
|
|
47
|
+
mkdocs.yml # MkDocs configuration
|
|
48
|
+
pyproject.toml # Project config, dependencies, tool settings
|
|
49
|
+
tests/ # Test directory (at repo root)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- `conftest.py` provides a `fixtures_path` fixture returning `Path(__file__).parent / "fixtures"`.
|
|
53
|
+
- pytest configured via `pyproject.toml` `[tool.pytest.ini_options]` with `testpaths = ["tests"]`.
|
|
54
|
+
- Test fixtures: `tests/fixtures/` contains YAML files for Section model tests.
|
|
55
|
+
- Test modules: `test_enums.py`, `test_fullmap.py`, `test_ingests.py`, `test_lib.py`, `test_models.py`, `test_nlp.py`, `test_utils.py`.
|
|
56
|
+
|
|
57
|
+
## Code Style
|
|
58
|
+
|
|
59
|
+
### Imports
|
|
60
|
+
|
|
61
|
+
- Every file starts with `from __future__ import annotations`
|
|
62
|
+
- Heavy dependencies are loaded **lazily per-module** using this pattern:
|
|
63
|
+
```python
|
|
64
|
+
from typing import TYPE_CHECKING
|
|
65
|
+
import lazy_loader as Lazy
|
|
66
|
+
|
|
67
|
+
if TYPE_CHECKING:
|
|
68
|
+
import polars as pl
|
|
69
|
+
else:
|
|
70
|
+
pl = Lazy.load("polars")
|
|
71
|
+
```
|
|
72
|
+
- Lazy-loaded deps: `polars`, `duckdb`, `orjson`, `typer`, `xxhash`, `polars_hash`, `yaml`
|
|
73
|
+
- Direct (non-lazy) heavy deps: `sqlite_utils`, `rapidfuzz`, `pydantic`, `loguru`, `yaml.CLoader`
|
|
74
|
+
- Previously-optional deps now in core: `sentence_transformers`, `onnxruntime`, `sklearn`, `playwright`, `pyexcel` — lazy-loaded when present
|
|
75
|
+
- Some modules mix direct and lazy imports for the same package (e.g., `ingests.py` does `from yaml import CLoader` directly, then lazy-loads `yaml` for `yaml.load()`)
|
|
76
|
+
- Import order: standard library → blank line → third-party → blank line → local
|
|
77
|
+
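- The `from __future__ import annotations` line defers annotation evaluation, so names imported only under `TYPE_CHECKING` (such as lazily loaded modules) can still be used in type annotations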
|
|
78
|
+
|
|
79
|
+
### Type Annotations
|
|
80
|
+
|
|
81
|
+
- **Every variable** gets a type annotation, including locals: `col: str = "name"`, `df: pl.DataFrame = ...`
|
|
82
|
+
- Use `Optional[T]` and `Union[...]` (not `T | None` or `X | Y`)
|
|
83
|
+
- Use `Self` for class methods returning the class type
|
|
84
|
+
- Use `Path` (not `str`) for filesystem paths
|
|
85
|
+
- Use `# pyright: ignore` comments to suppress false positives from lazy-loaded modules
|
|
86
|
+
|
|
87
|
+
### Pydantic Models
|
|
88
|
+
|
|
89
|
+
- All models inherit from `TablaBase(BaseModel)` which sets:
|
|
90
|
+
```python
|
|
91
|
+
model_config: ConfigDict = ConfigDict( # pyright: ignore
|
|
92
|
+
str_strip_whitespace=False,
|
|
93
|
+
validate_assignment=True,
|
|
94
|
+
use_enum_values=True,
|
|
95
|
+
extra="forbid",
|
|
96
|
+
populate_by_name=True,
|
|
97
|
+
)
|
|
98
|
+
```
|
|
99
|
+
- Required fields: `Field(...)` (ellipsis sentinel)
|
|
100
|
+
- Optional fields: `Optional[T] = Field(None)`
|
|
101
|
+
- All enums are `str, Enum` subclasses (defined in `enums.py`)
|
|
102
|
+
|
|
103
|
+
### Enums
|
|
104
|
+
|
|
105
|
+
All enums live in `enums.py` and extend `str, Enum`. Key enums: `Tokens`, `Repositories`, `Contributions`, `Comparisons`, `Functions`, `Files`, `EncodingMethods`, `FillMethods`, `Syntaxes`, `Statuses`, `Categories`, `Predicates`, `Qualifiers`.
|
|
106
|
+
|
|
107
|
+
### Naming
|
|
108
|
+
|
|
109
|
+
- Functions/variables: `snake_case`
|
|
110
|
+
- Classes: `PascalCase`
|
|
111
|
+
- Module-level constants: `UPPER_CASE`
|
|
112
|
+
|
|
113
|
+
### Comments
|
|
114
|
+
|
|
115
|
+
- `# ?` — descriptions / clarifications
|
|
116
|
+
- `# !` — warnings / important notes
|
|
117
|
+
- `# *` — stage markers (pipeline steps)
|
|
118
|
+
- `# TODO:` — todos
|
|
119
|
+
- No docstrings on functions; use `# ?` comment on the line above instead
|
|
120
|
+
|
|
121
|
+
### Formatting (enforced by ruff)
|
|
122
|
+
|
|
123
|
+
- Line length: **120**
|
|
124
|
+
- Quote style: **double quotes**
|
|
125
|
+
- Indent: **4 spaces**
|
|
126
|
+
- `skip-magic-trailing-comma = true`
|
|
127
|
+
- Target: Python >=3.11
|
|
128
|
+
|
|
129
|
+
### Error Handling
|
|
130
|
+
|
|
131
|
+
- Use `RuntimeError` for exceptional cases (no custom exception classes currently)
|
|
132
|
+
- Use `logger.warning()` for non-fatal issues (e.g., empty subgraphs)
|
|
133
|
+
- Logger: `from tablassert.log import logger`
|
|
134
|
+
|
|
135
|
+
### Other Conventions
|
|
136
|
+
|
|
137
|
+
- `operator.add` for Polars string concatenation on columns (not `+` directly)
|
|
138
|
+
- CLI entry point: `tablassert.cli:CLI` (Typer app with `pretty_exceptions_show_locals=False`)
|
|
139
|
+
- Use `rich.progress` for progress tracking in CLI
|
|
140
|
+
- Data side-effects stored in hidden directories: `.logassert/`, `.storassert/`, `.onnxassert/`
|
|
141
|
+
|
|
142
|
+
## Tools
|
|
143
|
+
|
|
144
|
+
- **ruff** — linting (`ruff check`) and formatting (`ruff format`)
|
|
145
|
+
- **pyright** — type checking (no pyrightconfig.json; uses defaults)
|
|
146
|
+
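- **pre-commit** — runs `ruff --fix`, `ruff-format`, `pyright`, and `pytest` when Python files change (pyright and pytest run project-wide rather than per file, since `pass_filenames` is `false`)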
|
|
147
|
+
- **pytest** — testing (>=9.0.2)
|
|
148
|
+
- **uv** — package manager (use `uv run` for all commands, `uv add` for deps)
|
|
149
|
+
- **hatchling** — build backend
|
|
150
|
+
|
|
151
|
+
## Optional Dependency Groups
|
|
152
|
+
|
|
153
|
+
Defined in `pyproject.toml` `[project.optional-dependencies]`:
|
|
154
|
+
- `rtcompat` — `polars[rtcompat]` (runtime-compatible Polars build for CPUs without required instructions)
|
|
155
|
+
- `rt` — alias for `rtcompat`
|
|
156
|
+
|
|
157
|
+
All other dependencies (ML, web, Excel) are now in core `dependencies`.
|
|
158
|
+
|
|
159
|
+
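Install with: `uv sync` or `pip install tablassert`. To include the optional group, use `uv sync --extra rtcompat` or `pip install "tablassert[rtcompat]"`.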
|
|
160
|
+
|
|
161
|
+
## CI Workflows
|
|
162
|
+
|
|
163
|
+
- **PyPI publish** (`.github/workflows/pipy.yml`): builds and publishes on push to `main`
|
|
164
|
+
- **MkDocs deploy** (`.github/workflows/docs.yml`): builds docs and deploys to GitHub Pages on push to `main`
|
|
165
|
+
- **Docker publish** (`.github/workflows/docker.yml`): builds and pushes image to GHCR on tag push (`v*`)
|
|
166
|
+
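- **Autotag** (`.github/workflows/autotag.yml`): after the "Deploy to PyPI" workflow completes on `main`, tags the repository with the version from `pyproject.toml`, prefixed with `v`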
|
|
167
|
+
|
|
168
|
+
## Key Dependencies
|
|
169
|
+
|
|
170
|
+
polars, duckdb, orjson, pydantic, typer, xxhash, loguru, rapidfuzz, scikit-learn, sqlite-utils, pyyaml, lazy-loader, polars-hash, fastexcel, pyarrow, optimum-onnx
|
|
@@ -2,24 +2,44 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project are documented in this file.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## 7.2.0 - 2026-03-31
|
|
6
|
+
|
|
7
|
+
### New Features
|
|
8
|
+
- Added `tablassert version` command to display current package version.
|
|
9
|
+
- Added autotag GitHub Action for automated version tagging on releases.
|
|
10
|
+
- Added PyPI publishing GitHub Action.
|
|
11
|
+
- Added Docker image publishing to GitHub Container Registry (ghcr.io).
|
|
6
12
|
|
|
7
13
|
### Changes
|
|
14
|
+
- Sharded datassert entity-resolution database into 16 DuckDB shards for parallel querying.
|
|
15
|
+
- Renamed dependency from DBssert to DATASSERT throughout.
|
|
16
|
+
- Separated CLI logic into dedicated `cli.py` module.
|
|
17
|
+
- Extracted NLP normalization into dedicated `nlp.py` module for cleaner separation of concerns.
|
|
18
|
+
- Implemented improved parallelization model for graph compilation.
|
|
19
|
+
- Annotated Pydantic model fields with `Field(...)` schema metadata.
|
|
20
|
+
- Renamed `fullmap.version4()` to `fullmap.resolve()` for clarity.
|
|
8
21
|
- Updated `fullmap` ranking to prioritize case-insensitive exact matches between normalized terms and preferred names.
|
|
9
22
|
- Updated `fullmap` term de-duplication to keep first occurrences, improving deterministic output ordering.
|
|
23
|
+
- Moved MkDocs to dev-only dependencies.
|
|
24
|
+
|
|
25
|
+
### Testing
|
|
26
|
+
- Added basic pytest suite covering core models, enums, fullmap, ingests, lib, nlp, and utils.
|
|
27
|
+
|
|
28
|
+
### Maintenance
|
|
29
|
+
- Improved `.gitignore` to exclude common artifacts.
|
|
10
30
|
|
|
11
31
|
## 7.0.2 - 2026-03-23
|
|
12
32
|
|
|
13
33
|
### Changes
|
|
14
34
|
- Updated package metadata for the 7.0.2 release.
|
|
15
|
-
- Added optional `log` and `column_context` controls to `fullmap.
|
|
35
|
+
- Added optional `log` and `column_context` controls to `fullmap.resolve()` for more configurable entity-resolution behavior.
|
|
16
36
|
|
|
17
37
|
### Bug Fixes
|
|
18
38
|
- Reworked entity-resolution querying to register terms directly in DuckDB instead of writing temporary parquet files, removing tempfile lifecycle issues in `fullmap` query execution.
|
|
19
39
|
- Isolated unmatched-entity logging into a dedicated helper and gated it behind an explicit logging flag.
|
|
20
40
|
|
|
21
41
|
### Documentation
|
|
22
|
-
- Updated API reference docs to match the current `
|
|
42
|
+
- Updated API reference docs to match the current `resolve()` function signature and behavior.
|
|
23
43
|
- Corrected QC documentation to reflect the implemented fuzzy/BERT validation pipeline.
|
|
24
44
|
- Fixed documentation path typos for cache/store artifact directories.
|
|
25
45
|
|
|
@@ -45,7 +65,7 @@ All notable changes to this project are documented in this file.
|
|
|
45
65
|
|
|
46
66
|
### Breaking Changes
|
|
47
67
|
- Nix is no longer supported for development and installation. Use UV-based installation instead.
|
|
48
|
-
- Project now requires Python 3.
|
|
68
|
+
- Project now requires Python 3.11+ for compatibility with UV toolchain.
|
|
49
69
|
|
|
50
70
|
### Documentation
|
|
51
71
|
- Completely rewrote installation documentation to reflect UV-based development environment.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use Tablassert, please cite it as below."
|
|
3
|
+
type: software
|
|
4
|
+
title: Tablassert
|
|
5
|
+
version: 7.2.0
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
repository-code: https://github.com/SkyeAv/Tablassert
|
|
8
|
+
abstract: Tablassert is a highly performant declarative knowledge graph backend for bioinformatics that extracts knowledge assertions from tabular data, performs entity resolution and data quality control, and exports NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
|
|
9
|
+
authors:
|
|
10
|
+
- given-names: Skye Lane
|
|
11
|
+
family-names: Goetz
|
|
12
|
+
email: sgoetz@isbscience.org
|
|
13
|
+
affiliation: Institute for Systems Biology; CalPoly SLO
|
|
14
|
+
- given-names: "Gwênlyn"
|
|
15
|
+
family-names: Glusman
|
|
16
|
+
email: gglusman@isbscience.org
|
|
17
|
+
affiliation: Institute for Systems Biology
|
|
18
|
+
- given-names: Jared C.
|
|
19
|
+
family-names: Roach
|
|
20
|
+
affiliation: Institute for Systems Biology
|
|
21
|
+
references:
|
|
22
|
+
- type: article
|
|
23
|
+
title: "MicrobiomeKG: bridging microbiome research and host health through knowledge graphs"
|
|
24
|
+
authors:
|
|
25
|
+
- given-names: Skye Lane
|
|
26
|
+
family-names: Goetz
|
|
27
|
+
- given-names: Alex K.
|
|
28
|
+
family-names: Glen
|
|
29
|
+
- given-names: "Gwênlyn"
|
|
30
|
+
family-names: Glusman
|
|
31
|
+
journal: Frontiers in Systems Biology
|
|
32
|
+
year: 2025
|
|
33
|
+
volume: 5
|
|
34
|
+
doi: "10.3389/fsysb.2025.1544432"
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# Contributing to Tablassert
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to Tablassert! This guide covers everything you need to get started.
|
|
4
|
+
|
|
5
|
+
For full documentation, visit [skyeav.github.io/Tablassert](https://skyeav.github.io/Tablassert/).
|
|
6
|
+
|
|
7
|
+
## Getting Started
|
|
8
|
+
|
|
9
|
+
### Prerequisites
|
|
10
|
+
|
|
11
|
+
- **Python 3.11** or higher
|
|
12
|
+
- **[UV](https://docs.astral.sh/uv/)** package manager
|
|
13
|
+
- **Git**
|
|
14
|
+
|
|
15
|
+
### Setup
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
git clone https://github.com/SkyeAv/Tablassert.git
|
|
19
|
+
cd Tablassert
|
|
20
|
+
uv sync
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Optional Dependency Groups
|
|
24
|
+
|
|
25
|
+
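Some features have historically required optional dependency extras. Note that `AGENTS.md` describes the ML, web, and Excel dependencies as part of the core install as of this release, so confirm that the extras below are still defined in `pyproject.toml` before relying on them: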
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv sync --extra ml # sentence-transformers, onnxruntime, scikit-learn
|
|
29
|
+
uv sync --extra web # playwright
|
|
30
|
+
uv sync --extra pyexcel # pyexcel
|
|
31
|
+
uv sync --extra full # all optional deps
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Development Workflow
|
|
35
|
+
|
|
36
|
+
### Quick Reference
|
|
37
|
+
|
|
38
|
+
| Task | Command |
|
|
39
|
+
|---|---|
|
|
40
|
+
| Run CLI | `uv run tablassert --help` |
|
|
41
|
+
| Lint | `uv run ruff check .` |
|
|
42
|
+
| Lint (fix) | `uv run ruff check --fix .` |
|
|
43
|
+
| Format | `uv run ruff format .` |
|
|
44
|
+
| Format check | `uv run ruff format --check .` |
|
|
45
|
+
| Type check | `uv run pyright` |
|
|
46
|
+
| All checks | `uv run pre-commit run --all-files` |
|
|
47
|
+
| Run all tests | `uv run pytest` |
|
|
48
|
+
| Run single test | `uv run pytest tests/test_foo.py::test_name` |
|
|
49
|
+
| Run by keyword | `uv run pytest -k "test_pattern"` |
|
|
50
|
+
| Build | `uv build` |
|
|
51
|
+
|
|
52
|
+
### Branching
|
|
53
|
+
|
|
54
|
+
1. Fork the repository
|
|
55
|
+
2. Create a branch from `main`:
|
|
56
|
+
```bash
|
|
57
|
+
git checkout -b my-feature
|
|
58
|
+
```
|
|
59
|
+
3. Make your changes
|
|
60
|
+
4. Run all checks before committing:
|
|
61
|
+
```bash
|
|
62
|
+
uv run ruff check --fix . && uv run ruff format . && uv run pyright && uv run pytest
|
|
63
|
+
```
|
|
64
|
+
5. Push and open a pull request
|
|
65
|
+
|
|
66
|
+
### Pre-commit Hooks
|
|
67
|
+
|
|
68
|
+
Pre-commit is configured to run ruff, ruff-format, pyright, and pytest on all Python files. To install the hooks:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
uv run pre-commit install
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Pull Requests
|
|
75
|
+
|
|
76
|
+
- Describe the change and its motivation
|
|
77
|
+
- Link any related issues
|
|
78
|
+
- Ensure all checks pass (ruff, pyright, pytest)
|
|
79
|
+
- Keep PRs focused — one concern per PR is ideal
|
|
80
|
+
- If adding a new feature, include tests
|
|
81
|
+
|
|
82
|
+
## Code Style
|
|
83
|
+
|
|
84
|
+
### Formatting
|
|
85
|
+
|
|
86
|
+
Formatting is enforced by **ruff** with these settings:
|
|
87
|
+
|
|
88
|
+
- Line length: **120**
|
|
89
|
+
- Quote style: **double quotes**
|
|
90
|
+
- Indent: **4 spaces**
|
|
91
|
+
- Target: **Python >=3.11**
|
|
92
|
+
|
|
93
|
+
### Naming
|
|
94
|
+
|
|
95
|
+
| Element | Convention | Example |
|
|
96
|
+
|---|---|---|
|
|
97
|
+
| Functions / variables | `snake_case` | `process_data`, `col_name` |
|
|
98
|
+
| Classes | `PascalCase` | `Tcode`, `TablaBase` |
|
|
99
|
+
| Module constants | `UPPER_CASE` | `STORE`, `TOKEN_SEP` |
|
|
100
|
+
|
|
101
|
+
### Comment Markers
|
|
102
|
+
|
|
103
|
+
Use these prefixes for inline comments:
|
|
104
|
+
|
|
105
|
+
| Marker | Meaning | Example |
|
|
106
|
+
|---|---|---|
|
|
107
|
+
| `# ?` | Description or clarification | `# ? strip whitespace from column names` |
|
|
108
|
+
| `# !` | Warning or important note | `# ! must run before entity resolution` |
|
|
109
|
+
| `# *` | Pipeline stage marker | `# * Stage 2: Entity Resolution` |
|
|
110
|
+
| `# TODO:` | Todo item | `# TODO: add fuzzy matching support` |
|
|
111
|
+
|
|
112
|
+
Do **not** write docstrings on functions. Use a `# ?` comment on the line above instead.
|
|
113
|
+
|
|
114
|
+
### Type Annotations
|
|
115
|
+
|
|
116
|
+
**Every variable** must have a type annotation, including locals:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
col: str = "name"
|
|
120
|
+
df: pl.DataFrame = pl.DataFrame()
|
|
121
|
+
result: Optional[int] = None
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
- Use `Optional[T]` and `Union[...]` (not `T | None` or `X | Y`)
|
|
125
|
+
- Use `Self` for class methods returning the class type
|
|
126
|
+
- Use `Path` (not `str`) for filesystem paths
|
|
127
|
+
- Use `# pyright: ignore` to suppress false positives from lazy-loaded modules
|
|
128
|
+
|
|
129
|
+
### Imports
|
|
130
|
+
|
|
131
|
+
Every file starts with:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from __future__ import annotations
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Heavy dependencies are **lazy-loaded** per module:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from typing import TYPE_CHECKING
|
|
141
|
+
import lazy_loader as Lazy
|
|
142
|
+
|
|
143
|
+
if TYPE_CHECKING:
|
|
144
|
+
import polars as pl
|
|
145
|
+
else:
|
|
146
|
+
pl = Lazy.load("polars")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Lazy-loaded packages: `polars`, `duckdb`, `orjson`, `typer`, `xxhash`, `polars_hash`, `yaml`
|
|
150
|
+
|
|
151
|
+
Import order: standard library → blank line → third-party → blank line → local
|
|
152
|
+
|
|
153
|
+
### Pydantic Models
|
|
154
|
+
|
|
155
|
+
All models inherit from `TablaBase(BaseModel)`:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from tablassert.models import TablaBase
|
|
159
|
+
|
|
160
|
+
class MyModel(TablaBase):
|
|
161
|
+
name: str = Field(...)
|
|
162
|
+
description: Optional[str] = Field(None)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
- Required fields use `Field(...)` (ellipsis sentinel)
|
|
166
|
+
- Optional fields use `Optional[T] = Field(None)`
|
|
167
|
+
- `extra = "forbid"` — no unknown fields allowed
|
|
168
|
+
- `validate_assignment = True` — re-validate on mutation
|
|
169
|
+
|
|
170
|
+
### Enums
|
|
171
|
+
|
|
172
|
+
All enums live in `enums.py` and extend `str, Enum`:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
class Tokens(str, Enum):
|
|
176
|
+
PIPE = "|"
|
|
177
|
+
COMMA = ","
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Error Handling
|
|
181
|
+
|
|
182
|
+
- Use `RuntimeError` for exceptional cases
|
|
183
|
+
- Use `logger.warning()` for non-fatal issues
|
|
184
|
+
- Import logger: `from tablassert.log import logger`
|
|
185
|
+
|
|
186
|
+
## Testing
|
|
187
|
+
|
|
188
|
+
Tests live in the `tests/` directory at the repo root. Test fixtures are in `tests/fixtures/`.
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
# Run all tests
|
|
192
|
+
uv run pytest
|
|
193
|
+
|
|
194
|
+
# Run a specific test
|
|
195
|
+
uv run pytest tests/test_lib.py::test_my_function
|
|
196
|
+
|
|
197
|
+
# Run tests matching a pattern
|
|
198
|
+
uv run pytest -k "encoding"
|
|
199
|
+
|
|
200
|
+
# Run with print output
|
|
201
|
+
uv run pytest -s tests/test_lib.py
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
`conftest.py` provides a `fixtures_path` fixture returning `Path(__file__).parent / "fixtures"`.
|
|
205
|
+
|
|
206
|
+
### Adding Tests
|
|
207
|
+
|
|
208
|
+
- Place test files in `tests/` following the naming convention `test_<module>.py`
|
|
209
|
+
- Use the `fixtures_path` fixture for loading test data
|
|
210
|
+
- Add YAML fixture files to `tests/fixtures/` as needed
|
|
211
|
+
|
|
212
|
+
## AI-Assisted Contributions
|
|
213
|
+
|
|
214
|
+
Tablassert supports AI-assisted development. The repository includes an `AGENTS.md` file in the root that provides detailed guidance for AI coding tools (GitHub Copilot, Cursor, Claude Code, OpenHands, etc.).
|
|
215
|
+
|
|
216
|
+
If you use AI tools to contribute:
|
|
217
|
+
|
|
218
|
+
- Review all generated code before submitting
|
|
219
|
+
- Ensure it follows the conventions described above and in `AGENTS.md`
|
|
220
|
+
- Run all checks (`ruff`, `pyright`, `pytest`) — AI-generated code often needs style adjustments
|
|
221
|
+
- The conventions in this file and `AGENTS.md` help AI tools produce idiomatic Tablassert code
|
|
222
|
+
|
|
223
|
+
## Reporting Issues
|
|
224
|
+
|
|
225
|
+
- **Bug reports** and **feature requests**: open an issue at [github.com/SkyeAv/Tablassert/issues](https://github.com/SkyeAv/Tablassert/issues)
|
|
226
|
+
- Please include reproduction steps for bugs and a clear description for feature requests
|
|
227
|
+
|
|
228
|
+
## License
|
|
229
|
+
|
|
230
|
+
By contributing to Tablassert, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE).
|
|
231
|
+
|
|
232
|
+
## Code of Conduct
|
|
233
|
+
|
|
234
|
+
### Our Pledge
|
|
235
|
+
|
|
236
|
+
We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
|
|
237
|
+
|
|
238
|
+
We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
|
|
239
|
+
|
|
240
|
+
### Our Standards
|
|
241
|
+
|
|
242
|
+
Examples of behavior that contributes to a positive environment:
|
|
243
|
+
|
|
244
|
+
- Demonstrating empathy and kindness toward other people
|
|
245
|
+
- Being respectful of differing opinions, viewpoints, and experiences
|
|
246
|
+
- Giving and gracefully accepting constructive feedback
|
|
247
|
+
- Accepting responsibility and apologizing to those affected by mistakes
|
|
248
|
+
- Focusing on what is best not just for us as individuals, but for the overall community
|
|
249
|
+
|
|
250
|
+
Examples of unacceptable behavior:
|
|
251
|
+
|
|
252
|
+
- The use of sexualized language or imagery, and sexual attention or advances
|
|
253
|
+
- Trolling, insulting or derogatory comments, and personal or political attacks
|
|
254
|
+
- Public or private harassment
|
|
255
|
+
- Publishing others' private information without explicit permission
|
|
256
|
+
- Other conduct which could reasonably be considered inappropriate
|
|
257
|
+
|
|
258
|
+
### Enforcement
|
|
259
|
+
|
|
260
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the project maintainer at [sgoetz@isbscience.org](mailto:sgoetz@isbscience.org). All complaints will be reviewed and investigated fairly.
|
|
261
|
+
|
|
262
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions.
|
|
263
|
+
|
|
264
|
+
### Attribution
|
|
265
|
+
|
|
266
|
+
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1.
|