tablassert 7.0.2__tar.gz → 7.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. tablassert-7.2.0/.github/workflows/autotag.yml +24 -0
  2. tablassert-7.2.0/.github/workflows/docker.yml +27 -0
  3. {tablassert-7.0.2 → tablassert-7.2.0}/.github/workflows/pipy.yml +3 -3
  4. {tablassert-7.0.2 → tablassert-7.2.0}/.gitignore +3 -3
  5. tablassert-7.2.0/.pre-commit-config.yaml +21 -0
  6. tablassert-7.2.0/AGENTS.md +170 -0
  7. {tablassert-7.0.2 → tablassert-7.2.0}/CHANGELOG.md +29 -3
  8. tablassert-7.2.0/CITATION.cff +34 -0
  9. tablassert-7.2.0/CONTRIBUTING.md +266 -0
  10. tablassert-7.2.0/Dockerfile +6 -0
  11. tablassert-7.2.0/PKG-INFO +113 -0
  12. tablassert-7.2.0/README.md +66 -0
  13. {tablassert-7.0.2 → tablassert-7.2.0}/docs/api/fullmap.md +75 -34
  14. {tablassert-7.0.2 → tablassert-7.2.0}/docs/api/qc.md +7 -12
  15. {tablassert-7.0.2 → tablassert-7.2.0}/docs/cli.md +31 -3
  16. {tablassert-7.0.2 → tablassert-7.2.0}/docs/configuration/advanced-example.md +164 -0
  17. {tablassert-7.0.2 → tablassert-7.2.0}/docs/configuration/graph.md +8 -8
  18. {tablassert-7.0.2 → tablassert-7.2.0}/docs/configuration/table.md +2 -2
  19. tablassert-7.2.0/docs/datassert.md +66 -0
  20. tablassert-7.2.0/docs/docker.md +90 -0
  21. {tablassert-7.0.2 → tablassert-7.2.0}/docs/examples/tutorial-graph.yaml +1 -1
  22. tablassert-7.2.0/docs/examples.md +352 -0
  23. {tablassert-7.0.2 → tablassert-7.2.0}/docs/index.md +7 -7
  24. {tablassert-7.0.2 → tablassert-7.2.0}/docs/installation.md +41 -13
  25. {tablassert-7.0.2 → tablassert-7.2.0}/docs/tutorial.md +3 -3
  26. tablassert-7.2.0/llms.txt +48 -0
  27. {tablassert-7.0.2 → tablassert-7.2.0}/mkdocs.yml +3 -0
  28. {tablassert-7.0.2 → tablassert-7.2.0}/pyproject.toml +27 -9
  29. tablassert-7.2.0/src/tablassert/cli.py +127 -0
  30. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/downloader.py +9 -2
  31. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/enums.py +2 -0
  32. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/fullmap.py +87 -32
  33. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/ingests.py +9 -2
  34. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/lib.py +26 -135
  35. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/log.py +2 -0
  36. tablassert-7.2.0/src/tablassert/models.py +216 -0
  37. tablassert-7.2.0/src/tablassert/nlp.py +28 -0
  38. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/qc.py +44 -41
  39. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/utils.py +11 -10
  40. tablassert-7.2.0/tests/__init__.py +0 -0
  41. tablassert-7.2.0/tests/conftest.py +10 -0
  42. tablassert-7.2.0/tests/fixtures/invalid_section_missing_source.yaml +15 -0
  43. tablassert-7.2.0/tests/fixtures/minimal_section.yaml +20 -0
  44. tablassert-7.2.0/tests/fixtures/minimal_section_with_sections.yaml +41 -0
  45. tablassert-7.2.0/tests/test_enums.py +179 -0
  46. tablassert-7.2.0/tests/test_fullmap.py +59 -0
  47. tablassert-7.2.0/tests/test_ingests.py +111 -0
  48. tablassert-7.2.0/tests/test_lib.py +118 -0
  49. tablassert-7.2.0/tests/test_models.py +265 -0
  50. tablassert-7.2.0/tests/test_nlp.py +64 -0
  51. tablassert-7.2.0/tests/test_utils.py +80 -0
  52. {tablassert-7.0.2 → tablassert-7.2.0}/uv.lock +936 -317
  53. tablassert-7.0.2/.pre-commit-config.yaml +0 -15
  54. tablassert-7.0.2/.python-version +0 -1
  55. tablassert-7.0.2/.vscode/settings.json +0 -23
  56. tablassert-7.0.2/PKG-INFO +0 -165
  57. tablassert-7.0.2/README.md +0 -128
  58. tablassert-7.0.2/src/tablassert/models.py +0 -132
  59. {tablassert-7.0.2 → tablassert-7.2.0}/.github/workflows/docs.yml +0 -0
  60. {tablassert-7.0.2 → tablassert-7.2.0}/LICENSE +0 -0
  61. {tablassert-7.0.2 → tablassert-7.2.0}/docs/api/utils.md +0 -0
  62. {tablassert-7.0.2 → tablassert-7.2.0}/docs/examples/tutorial-data.csv +0 -0
  63. {tablassert-7.0.2 → tablassert-7.2.0}/docs/examples/tutorial-table.yaml +0 -0
  64. {tablassert-7.0.2 → tablassert-7.2.0}/src/tablassert/__init__.py +0 -0
@@ -0,0 +1,24 @@
1
+ name: Auto Tag Versions
2
+ on:
3
+ workflow_run:
4
+ workflows:
5
+ - "Deploy to PyPI"
6
+ types:
7
+ - completed
8
+ branches:
9
+ - main
10
+ permissions:
11
+ contents: write
12
+ jobs:
13
+ autotag:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 0
19
+ - name: Autotag using pyproject.toml
20
+ uses: butlerlogic/action-autotag@1.0.1
21
+ with:
22
+ GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
23
+ tag_prefix: "v"
24
+ strategy: python
@@ -0,0 +1,27 @@
1
+ name: Publish Docker Image
2
+ on:
3
+ push:
4
+ tags:
5
+ - "v*"
6
+ jobs:
7
+ publish:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ contents: read
11
+ packages: write
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: docker/setup-buildx-action@v3
15
+ - uses: docker/login-action@v3
16
+ with:
17
+ registry: ghcr.io
18
+ username: ${{ github.actor }}
19
+ password: ${{ secrets.GITHUB_TOKEN }}
20
+ - uses: docker/build-push-action@v6
21
+ with:
22
+ context: .
23
+ file: ./Dockerfile
24
+ push: true
25
+ tags: |
26
+ ghcr.io/${{ github.repository_owner }}/tablassert:latest
27
+ ghcr.io/${{ github.repository_owner }}/tablassert:${{ github.ref_name }}
@@ -1,8 +1,8 @@
1
1
  name: Deploy to PyPI
2
2
  on:
3
3
  push:
4
- branches: [main]
5
- paths: [pyproject.toml]
4
+ branches:
5
+ - main
6
6
  jobs:
7
7
  publish:
8
8
  runs-on: ubuntu-latest
@@ -16,4 +16,4 @@ jobs:
16
16
  - name: Build package
17
17
  run: uv build
18
18
  - name: Publish to PyPI
19
- uses: pypa/gh-action-pypi-publish@release/v1
19
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -1,17 +1,17 @@
1
1
  *.egg-info
2
+ *.python-version
2
3
  *__pycache__/
3
4
  *.logs/
4
5
  *.opencode/
5
6
  *.ruff_cache/
6
7
  *.pytest_cache/
7
8
  *plans/
8
- *CLAUDE.md
9
- *.claude/
10
- *.cachassert/
9
+ *specs/
11
10
  *.storassert/
12
11
  *.logassert/
13
12
  *DATALAKE/
14
13
  *.onnxassert/
14
+ *.claude/
15
15
  *.envrc
16
16
  *venv/
17
17
  *.log
@@ -0,0 +1,21 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.9.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: local
9
+ hooks:
10
+ - id: pyright
11
+ name: pyright
12
+ entry: uv run pyright
13
+ language: system
14
+ types: [python]
15
+ pass_filenames: false
16
+ - id: pytest
17
+ name: pytest
18
+ entry: uv run pytest
19
+ language: system
20
+ types: [python]
21
+ pass_filenames: false
@@ -0,0 +1,170 @@
1
+ # AGENTS.md — Tablassert
2
+
3
+ Guidance for AI coding agents working in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Tablassert is a Python package (>=3.11) for tabular data assertion, normalization, and quality control. It builds declarative knowledge graphs from tabular data, exporting NCATS Translator-compliant KGX NDJSON. Uses **Polars** DataFrames, **DuckDB** for entity resolution, and **ONNX/BioBERT** for quality control. CLI built with **Typer**. Models built with **Pydantic v2**.
8
+
9
+ ## Quick Reference
10
+
11
+ | Task | Command |
12
+ |---|---|
13
+ | Install | `uv sync` |
14
+ | Run CLI | `uv run tablassert` |
15
+ | Lint | `uv run ruff check .` |
16
+ | Lint (fix) | `uv run ruff check --fix .` |
17
+ | Format | `uv run ruff format .` |
18
+ | Format check | `uv run ruff format --check .` |
19
+ | Type check | `uv run pyright` |
20
+ | All checks | `uv run pre-commit run --all-files` |
21
+ | Run all tests | `uv run pytest` |
22
+ | Run single test | `uv run pytest tests/test_foo.py::test_name` |
23
+ | Run by keyword | `uv run pytest -k "test_pattern"` |
24
+ | Run with print | `uv run pytest -s tests/test_foo.py` |
25
+ | Build | `uv build` |
26
+ | Build docs | `uv run --group dev mkdocs build` |
27
+ | Add dependency | `uv add <package>` |
28
+ | Add dev dependency | `uv add --group dev <package>` |
29
+
30
+ ## Repository Structure
31
+
32
+ ```
33
+ src/tablassert/
34
+ cli.py # Typer CLI (entry point: tablassert.cli:CLI)
35
+ lib.py # Core logic: encodings, data loading, Tcode(Section) class
36
+ models.py # Pydantic v2 models (TablaBase base class)
37
+ enums.py # str, Enum subclasses (Tokens, Repositories, Comparisons, etc.)
38
+ fullmap.py # NER / entity resolution (DuckDB, 16 shards)
39
+ qc.py # Quality control (ONNX/BioBERT, sentence_transformers)
40
+ nlp.py # Text normalization (level_one: strip+lowercase, level_two: regex)
41
+ ingests.py # YAML ingestion: from_yaml(), to_sections(), fastmerge()
42
+ downloader.py # Playwright-based file downloads with retries
43
+ utils.py # Hashing (xxhash), STORE path, namespace UUIDs
44
+ log.py # loguru logger → .logassert/logassert.log
45
+ __init__.py # Empty file (lazy loading is per-module, not here)
46
+ docs/ # MkDocs documentation source
47
+ mkdocs.yml # MkDocs configuration
48
+ pyproject.toml # Project config, dependencies, tool settings
49
+ tests/ # Test directory (at repo root)
50
+ ```
51
+
52
+ - `conftest.py` provides a `fixtures_path` fixture returning `Path(__file__).parent / "fixtures"`.
53
+ - pytest configured via `pyproject.toml` `[tool.pytest.ini_options]` with `testpaths = ["tests"]`.
54
+ - Test fixtures: `tests/fixtures/` contains YAML files for Section model tests.
55
+ - Test modules: `test_enums.py`, `test_fullmap.py`, `test_ingests.py`, `test_lib.py`, `test_models.py`, `test_nlp.py`, `test_utils.py`.
56
+
57
+ ## Code Style
58
+
59
+ ### Imports
60
+
61
+ - Every file starts with `from __future__ import annotations`
62
+ - Heavy dependencies are loaded **lazily per-module** using this pattern:
63
+ ```python
64
+ from typing import TYPE_CHECKING
65
+ import lazy_loader as Lazy
66
+
67
+ if TYPE_CHECKING:
68
+ import polars as pl
69
+ else:
70
+ pl = Lazy.load("polars")
71
+ ```
72
+ - Lazy-loaded deps: `polars`, `duckdb`, `orjson`, `typer`, `xxhash`, `polars_hash`, `yaml`
73
+ - Direct (non-lazy) heavy deps: `sqlite_utils`, `rapidfuzz`, `pydantic`, `loguru`, `yaml.CLoader`
74
+ - Previously-optional deps now in core: `sentence_transformers`, `onnxruntime`, `sklearn`, `playwright`, `pyexcel` — lazy-loaded when present
75
+ - Some modules mix direct and lazy imports for the same package (e.g., `ingests.py` does `from yaml import CLoader` directly, then lazy-loads `yaml` for `yaml.load()`)
76
+ - Import order: standard library → blank line → third-party → blank line → local
77
+ - Use `from __future__ import annotations` to enable deferred evaluation
78
+
79
+ ### Type Annotations
80
+
81
+ - **Every variable** gets a type annotation, including locals: `col: str = "name"`, `df: pl.DataFrame = ...`
82
+ - Use `Optional[T]` and `Union[...]` (not `T | None` or `X | Y`)
83
+ - Use `Self` for class methods returning the class type
84
+ - Use `Path` (not `str`) for filesystem paths
85
+ - Use `# pyright: ignore` comments to suppress false positives from lazy-loaded modules
86
+
87
+ ### Pydantic Models
88
+
89
+ - All models inherit from `TablaBase(BaseModel)` which sets:
90
+ ```python
91
+ model_config: ConfigDict = ConfigDict( # pyright: ignore
92
+ str_strip_whitespace=False,
93
+ validate_assignment=True,
94
+ use_enum_values=True,
95
+ extra="forbid",
96
+ populate_by_name=True,
97
+ )
98
+ ```
99
+ - Required fields: `Field(...)` (ellipsis sentinel)
100
+ - Optional fields: `Optional[T] = Field(None)`
101
+ - All enums are `str, Enum` subclasses (defined in `enums.py`)
102
+
103
+ ### Enums
104
+
105
+ All enums live in `enums.py` and extend `str, Enum`. Key enums: `Tokens`, `Repositories`, `Contributions`, `Comparisons`, `Functions`, `Files`, `EncodingMethods`, `FillMethods`, `Syntaxes`, `Statuses`, `Categories`, `Predicates`, `Qualifiers`.
106
+
107
+ ### Naming
108
+
109
+ - Functions/variables: `snake_case`
110
+ - Classes: `PascalCase`
111
+ - Module-level constants: `UPPER_CASE`
112
+
113
+ ### Comments
114
+
115
+ - `# ?` — descriptions / clarifications
116
+ - `# !` — warnings / important notes
117
+ - `# *` — stage markers (pipeline steps)
118
+ - `# TODO:` — todos
119
+ - No docstrings on functions; use `# ?` comment on the line above instead
120
+
121
+ ### Formatting (enforced by ruff)
122
+
123
+ - Line length: **120**
124
+ - Quote style: **double quotes**
125
+ - Indent: **4 spaces**
126
+ - `skip-magic-trailing-comma = true`
127
+ - Target: Python >=3.11
128
+
129
+ ### Error Handling
130
+
131
+ - Use `RuntimeError` for exceptional cases (no custom exception classes currently)
132
+ - Use `logger.warning()` for non-fatal issues (e.g., empty subgraphs)
133
+ - Logger: `from tablassert.log import logger`
134
+
135
+ ### Other Conventions
136
+
137
+ - `operator.add` for Polars string concatenation on columns (not `+` directly)
138
+ - CLI entry point: `tablassert.cli:CLI` (Typer app with `pretty_exceptions_show_locals=False`)
139
+ - Use `rich.progress` for progress tracking in CLI
140
+ - Data side-effects stored in hidden directories: `.logassert/`, `.storassert/`, `.onnxassert/`
141
+
142
+ ## Tools
143
+
144
+ - **ruff** — linting (`ruff check`) and formatting (`ruff format`)
145
+ - **pyright** — type checking (no pyrightconfig.json; uses defaults)
146
+ - **pre-commit** — runs ruff fix, ruff-format, pyright, and pytest on all Python files
147
+ - **pytest** — testing (>=9.0.2)
148
+ - **uv** — package manager (use `uv run` for all commands, `uv add` for deps)
149
+ - **hatchling** — build backend
150
+
151
+ ## Optional Dependency Groups
152
+
153
+ Defined in `pyproject.toml` `[project.optional-dependencies]`:
154
+ - `rtcompat` — `polars[rtcompat]` (runtime-compatible Polars build for older CPUs that lack the instruction-set extensions the default Polars build requires)
155
+ - `rt` — alias for `rtcompat`
156
+
157
+ All other dependencies (ML, web, Excel) are now in core `dependencies`.
158
+
159
+ Install with: `uv sync` or `pip install tablassert`
160
+
161
+ ## CI Workflows
162
+
163
+ - **PyPI publish** (`.github/workflows/pipy.yml`): builds and publishes on push to `main`
164
+ - **MkDocs deploy** (`.github/workflows/docs.yml`): builds docs and deploys to GitHub Pages on push to `main`
165
+ - **Docker publish** (`.github/workflows/docker.yml`): builds and pushes image to GHCR on tag push (`v*`)
166
+ - **Autotag** (`.github/workflows/autotag.yml`): creates a `v`-prefixed version tag after the "Deploy to PyPI" workflow completes on `main`
167
+
168
+ ## Key Dependencies
169
+
170
+ polars, duckdb, orjson, pydantic, typer, xxhash, loguru, rapidfuzz, scikit-learn, sqlite-utils, pyyaml, lazy-loader, polars-hash, fastexcel, pyarrow, optimum-onnx
@@ -2,18 +2,44 @@
2
2
 
3
3
  All notable changes to this project are documented in this file.
4
4
 
5
+ ## 7.2.0 - 2026-03-31
6
+
7
+ ### New Features
8
+ - Added `tablassert version` command to display current package version.
9
+ - Added autotag GitHub Action for automated version tagging on releases.
10
+ - Updated the PyPI publishing GitHub Action to run on every push to `main` (previously only when `pyproject.toml` changed).
11
+ - Added Docker image publishing to GitHub Container Registry (ghcr.io).
12
+
13
+ ### Changes
14
+ - Sharded datassert entity-resolution database into 16 DuckDB shards for parallel querying.
15
+ - Renamed dependency from DBssert to DATASSERT throughout.
16
+ - Separated CLI logic into dedicated `cli.py` module.
17
+ - Extracted NLP normalization into dedicated `nlp.py` module for cleaner separation of concerns.
18
+ - Implemented improved parallelization model for graph compilation.
19
+ - Annotated Pydantic model fields with `Field(...)` schema metadata.
20
+ - Renamed `fullmap.version4()` to `fullmap.resolve()` for clarity.
21
+ - Updated `fullmap` ranking to prioritize case-insensitive exact matches between normalized terms and preferred names.
22
+ - Updated `fullmap` term de-duplication to keep first occurrences, improving deterministic output ordering.
23
+ - Moved MkDocs to dev-only dependencies.
24
+
25
+ ### Testing
26
+ - Added basic pytest suite covering core models, enums, fullmap, ingests, lib, nlp, and utils.
27
+
28
+ ### Maintenance
29
+ - Improved `.gitignore` to exclude common artifacts.
30
+
5
31
  ## 7.0.2 - 2026-03-23
6
32
 
7
33
  ### Changes
8
34
  - Updated package metadata for the 7.0.2 release.
9
- - Added optional `log` and `column_context` controls to `fullmap.version4()` for more configurable entity-resolution behavior.
35
+ - Added optional `log` and `column_context` controls to `fullmap.resolve()` for more configurable entity-resolution behavior.
10
36
 
11
37
  ### Bug Fixes
12
38
  - Reworked entity-resolution querying to register terms directly in DuckDB instead of writing temporary parquet files, removing tempfile lifecycle issues in `fullmap` query execution.
13
39
  - Isolated unmatched-entity logging into a dedicated helper and gated it behind an explicit logging flag.
14
40
 
15
41
  ### Documentation
16
- - Updated API reference docs to match the current `version4()` function signature and behavior.
42
+ - Updated API reference docs to match the current `resolve()` function signature and behavior.
17
43
  - Corrected QC documentation to reflect the implemented fuzzy/BERT validation pipeline.
18
44
  - Fixed documentation path typos for cache/store artifact directories.
19
45
 
@@ -39,7 +65,7 @@ All notable changes to this project are documented in this file.
39
65
 
40
66
  ### Breaking Changes
41
67
  - Nix is no longer supported for development and installation. Use UV-based installation instead.
42
- - Project now requires Python 3.13+ for compatibility with UV toolchain.
68
+ - Project now requires Python 3.11+ for compatibility with UV toolchain.
43
69
 
44
70
  ### Documentation
45
71
  - Completely rewrote installation documentation to reflect UV-based development environment.
@@ -0,0 +1,34 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use Tablassert, please cite it as below."
3
+ type: software
4
+ title: Tablassert
5
+ version: 7.2.0
6
+ license: Apache-2.0
7
+ repository-code: https://github.com/SkyeAv/Tablassert
8
+ abstract: Tablassert is a highly performant declarative knowledge graph backend for bioinformatics that extracts knowledge assertions from tabular data, performs entity resolution and data quality control, and exports NCATS Translator-compliant Knowledge Graph Exchange (KGX) NDJSON.
9
+ authors:
10
+ - given-names: Skye Lane
11
+ family-names: Goetz
12
+ email: sgoetz@isbscience.org
13
+ affiliation: Institute for Systems Biology; CalPoly SLO
14
+ - given-names: "Gwênlyn"
15
+ family-names: Glusman
16
+ email: gglusman@isbscience.org
17
+ affiliation: Institute for Systems Biology
18
+ - given-names: Jared C.
19
+ family-names: Roach
20
+ affiliation: Institute for Systems Biology
21
+ references:
22
+ - type: article
23
+ title: "MicrobiomeKG: bridging microbiome research and host health through knowledge graphs"
24
+ authors:
25
+ - given-names: Skye Lane
26
+ family-names: Goetz
27
+ - given-names: Alex K.
28
+ family-names: Glen
29
+ - given-names: "Gwênlyn"
30
+ family-names: Glusman
31
+ journal: Frontiers in Systems Biology
32
+ year: 2025
33
+ volume: 5
34
+ doi: "10.3389/fsysb.2025.1544432"
@@ -0,0 +1,266 @@
1
+ # Contributing to Tablassert
2
+
3
+ Thank you for your interest in contributing to Tablassert! This guide covers everything you need to get started.
4
+
5
+ For full documentation, visit [skyeav.github.io/Tablassert](https://skyeav.github.io/Tablassert/).
6
+
7
+ ## Getting Started
8
+
9
+ ### Prerequisites
10
+
11
+ - **Python 3.11** or higher
12
+ - **[UV](https://docs.astral.sh/uv/)** package manager
13
+ - **Git**
14
+
15
+ ### Setup
16
+
17
+ ```bash
18
+ git clone https://github.com/SkyeAv/Tablassert.git
19
+ cd Tablassert
20
+ uv sync
21
+ ```
22
+
23
+ ### Optional Dependency Groups
24
+
25
+ Some features require optional dependencies:
26
+
27
+ ```bash
28
+ uv sync --extra ml # sentence-transformers, onnxruntime, scikit-learn
29
+ uv sync --extra web # playwright
30
+ uv sync --extra pyexcel # pyexcel
31
+ uv sync --extra full # all optional deps
32
+ ```
33
+
34
+ ## Development Workflow
35
+
36
+ ### Quick Reference
37
+
38
+ | Task | Command |
39
+ |---|---|
40
+ | Run CLI | `uv run tablassert --help` |
41
+ | Lint | `uv run ruff check .` |
42
+ | Lint (fix) | `uv run ruff check --fix .` |
43
+ | Format | `uv run ruff format .` |
44
+ | Format check | `uv run ruff format --check .` |
45
+ | Type check | `uv run pyright` |
46
+ | All checks | `uv run pre-commit run --all-files` |
47
+ | Run all tests | `uv run pytest` |
48
+ | Run single test | `uv run pytest tests/test_foo.py::test_name` |
49
+ | Run by keyword | `uv run pytest -k "test_pattern"` |
50
+ | Build | `uv build` |
51
+
52
+ ### Branching
53
+
54
+ 1. Fork the repository
55
+ 2. Create a branch from `main`:
56
+ ```bash
57
+ git checkout -b my-feature
58
+ ```
59
+ 3. Make your changes
60
+ 4. Run all checks before committing:
61
+ ```bash
62
+ uv run ruff check --fix . && uv run ruff format . && uv run pyright && uv run pytest
63
+ ```
64
+ 5. Push and open a pull request
65
+
66
+ ### Pre-commit Hooks
67
+
68
+ Pre-commit is configured to run ruff, ruff-format, pyright, and pytest on all Python files. To install the hooks:
69
+
70
+ ```bash
71
+ uv run pre-commit install
72
+ ```
73
+
74
+ ## Pull Requests
75
+
76
+ - Describe the change and its motivation
77
+ - Link any related issues
78
+ - Ensure all checks pass (ruff, pyright, pytest)
79
+ - Keep PRs focused — one concern per PR is ideal
80
+ - If adding a new feature, include tests
81
+
82
+ ## Code Style
83
+
84
+ ### Formatting
85
+
86
+ Formatting is enforced by **ruff** with these settings:
87
+
88
+ - Line length: **120**
89
+ - Quote style: **double quotes**
90
+ - Indent: **4 spaces**
91
+ - Target: **Python >=3.11**
92
+
93
+ ### Naming
94
+
95
+ | Element | Convention | Example |
96
+ |---|---|---|
97
+ | Functions / variables | `snake_case` | `process_data`, `col_name` |
98
+ | Classes | `PascalCase` | `Tcode`, `TablaBase` |
99
+ | Module constants | `UPPER_CASE` | `STORE`, `TOKEN_SEP` |
100
+
101
+ ### Comment Markers
102
+
103
+ Use these prefixes for inline comments:
104
+
105
+ | Marker | Meaning | Example |
106
+ |---|---|---|
107
+ | `# ?` | Description or clarification | `# ? strip whitespace from column names` |
108
+ | `# !` | Warning or important note | `# ! must run before entity resolution` |
109
+ | `# *` | Pipeline stage marker | `# * Stage 2: Entity Resolution` |
110
+ | `# TODO:` | Todo item | `# TODO: add fuzzy matching support` |
111
+
112
+ Do **not** write docstrings on functions. Use a `# ?` comment on the line above instead.
113
+
114
+ ### Type Annotations
115
+
116
+ **Every variable** must have a type annotation, including locals:
117
+
118
+ ```python
119
+ col: str = "name"
120
+ df: pl.DataFrame = pl.DataFrame()
121
+ result: Optional[int] = None
122
+ ```
123
+
124
+ - Use `Optional[T]` and `Union[...]` (not `T | None` or `X | Y`)
125
+ - Use `Self` for class methods returning the class type
126
+ - Use `Path` (not `str`) for filesystem paths
127
+ - Use `# pyright: ignore` to suppress false positives from lazy-loaded modules
128
+
129
+ ### Imports
130
+
131
+ Every file starts with:
132
+
133
+ ```python
134
+ from __future__ import annotations
135
+ ```
136
+
137
+ Heavy dependencies are **lazy-loaded** per module:
138
+
139
+ ```python
140
+ from typing import TYPE_CHECKING
141
+ import lazy_loader as Lazy
142
+
143
+ if TYPE_CHECKING:
144
+ import polars as pl
145
+ else:
146
+ pl = Lazy.load("polars")
147
+ ```
148
+
149
+ Lazy-loaded packages: `polars`, `duckdb`, `orjson`, `typer`, `xxhash`, `polars_hash`, `yaml`
150
+
151
+ Import order: standard library → blank line → third-party → blank line → local
152
+
153
+ ### Pydantic Models
154
+
155
+ All models inherit from `TablaBase(BaseModel)`:
156
+
157
+ ```python
158
+ from tablassert.models import TablaBase
159
+
160
+ class MyModel(TablaBase):
161
+ name: str = Field(...)
162
+ description: Optional[str] = Field(None)
163
+ ```
164
+
165
+ - Required fields use `Field(...)` (ellipsis sentinel)
166
+ - Optional fields use `Optional[T] = Field(None)`
167
+ - `extra = "forbid"` — no unknown fields allowed
168
+ - `validate_assignment = True` — re-validate on mutation
169
+
170
+ ### Enums
171
+
172
+ All enums live in `enums.py` and extend `str, Enum`:
173
+
174
+ ```python
175
+ class Tokens(str, Enum):
176
+ PIPE = "|"
177
+ COMMA = ","
178
+ ```
179
+
180
+ ### Error Handling
181
+
182
+ - Use `RuntimeError` for exceptional cases
183
+ - Use `logger.warning()` for non-fatal issues
184
+ - Import logger: `from tablassert.log import logger`
185
+
186
+ ## Testing
187
+
188
+ Tests live in the `tests/` directory at the repo root. Test fixtures are in `tests/fixtures/`.
189
+
190
+ ```bash
191
+ # Run all tests
192
+ uv run pytest
193
+
194
+ # Run a specific test
195
+ uv run pytest tests/test_lib.py::test_my_function
196
+
197
+ # Run tests matching a pattern
198
+ uv run pytest -k "encoding"
199
+
200
+ # Run with print output
201
+ uv run pytest -s tests/test_lib.py
202
+ ```
203
+
204
+ `conftest.py` provides a `fixtures_path` fixture returning `Path(__file__).parent / "fixtures"`.
205
+
206
+ ### Adding Tests
207
+
208
+ - Place test files in `tests/` following the naming convention `test_<module>.py`
209
+ - Use the `fixtures_path` fixture for loading test data
210
+ - Add YAML fixture files to `tests/fixtures/` as needed
211
+
212
+ ## AI-Assisted Contributions
213
+
214
+ Tablassert supports AI-assisted development. The repository includes an `AGENTS.md` file in the root that provides detailed guidance for AI coding tools (GitHub Copilot, Cursor, Claude Code, OpenHands, etc.).
215
+
216
+ If you use AI tools to contribute:
217
+
218
+ - Review all generated code before submitting
219
+ - Ensure it follows the conventions described above and in `AGENTS.md`
220
+ - Run all checks (`ruff`, `pyright`, `pytest`) — AI-generated code often needs style adjustments
221
+ - The conventions in this file and `AGENTS.md` help AI tools produce idiomatic Tablassert code
222
+
223
+ ## Reporting Issues
224
+
225
+ - **Bug reports** and **feature requests**: open an issue at [github.com/SkyeAv/Tablassert/issues](https://github.com/SkyeAv/Tablassert/issues)
226
+ - Please include reproduction steps for bugs and a clear description for feature requests
227
+
228
+ ## License
229
+
230
+ By contributing to Tablassert, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE).
231
+
232
+ ## Code of Conduct
233
+
234
+ ### Our Pledge
235
+
236
+ We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
237
+
238
+ We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
239
+
240
+ ### Our Standards
241
+
242
+ Examples of behavior that contributes to a positive environment:
243
+
244
+ - Demonstrating empathy and kindness toward other people
245
+ - Being respectful of differing opinions, viewpoints, and experiences
246
+ - Giving and gracefully accepting constructive feedback
247
+ - Accepting responsibility and apologizing to those affected by mistakes
248
+ - Focusing on what is best not just for us as individuals, but for the overall community
249
+
250
+ Examples of unacceptable behavior:
251
+
252
+ - The use of sexualized language or imagery, and sexual attention or advances
253
+ - Trolling, insulting or derogatory comments, and personal or political attacks
254
+ - Public or private harassment
255
+ - Publishing others' private information without explicit permission
256
+ - Other conduct which could reasonably be considered inappropriate
257
+
258
+ ### Enforcement
259
+
260
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the project maintainer at [sgoetz@isbscience.org](mailto:sgoetz@isbscience.org). All complaints will be reviewed and investigated fairly.
261
+
262
+ Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions.
263
+
264
+ ### Attribution
265
+
266
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1.
@@ -0,0 +1,6 @@
1
+ FROM python:3.14-slim
2
+
3
+ RUN pip install --no-cache-dir "tablassert[full]"
4
+
5
+ ENTRYPOINT ["tablassert"]
6
+ CMD ["--help"]