confident-extract 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- confident_extract-0.1.0a1/.github/ISSUE_TEMPLATE/config.yml +5 -0
- confident_extract-0.1.0a1/.github/workflows/ci.yml +43 -0
- confident_extract-0.1.0a1/.github/workflows/publish.yml +33 -0
- confident_extract-0.1.0a1/.gitignore +9 -0
- confident_extract-0.1.0a1/.pre-commit-config.yaml +22 -0
- confident_extract-0.1.0a1/AGENTS.md +40 -0
- confident_extract-0.1.0a1/CHANGELOG.md +23 -0
- confident_extract-0.1.0a1/CONTRIBUTING.md +73 -0
- confident_extract-0.1.0a1/LICENSE +22 -0
- confident_extract-0.1.0a1/PKG-INFO +294 -0
- confident_extract-0.1.0a1/README.md +245 -0
- confident_extract-0.1.0a1/benchmarks/__init__.py +1 -0
- confident_extract-0.1.0a1/benchmarks/test_extract_benchmarks.py +270 -0
- confident_extract-0.1.0a1/confident_extract/__init__.py +23 -0
- confident_extract-0.1.0a1/confident_extract/confidence/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/core/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/core/extractor.py +60 -0
- confident_extract-0.1.0a1/confident_extract/core/preprocessor.py +122 -0
- confident_extract-0.1.0a1/confident_extract/core/result.py +31 -0
- confident_extract-0.1.0a1/confident_extract/providers/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/py.typed +1 -0
- confident_extract-0.1.0a1/confident_extract/repair/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/repair/engine.py +108 -0
- confident_extract-0.1.0a1/confident_extract/repair/strategies.py +369 -0
- confident_extract-0.1.0a1/confident_extract/retry/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/validators/__init__.py +1 -0
- confident_extract-0.1.0a1/confident_extract/validators/msgspec_adapter.py +165 -0
- confident_extract-0.1.0a1/pyproject.toml +110 -0
- confident_extract-0.1.0a1/tests/__init__.py +1 -0
- confident_extract-0.1.0a1/tests/fixtures/__init__.py +1 -0
- confident_extract-0.1.0a1/tests/integration/__init__.py +1 -0
- confident_extract-0.1.0a1/tests/unit/__init__.py +1 -0
- confident_extract-0.1.0a1/tests/unit/test_extractor.py +185 -0
- confident_extract-0.1.0a1/tests/unit/test_msgspec_adapter.py +286 -0
- confident_extract-0.1.0a1/tests/unit/test_package.py +30 -0
- confident_extract-0.1.0a1/tests/unit/test_preprocessor.py +140 -0
- confident_extract-0.1.0a1/tests/unit/test_public_api.py +121 -0
- confident_extract-0.1.0a1/tests/unit/test_repair_engine.py +205 -0
- confident_extract-0.1.0a1/tests/unit/test_repair_strategies.py +229 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
lint-and-test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
strategy:
|
|
11
|
+
fail-fast: false
|
|
12
|
+
matrix:
|
|
13
|
+
python-version:
|
|
14
|
+
- "3.11"
|
|
15
|
+
- "3.12"
|
|
16
|
+
- "3.13"
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- name: Check out repository
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
cache: pip
|
|
27
|
+
|
|
28
|
+
- name: Install package and development dependencies
|
|
29
|
+
run: |
|
|
30
|
+
python -m pip install --upgrade pip
|
|
31
|
+
python -m pip install -e ".[dev]"
|
|
32
|
+
|
|
33
|
+
- name: Run Ruff
|
|
34
|
+
run: python -m ruff check .
|
|
35
|
+
|
|
36
|
+
- name: Run mypy
|
|
37
|
+
run: python -m mypy .
|
|
38
|
+
|
|
39
|
+
- name: Run pytest
|
|
40
|
+
run: python -m pytest
|
|
41
|
+
|
|
42
|
+
- name: Verify import
|
|
43
|
+
run: python -c "import confident_extract"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout repository
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.11"
|
|
23
|
+
|
|
24
|
+
- name: Install build dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install build
|
|
28
|
+
|
|
29
|
+
- name: Build package
|
|
30
|
+
run: python -m build
|
|
31
|
+
|
|
32
|
+
- name: Publish package
|
|
33
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
- id: ruff-check
|
|
5
|
+
name: ruff check
|
|
6
|
+
entry: python -m ruff check .
|
|
7
|
+
language: system
|
|
8
|
+
pass_filenames: false
|
|
9
|
+
types_or: [python, pyi]
|
|
10
|
+
- id: ruff-format
|
|
11
|
+
name: ruff format
|
|
12
|
+
entry: python -m ruff format --check .
|
|
13
|
+
language: system
|
|
14
|
+
pass_filenames: false
|
|
15
|
+
types_or: [python, pyi]
|
|
16
|
+
- id: mypy
|
|
17
|
+
name: mypy
|
|
18
|
+
entry: python -m mypy .
|
|
19
|
+
language: system
|
|
20
|
+
pass_filenames: false
|
|
21
|
+
types_or: [python, pyi]
|
|
22
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
## API Philosophy
|
|
4
|
+
1. Every public function must have a clean, minimal call signature
|
|
5
|
+
2. `result = extract(text, schema=Invoice)` is the gold standard
|
|
6
|
+
3. Never chain abstraction layers: `client.router.pipeline.manager.extract()` is forbidden
|
|
7
|
+
4. No required kwargs beyond text and schema on extract()
|
|
8
|
+
|
|
9
|
+
## Performance Rules
|
|
10
|
+
1. Always attempt orjson fast-path before any repair logic
|
|
11
|
+
2. Validation overhead must stay under 10ms — benchmark every PR
|
|
12
|
+
3. msgspec is the primary validator — Pydantic only in the optional bridge
|
|
13
|
+
4. Never add blocking I/O inside the hot path
|
|
14
|
+
5. Async variants must use asyncio natively, never run_in_executor() on sync code
|
|
15
|
+
|
|
16
|
+
## Code Quality Rules
|
|
17
|
+
1. Type hints required on every function signature, no exceptions
|
|
18
|
+
2. Docstrings required on all public functions (Google style)
|
|
19
|
+
3. `ruff check .` and `mypy .` must pass before any commit
|
|
20
|
+
4. Unit test required for every new module — 90%+ coverage target
|
|
21
|
+
5. No bare except clauses — always catch specific exception types
|
|
22
|
+
|
|
23
|
+
## Dependency Rules
|
|
24
|
+
1. Never introduce a new hard dependency without explicit approval
|
|
25
|
+
2. Heavy dependencies (pydantic, openai, anthropic) are optional extras only
|
|
26
|
+
3. No dependency on LangChain, LlamaIndex, or any agent framework
|
|
27
|
+
4. If you need JSON parsing, use orjson — never stdlib json in hot paths
|
|
28
|
+
|
|
29
|
+
## DO NOT BUILD
|
|
30
|
+
1. Chatbot or agent framework
|
|
31
|
+
2. Prompt template management
|
|
32
|
+
3. Workflow / DAG orchestrator
|
|
33
|
+
4. Vector store or retrieval system
|
|
34
|
+
5. GUI, web interface, or dashboard
|
|
35
|
+
|
|
36
|
+
## Security Rules
|
|
37
|
+
1. No eval() or exec() on LLM output under any circumstances
|
|
38
|
+
2. Sanitize all malformed inputs before processing
|
|
39
|
+
3. Isolate provider integrations cleanly — no cross-contamination
|
|
40
|
+
4. Deterministic parsing only — no hidden randomness in core pipeline
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project follows semantic versioning, using PEP 440 pre-release tags (for example `0.1.0a1`) for public alpha cuts.
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0a1] - 2026-05-12
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- Sync public API at the package root: `extract`, `ExtractionResult`, `MsgspecValidationError`, and `ValidationError`
|
|
14
|
+
- Deterministic preprocessing, repair, and `msgspec` validation layers
|
|
15
|
+
- Minimal synchronous extraction pipeline with repair metadata and latency measurement
|
|
16
|
+
- Unit coverage for preprocess, repair strategies, repair engine, validator adapter, extractor, and package-root API
|
|
17
|
+
- Initial local benchmark suite for preprocess, repair, validation, and full extraction
|
|
18
|
+
- Public alpha OSS release artifacts: README, contribution guide, changelog, and issue-template config
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
|
|
22
|
+
- Package metadata updated for a public alpha release target
|
|
23
|
+
- Development extras now include release tooling for `python -m build` and `twine check`
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
## Scope
|
|
4
|
+
|
|
5
|
+
`confident-extract` is a narrow library for deterministic structured extraction from noisy JSON-like text.
|
|
6
|
+
|
|
7
|
+
Before opening a change:
|
|
8
|
+
|
|
9
|
+
- read [AGENTS.md](AGENTS.md)
|
|
10
|
+
- keep the public API minimal
|
|
11
|
+
- do not add provider logic, retries, or optional bridges unless the task explicitly requires them
|
|
12
|
+
- avoid hidden fallbacks and blocking I/O in the hot path
|
|
13
|
+
|
|
14
|
+
## Development setup
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
python -m venv .venv
|
|
18
|
+
source .venv/bin/activate
|
|
19
|
+
python -m pip install --upgrade pip
|
|
20
|
+
python -m pip install -e ".[dev]"
|
|
21
|
+
pre-commit install
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quality gates
|
|
25
|
+
|
|
26
|
+
Run these before opening a PR:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python -m ruff check .
|
|
30
|
+
python -m mypy .
|
|
31
|
+
python -m pytest
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Benchmarks
|
|
35
|
+
|
|
36
|
+
Run the current local benchmark suite with:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
python -m pytest benchmarks/test_extract_benchmarks.py
|
|
40
|
+
python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-sort=mean
|
|
41
|
+
python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Notes:
|
|
45
|
+
|
|
46
|
+
- benchmark numbers are local measurements unless explicitly captured from CI
|
|
47
|
+
- do not make public performance claims from a single local run
|
|
48
|
+
- if a change touches preprocessing, repair, validation, or extraction orchestration, rerun the benchmark suite
|
|
49
|
+
|
|
50
|
+
## Release checks
|
|
51
|
+
|
|
52
|
+
Before cutting a release candidate or alpha tag:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python -m build
|
|
56
|
+
twine check dist/*
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
If `dist/` or `build/` is left over from a prior run, remove both directories before rebuilding.
|
|
60
|
+
|
|
61
|
+
## Pull requests
|
|
62
|
+
|
|
63
|
+
Keep PRs narrow and traceable:
|
|
64
|
+
|
|
65
|
+
- one feature area or one layer at a time
|
|
66
|
+
- include tests for every changed module
|
|
67
|
+
- explain behavior changes and benchmark impact when hot-path code changes
|
|
68
|
+
|
|
69
|
+
## Issues
|
|
70
|
+
|
|
71
|
+
Use GitHub issues for concrete bugs, regressions, or feature requests.
|
|
72
|
+
|
|
73
|
+
For usage questions or local development setup, start from the README and this guide before filing an issue.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 confident-extract contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: confident-extract
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Summary: Deterministic structured extraction from noisy JSON-like model output.
|
|
5
|
+
Project-URL: Homepage, https://github.com/hitarthbuilds/confident-extract
|
|
6
|
+
Project-URL: Repository, https://github.com/hitarthbuilds/confident-extract
|
|
7
|
+
Project-URL: Issues, https://github.com/hitarthbuilds/confident-extract/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/hitarthbuilds/confident-extract/blob/master/CHANGELOG.md
|
|
9
|
+
Project-URL: CI, https://github.com/hitarthbuilds/confident-extract/actions/workflows/ci.yml
|
|
10
|
+
Author: Hitarth Desai
|
|
11
|
+
Maintainer: Hitarth Desai
|
|
12
|
+
License: MIT
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: json repair,llm,msgspec,orjson,schema validation,structured extraction
|
|
15
|
+
Classifier: Development Status :: 3 - Alpha
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Classifier: Topic :: Text Processing
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.11
|
|
29
|
+
Requires-Dist: msgspec<1.0,>=0.18
|
|
30
|
+
Requires-Dist: orjson<4.0,>=3.9
|
|
31
|
+
Provides-Extra: anthropic
|
|
32
|
+
Requires-Dist: anthropic<1.0,>=0.25; extra == 'anthropic'
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: build<2.0,>=1.2; extra == 'dev'
|
|
35
|
+
Requires-Dist: mypy<2.0,>=1.10; extra == 'dev'
|
|
36
|
+
Requires-Dist: pre-commit<5.0,>=3.7; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-asyncio<1.0,>=0.23; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-benchmark<6.0,>=5.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest<9.0,>=8.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff<1.0,>=0.4; extra == 'dev'
|
|
41
|
+
Requires-Dist: twine<7.0,>=5.1; extra == 'dev'
|
|
42
|
+
Provides-Extra: ollama
|
|
43
|
+
Requires-Dist: ollama<1.0,>=0.2; extra == 'ollama'
|
|
44
|
+
Provides-Extra: openai
|
|
45
|
+
Requires-Dist: openai<2.0,>=1.0; extra == 'openai'
|
|
46
|
+
Provides-Extra: pydantic
|
|
47
|
+
Requires-Dist: pydantic<3.0,>=2.0; extra == 'pydantic'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
# confident-extract
|
|
51
|
+
|
|
52
|
+
[![CI](https://github.com/hitarthbuilds/confident-extract/actions/workflows/ci.yml/badge.svg)](https://github.com/hitarthbuilds/confident-extract/actions/workflows/ci.yml)
|
|
53
|
+
[PyPI version](https://pypi.org/project/confident-extract/)
|
|
54
|
+
[Supported Python versions](https://pypi.org/project/confident-extract/)
|
|
55
|
+
[License: MIT](https://github.com/hitarthbuilds/confident-extract/blob/master/LICENSE)
|
|
56
|
+
|
|
57
|
+
`confident-extract` is a small Python library for deterministic structured extraction from noisy JSON-like model output.
|
|
58
|
+
|
|
59
|
+
The current public alpha surface is synchronous and `msgspec`-first:
|
|
60
|
+
|
|
61
|
+
- `from confident_extract import extract`
|
|
62
|
+
- deterministic preprocessing and JSON repair
|
|
63
|
+
- strict `msgspec.Struct` validation
|
|
64
|
+
- lightweight result metadata around the validated output
|
|
65
|
+
|
|
66
|
+
## Project overview
|
|
67
|
+
|
|
68
|
+
The library is built for the common case where an upstream model or OCR system returns JSON-like text that is close to valid, but not always valid enough to parse or validate directly.
|
|
69
|
+
|
|
70
|
+
The current sync pipeline is:
|
|
71
|
+
|
|
72
|
+
1. preprocess raw text
|
|
73
|
+
2. repair malformed JSON conservatively
|
|
74
|
+
3. validate against a `msgspec.Struct` schema
|
|
75
|
+
4. return a typed `ExtractionResult`
|
|
76
|
+
|
|
77
|
+
The package does not currently include provider adapters, retries, async APIs, streaming, confidence scoring, or a pydantic bridge.
|
|
78
|
+
|
|
79
|
+
## Install
|
|
80
|
+
|
|
81
|
+
Install the published package:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
python -m pip install confident-extract
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Install for local development:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
python -m pip install -e ".[dev]"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Quickstart example
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import msgspec
|
|
97
|
+
|
|
98
|
+
from confident_extract import extract
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class Invoice(msgspec.Struct):
|
|
102
|
+
invoice_id: int
|
|
103
|
+
status: str
|
|
104
|
+
total_cents: int
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
result = extract(
|
|
108
|
+
text='{"invoice_id": 42, "status": "paid", "total_cents": 1999}',
|
|
109
|
+
schema=Invoice,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
assert result.data == Invoice(invoice_id=42, status="paid", total_cents=1999)
|
|
113
|
+
assert result.repair_applied is False
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Malformed JSON repair example
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
import msgspec
|
|
120
|
+
|
|
121
|
+
from confident_extract import extract
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class Invoice(msgspec.Struct):
|
|
125
|
+
invoice_id: int
|
|
126
|
+
status: str
|
|
127
|
+
total_cents: int
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
raw = "{invoice_id: 42, status: 'paid', total_cents: 1999,}"
|
|
131
|
+
result = extract(text=raw, schema=Invoice)
|
|
132
|
+
|
|
133
|
+
assert result.data.status == "paid"
|
|
134
|
+
assert result.repair_applied is True
|
|
135
|
+
assert result.repaired_text == (
|
|
136
|
+
'{"invoice_id": 42, "status": "paid", "total_cents": 1999}'
|
|
137
|
+
)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Nested schema example
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
import msgspec
|
|
144
|
+
|
|
145
|
+
from confident_extract import extract
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class Contact(msgspec.Struct):
|
|
149
|
+
email: str
|
|
150
|
+
phone: str | None = None
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class Customer(msgspec.Struct):
|
|
154
|
+
name: str
|
|
155
|
+
contact: Contact
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class Invoice(msgspec.Struct):
|
|
159
|
+
invoice_id: int
|
|
160
|
+
customer: Customer
|
|
161
|
+
tags: list[str]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
raw = """
|
|
165
|
+
{
|
|
166
|
+
"invoice_id": 7,
|
|
167
|
+
"customer": {
|
|
168
|
+
"name": "Acme",
|
|
169
|
+
"contact": {"email": "ops@example.com", "phone": "123"}
|
|
170
|
+
},
|
|
171
|
+
"tags": ["paid", "net30"]
|
|
172
|
+
}
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
result = extract(text=raw, schema=Invoice)
|
|
176
|
+
|
|
177
|
+
assert result.data.customer.contact.email == "ops@example.com"
|
|
178
|
+
assert result.data.tags == ["paid", "net30"]
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Benchmark snapshot
|
|
182
|
+
|
|
183
|
+
Current local measurements were captured on May 12, 2026 with Python 3.13.5 using:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
These are local measurements only. They are useful for regression tracking, not public performance claims.
|
|
190
|
+
|
|
191
|
+
| Path | Scenario | p50 | p99 | Throughput |
|
|
192
|
+
| --- | --- | ---: | ---: | ---: |
|
|
193
|
+
| `preprocess()` | already-valid JSON | `2.17 us` | `2.46 us` | `462k ops/s` |
|
|
194
|
+
| `preprocess()` | fenced ~10KB payload | `4.25 us` | `13.04 us` | `215k ops/s` |
|
|
195
|
+
| `repair()` | valid fast path | `6.21 us` | `31.25 us` | `145k ops/s` |
|
|
196
|
+
| `repair()` | trailing comma repair | `123.50 us` | `328.29 us` | `7.5k ops/s` |
|
|
197
|
+
| `repair()` | multi-strategy repair | `611.38 us` | `965.96 us` | `1.7k ops/s` |
|
|
198
|
+
| `validate_with_msgspec()` | nested decoded payload | `3.00 us` | `3.21 us` | `333k ops/s` |
|
|
199
|
+
| `validate_with_msgspec()` | ~10KB decoded payload | `27.75 us` | `39.00 us` | `34.6k ops/s` |
|
|
200
|
+
| `extract()` | valid fast path | `7.42 us` | `13.88 us` | `123k ops/s` |
|
|
201
|
+
| `extract()` | trailing comma repair | `73.50 us` | `172.63 us` | `13.2k ops/s` |
|
|
202
|
+
| `extract()` | multi-strategy nested repair | `406.83 us` | `820.83 us` | `2.2k ops/s` |
|
|
203
|
+
| `extract()` | ~10KB nested payload | `92.83 us` | `167.75 us` | `10.5k ops/s` |
|
|
204
|
+
| `extract()` | repeated ~10KB throughput | `94.69 us` | `96.21 us` | `10.6k ops/s` |
|
|
205
|
+
|
|
206
|
+
### Benchmark caveats
|
|
207
|
+
|
|
208
|
+
- The current suite is local, deterministic, and provider-free.
|
|
209
|
+
- Outlier behavior will vary by machine, Python version, and thermal state.
|
|
210
|
+
- The current repo does not yet publish benchmark baselines from CI runners.
|
|
211
|
+
- Instructor, Guardrails, and LangChain comparisons are planned, but not yet implemented in this repository.
|
|
212
|
+
|
|
213
|
+
### How to run benchmarks
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
python -m pytest benchmarks/test_extract_benchmarks.py
|
|
217
|
+
python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-sort=mean
|
|
218
|
+
python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Architecture flow diagram
|
|
222
|
+
|
|
223
|
+
```text
|
|
224
|
+
raw input text
|
|
225
|
+
|
|
|
226
|
+
v
|
|
227
|
+
preprocess(text)
|
|
228
|
+
|
|
|
229
|
+
v
|
|
230
|
+
repair(preprocessed_text)
|
|
231
|
+
|
|
|
232
|
+
v
|
|
233
|
+
validate_with_msgspec(parsed payload, schema)
|
|
234
|
+
|
|
|
235
|
+
v
|
|
236
|
+
ExtractionResult[T]
|
|
237
|
+
- data
|
|
238
|
+
- repair_applied
|
|
239
|
+
- repair_attempts
|
|
240
|
+
- raw_input
|
|
241
|
+
- repaired_text
|
|
242
|
+
- latency_ms
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Feature list
|
|
246
|
+
|
|
247
|
+
- Minimal sync API: `extract(text, schema=Invoice)`
|
|
248
|
+
- Conservative preprocessing for markdown fences, whitespace normalization, and escaped JSON
|
|
249
|
+
- Deterministic JSON repair for trailing commas, unterminated containers, single quotes, and bare keys
|
|
250
|
+
- Strict `msgspec.Struct` validation with field-path extraction on failures
|
|
251
|
+
- Frozen, slotted extraction result contract
|
|
252
|
+
- Package-root exports for the public sync API
|
|
253
|
+
- Local benchmark coverage for preprocess, repair, validation, and full extraction
|
|
254
|
+
|
|
255
|
+
## Roadmap
|
|
256
|
+
|
|
257
|
+
- Stabilize the sync extraction API for `0.1.x`
|
|
258
|
+
- Add the optional pydantic bridge outside the hot path
|
|
259
|
+
- Add async and streaming APIs
|
|
260
|
+
- Add provider adapters for live model integrations
|
|
261
|
+
- Add confidence scoring and retry routing
|
|
262
|
+
- Add reproducible cross-library benchmark comparisons
|
|
263
|
+
|
|
264
|
+
## Contribution and dev setup
|
|
265
|
+
|
|
266
|
+
`AGENTS.md` is the repo-level implementation contract. Read it before changing the code.
|
|
267
|
+
|
|
268
|
+
Local setup:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
python -m venv .venv
|
|
272
|
+
source .venv/bin/activate
|
|
273
|
+
python -m pip install --upgrade pip
|
|
274
|
+
python -m pip install -e ".[dev]"
|
|
275
|
+
pre-commit install
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
Quality gates:
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
python -m ruff check .
|
|
282
|
+
python -m mypy .
|
|
283
|
+
python -m pytest
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Benchmark and release checks:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
python -m pytest benchmarks/test_extract_benchmarks.py
|
|
290
|
+
python -m build
|
|
291
|
+
twine check dist/*
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
For contributor expectations, issue filing guidance, and release checks, see [CONTRIBUTING.md](https://github.com/hitarthbuilds/confident-extract/blob/master/CONTRIBUTING.md).
|