confident-extract 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. confident_extract-0.1.0a1/.github/ISSUE_TEMPLATE/config.yml +5 -0
  2. confident_extract-0.1.0a1/.github/workflows/ci.yml +43 -0
  3. confident_extract-0.1.0a1/.github/workflows/publish.yml +33 -0
  4. confident_extract-0.1.0a1/.gitignore +9 -0
  5. confident_extract-0.1.0a1/.pre-commit-config.yaml +22 -0
  6. confident_extract-0.1.0a1/AGENTS.md +40 -0
  7. confident_extract-0.1.0a1/CHANGELOG.md +23 -0
  8. confident_extract-0.1.0a1/CONTRIBUTING.md +73 -0
  9. confident_extract-0.1.0a1/LICENSE +22 -0
  10. confident_extract-0.1.0a1/PKG-INFO +294 -0
  11. confident_extract-0.1.0a1/README.md +245 -0
  12. confident_extract-0.1.0a1/benchmarks/__init__.py +1 -0
  13. confident_extract-0.1.0a1/benchmarks/test_extract_benchmarks.py +270 -0
  14. confident_extract-0.1.0a1/confident_extract/__init__.py +23 -0
  15. confident_extract-0.1.0a1/confident_extract/confidence/__init__.py +1 -0
  16. confident_extract-0.1.0a1/confident_extract/core/__init__.py +1 -0
  17. confident_extract-0.1.0a1/confident_extract/core/extractor.py +60 -0
  18. confident_extract-0.1.0a1/confident_extract/core/preprocessor.py +122 -0
  19. confident_extract-0.1.0a1/confident_extract/core/result.py +31 -0
  20. confident_extract-0.1.0a1/confident_extract/providers/__init__.py +1 -0
  21. confident_extract-0.1.0a1/confident_extract/py.typed +1 -0
  22. confident_extract-0.1.0a1/confident_extract/repair/__init__.py +1 -0
  23. confident_extract-0.1.0a1/confident_extract/repair/engine.py +108 -0
  24. confident_extract-0.1.0a1/confident_extract/repair/strategies.py +369 -0
  25. confident_extract-0.1.0a1/confident_extract/retry/__init__.py +1 -0
  26. confident_extract-0.1.0a1/confident_extract/validators/__init__.py +1 -0
  27. confident_extract-0.1.0a1/confident_extract/validators/msgspec_adapter.py +165 -0
  28. confident_extract-0.1.0a1/pyproject.toml +110 -0
  29. confident_extract-0.1.0a1/tests/__init__.py +1 -0
  30. confident_extract-0.1.0a1/tests/fixtures/__init__.py +1 -0
  31. confident_extract-0.1.0a1/tests/integration/__init__.py +1 -0
  32. confident_extract-0.1.0a1/tests/unit/__init__.py +1 -0
  33. confident_extract-0.1.0a1/tests/unit/test_extractor.py +185 -0
  34. confident_extract-0.1.0a1/tests/unit/test_msgspec_adapter.py +286 -0
  35. confident_extract-0.1.0a1/tests/unit/test_package.py +30 -0
  36. confident_extract-0.1.0a1/tests/unit/test_preprocessor.py +140 -0
  37. confident_extract-0.1.0a1/tests/unit/test_public_api.py +121 -0
  38. confident_extract-0.1.0a1/tests/unit/test_repair_engine.py +205 -0
  39. confident_extract-0.1.0a1/tests/unit/test_repair_strategies.py +229 -0
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: true
2
+ contact_links:
3
+ - name: Contribution guide
4
+ url: https://github.com/hitarthbuilds/confident-extract/blob/master/CONTRIBUTING.md
5
+ about: Read local setup, quality gates, benchmark guidance, and release checks before opening a change.
@@ -0,0 +1,43 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+
7
+ jobs:
8
+ lint-and-test:
9
+ runs-on: ubuntu-latest
10
+ strategy:
11
+ fail-fast: false
12
+ matrix:
13
+ python-version:
14
+ - "3.11"
15
+ - "3.12"
16
+ - "3.13"
17
+
18
+ steps:
19
+ - name: Check out repository
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+ cache: pip
27
+
28
+ - name: Install package and development dependencies
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ python -m pip install -e ".[dev]"
32
+
33
+ - name: Run Ruff
34
+ run: python -m ruff check .
35
+
36
+ - name: Run mypy
37
+ run: python -m mypy .
38
+
39
+ - name: Run pytest
40
+ run: python -m pytest
41
+
42
+ - name: Verify import
43
+ run: python -c "import confident_extract"
@@ -0,0 +1,33 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+ contents: read
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout repository
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.11"
23
+
24
+ - name: Install build dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install build
28
+
29
+ - name: Build package
30
+ run: python -m build
31
+
32
+ - name: Publish package
33
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ .env
5
+ .pytest_cache/
6
+ .mypy_cache/
7
+ dist/
8
+ build/
9
+ *.egg-info/
@@ -0,0 +1,22 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: ruff-check
5
+ name: ruff check
6
+ entry: python -m ruff check .
7
+ language: system
8
+ pass_filenames: false
9
+ types_or: [python, pyi]
10
+ - id: ruff-format
11
+ name: ruff format
12
+ entry: python -m ruff format --check .
13
+ language: system
14
+ pass_filenames: false
15
+ types_or: [python, pyi]
16
+ - id: mypy
17
+ name: mypy
18
+ entry: python -m mypy .
19
+ language: system
20
+ pass_filenames: false
21
+ types_or: [python, pyi]
22
+
@@ -0,0 +1,40 @@
1
+ # AGENTS.md
2
+
3
+ ## API Philosophy
4
+ 1. Every public function must have a clean, minimal call signature
5
+ 2. result = extract(text, schema=Invoice) is the gold standard
6
+ 3. Never chain abstraction layers: client.router.pipeline.manager.extract() is forbidden
7
+ 4. No required kwargs beyond text and schema on extract()
8
+
9
+ ## Performance Rules
10
+ 1. Always attempt orjson fast-path before any repair logic
11
+ 2. Validation overhead must stay under 10ms — benchmark every PR
12
+ 3. msgspec is the primary validator — Pydantic only in the optional bridge
13
+ 4. Never add blocking I/O inside the hot path
14
+ 5. Async variants must use asyncio natively, never run_in_executor() on sync code
15
+
16
+ ## Code Quality Rules
17
+ 1. Type hints required on every function signature, no exceptions
18
+ 2. Docstrings required on all public functions (Google style)
19
+ 3. ruff check . and mypy . must pass before any commit
20
+ 4. Unit test required for every new module — 90%+ coverage target
21
+ 5. No bare except clauses — always catch specific exception types
22
+
23
+ ## Dependency Rules
24
+ 1. Never introduce a new hard dependency without explicit approval
25
+ 2. Heavy dependencies (pydantic, openai, anthropic) are optional extras only
26
+ 3. No dependency on LangChain, LlamaIndex, or any agent framework
27
+ 4. If you need JSON parsing, use orjson — never stdlib json in hot paths
28
+
29
+ ## DO NOT BUILD
30
+ 1. Chatbot or agent framework
31
+ 2. Prompt template management
32
+ 3. Workflow / DAG orchestrator
33
+ 4. Vector store or retrieval system
34
+ 5. GUI, web interface, or dashboard
35
+
36
+ ## Security Rules
37
+ 1. No eval() or exec() on LLM output under any circumstances
38
+ 2. Sanitize all malformed inputs before processing
39
+ 3. Isolate provider integrations cleanly — no cross-contamination
40
+ 4. Deterministic parsing only — no hidden randomness in core pipeline
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on Keep a Changelog, and the project follows semantic versioning, using pre-release tags for public alpha cuts.
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.1.0a1] - 2026-05-12
10
+
11
+ ### Added
12
+
13
+ - Sync public API at the package root: `extract`, `ExtractionResult`, `MsgspecValidationError`, and `ValidationError`
14
+ - Deterministic preprocessing, repair, and `msgspec` validation layers
15
+ - Minimal synchronous extraction pipeline with repair metadata and latency measurement
16
+ - Unit coverage for preprocess, repair strategies, repair engine, validator adapter, extractor, and package-root API
17
+ - Initial local benchmark suite for preprocess, repair, validation, and full extraction
18
+ - Public alpha OSS release artifacts: README, contribution guide, changelog, and issue-template config
19
+
20
+ ### Changed
21
+
22
+ - Package metadata updated for a public alpha release target
23
+ - Development extras now include release tooling for `python -m build` and `twine check`
@@ -0,0 +1,73 @@
1
+ # Contributing
2
+
3
+ ## Scope
4
+
5
+ `confident-extract` is a narrow library for deterministic structured extraction from noisy JSON-like text.
6
+
7
+ Before opening a change:
8
+
9
+ - read [AGENTS.md](AGENTS.md)
10
+ - keep the public API minimal
11
+ - do not add provider logic, retries, or optional bridges unless the task explicitly requires them
12
+ - avoid hidden fallbacks and blocking I/O in the hot path
13
+
14
+ ## Development setup
15
+
16
+ ```bash
17
+ python -m venv .venv
18
+ source .venv/bin/activate
19
+ python -m pip install --upgrade pip
20
+ python -m pip install -e ".[dev]"
21
+ pre-commit install
22
+ ```
23
+
24
+ ## Quality gates
25
+
26
+ Run these before opening a PR:
27
+
28
+ ```bash
29
+ python -m ruff check .
30
+ python -m mypy .
31
+ python -m pytest
32
+ ```
33
+
34
+ ## Benchmarks
35
+
36
+ Run the current local benchmark suite with:
37
+
38
+ ```bash
39
+ python -m pytest benchmarks/test_extract_benchmarks.py
40
+ python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-sort=mean
41
+ python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
42
+ ```
43
+
44
+ Notes:
45
+
46
+ - benchmark numbers are local measurements unless explicitly captured from CI
47
+ - do not make public performance claims from a single local run
48
+ - if a change touches preprocessing, repair, validation, or extraction orchestration, rerun the benchmark suite
49
+
50
+ ## Release checks
51
+
52
+ Before cutting a release candidate or alpha tag:
53
+
54
+ ```bash
55
+ python -m build
56
+ twine check dist/*
57
+ ```
58
+
59
+ If either `dist/` or `build/` already exists from a prior run, remove it and rebuild.
60
+
61
+ ## Pull requests
62
+
63
+ Keep PRs narrow and traceable:
64
+
65
+ - one feature area or one layer at a time
66
+ - include tests for every changed module
67
+ - explain behavior changes and benchmark impact when hot-path code changes
68
+
69
+ ## Issues
70
+
71
+ Use GitHub issues for concrete bugs, regressions, or feature requests.
72
+
73
+ For usage questions or local development setup, start from the README and this guide before filing an issue.
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 confident-extract contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
@@ -0,0 +1,294 @@
1
+ Metadata-Version: 2.4
2
+ Name: confident-extract
3
+ Version: 0.1.0a1
4
+ Summary: Deterministic structured extraction from noisy JSON-like model output.
5
+ Project-URL: Homepage, https://github.com/hitarthbuilds/confident-extract
6
+ Project-URL: Repository, https://github.com/hitarthbuilds/confident-extract
7
+ Project-URL: Issues, https://github.com/hitarthbuilds/confident-extract/issues
8
+ Project-URL: Changelog, https://github.com/hitarthbuilds/confident-extract/blob/master/CHANGELOG.md
9
+ Project-URL: CI, https://github.com/hitarthbuilds/confident-extract/actions/workflows/ci.yml
10
+ Author: Hitarth Desai
11
+ Maintainer: Hitarth Desai
12
+ License: MIT
13
+ License-File: LICENSE
14
+ Keywords: json repair,llm,msgspec,orjson,schema validation,structured extraction
15
+ Classifier: Development Status :: 3 - Alpha
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Programming Language :: Python :: Implementation :: CPython
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Topic :: Text Processing
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.11
29
+ Requires-Dist: msgspec<1.0,>=0.18
30
+ Requires-Dist: orjson<4.0,>=3.9
31
+ Provides-Extra: anthropic
32
+ Requires-Dist: anthropic<1.0,>=0.25; extra == 'anthropic'
33
+ Provides-Extra: dev
34
+ Requires-Dist: build<2.0,>=1.2; extra == 'dev'
35
+ Requires-Dist: mypy<2.0,>=1.10; extra == 'dev'
36
+ Requires-Dist: pre-commit<5.0,>=3.7; extra == 'dev'
37
+ Requires-Dist: pytest-asyncio<1.0,>=0.23; extra == 'dev'
38
+ Requires-Dist: pytest-benchmark<6.0,>=5.0; extra == 'dev'
39
+ Requires-Dist: pytest<9.0,>=8.0; extra == 'dev'
40
+ Requires-Dist: ruff<1.0,>=0.4; extra == 'dev'
41
+ Requires-Dist: twine<7.0,>=5.1; extra == 'dev'
42
+ Provides-Extra: ollama
43
+ Requires-Dist: ollama<1.0,>=0.2; extra == 'ollama'
44
+ Provides-Extra: openai
45
+ Requires-Dist: openai<2.0,>=1.0; extra == 'openai'
46
+ Provides-Extra: pydantic
47
+ Requires-Dist: pydantic<3.0,>=2.0; extra == 'pydantic'
48
+ Description-Content-Type: text/markdown
49
+
50
+ # confident-extract
51
+
52
+ [![CI](https://img.shields.io/github/actions/workflow/status/hitarthbuilds/confident-extract/ci.yml?branch=master&label=CI)](https://github.com/hitarthbuilds/confident-extract/actions/workflows/ci.yml)
53
+ [![PyPI](https://img.shields.io/badge/PyPI-0.1.0a1%20pending-blue)](https://pypi.org/project/confident-extract/)
54
+ [![Python](https://img.shields.io/badge/python-3.11%2B-blue)](https://pypi.org/project/confident-extract/)
55
+ [![License](https://img.shields.io/github/license/hitarthbuilds/confident-extract)](https://github.com/hitarthbuilds/confident-extract/blob/master/LICENSE)
56
+
57
+ `confident-extract` is a small Python library for deterministic structured extraction from noisy JSON-like model output.
58
+
59
+ The current public alpha surface is synchronous and `msgspec`-first:
60
+
61
+ - `from confident_extract import extract`
62
+ - deterministic preprocessing and JSON repair
63
+ - strict `msgspec.Struct` validation
64
+ - lightweight result metadata around the validated output
65
+
66
+ ## Project overview
67
+
68
+ The library is built for the common case where an upstream model or OCR system returns JSON-like text that is close to valid, but not always valid enough to parse or validate directly.
69
+
70
+ The current sync pipeline is:
71
+
72
+ 1. preprocess raw text
73
+ 2. repair malformed JSON conservatively
74
+ 3. validate against a `msgspec.Struct` schema
75
+ 4. return a typed `ExtractionResult`
76
+
77
+ The package does not currently include provider adapters, retries, async APIs, streaming, confidence scoring, or a pydantic bridge.
78
+
79
+ ## Install
80
+
81
+ Install the published package:
82
+
83
+ ```bash
84
+ python -m pip install confident-extract
85
+ ```
86
+
87
+ Install for local development:
88
+
89
+ ```bash
90
+ python -m pip install -e ".[dev]"
91
+ ```
92
+
93
+ ## Quickstart example
94
+
95
+ ```python
96
+ import msgspec
97
+
98
+ from confident_extract import extract
99
+
100
+
101
+ class Invoice(msgspec.Struct):
102
+ invoice_id: int
103
+ status: str
104
+ total_cents: int
105
+
106
+
107
+ result = extract(
108
+ text='{"invoice_id": 42, "status": "paid", "total_cents": 1999}',
109
+ schema=Invoice,
110
+ )
111
+
112
+ assert result.data == Invoice(invoice_id=42, status="paid", total_cents=1999)
113
+ assert result.repair_applied is False
114
+ ```
115
+
116
+ ## Malformed JSON repair example
117
+
118
+ ```python
119
+ import msgspec
120
+
121
+ from confident_extract import extract
122
+
123
+
124
+ class Invoice(msgspec.Struct):
125
+ invoice_id: int
126
+ status: str
127
+ total_cents: int
128
+
129
+
130
+ raw = "{invoice_id: 42, status: 'paid', total_cents: 1999,}"
131
+ result = extract(text=raw, schema=Invoice)
132
+
133
+ assert result.data.status == "paid"
134
+ assert result.repair_applied is True
135
+ assert result.repaired_text == (
136
+ '{"invoice_id": 42, "status": "paid", "total_cents": 1999}'
137
+ )
138
+ ```
139
+
140
+ ## Nested schema example
141
+
142
+ ```python
143
+ import msgspec
144
+
145
+ from confident_extract import extract
146
+
147
+
148
+ class Contact(msgspec.Struct):
149
+ email: str
150
+ phone: str | None = None
151
+
152
+
153
+ class Customer(msgspec.Struct):
154
+ name: str
155
+ contact: Contact
156
+
157
+
158
+ class Invoice(msgspec.Struct):
159
+ invoice_id: int
160
+ customer: Customer
161
+ tags: list[str]
162
+
163
+
164
+ raw = """
165
+ {
166
+ "invoice_id": 7,
167
+ "customer": {
168
+ "name": "Acme",
169
+ "contact": {"email": "ops@example.com", "phone": "123"}
170
+ },
171
+ "tags": ["paid", "net30"]
172
+ }
173
+ """
174
+
175
+ result = extract(text=raw, schema=Invoice)
176
+
177
+ assert result.data.customer.contact.email == "ops@example.com"
178
+ assert result.data.tags == ["paid", "net30"]
179
+ ```
180
+
181
+ ## Benchmark snapshot
182
+
183
+ Current local measurements were captured on May 12, 2026 with Python 3.13.5 using:
184
+
185
+ ```bash
186
+ python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
187
+ ```
188
+
189
+ These are local measurements only. They are useful for regression tracking, not public performance claims.
190
+
191
+ | Path | Scenario | p50 | p99 | Throughput |
192
+ | --- | --- | ---: | ---: | ---: |
193
+ | `preprocess()` | already-valid JSON | `2.17 us` | `2.46 us` | `462k ops/s` |
194
+ | `preprocess()` | fenced ~10KB payload | `4.25 us` | `13.04 us` | `215k ops/s` |
195
+ | `repair()` | valid fast path | `6.21 us` | `31.25 us` | `145k ops/s` |
196
+ | `repair()` | trailing comma repair | `123.50 us` | `328.29 us` | `7.5k ops/s` |
197
+ | `repair()` | multi-strategy repair | `611.38 us` | `965.96 us` | `1.7k ops/s` |
198
+ | `validate_with_msgspec()` | nested decoded payload | `3.00 us` | `3.21 us` | `333k ops/s` |
199
+ | `validate_with_msgspec()` | ~10KB decoded payload | `27.75 us` | `39.00 us` | `34.6k ops/s` |
200
+ | `extract()` | valid fast path | `7.42 us` | `13.88 us` | `123k ops/s` |
201
+ | `extract()` | trailing comma repair | `73.50 us` | `172.63 us` | `13.2k ops/s` |
202
+ | `extract()` | multi-strategy nested repair | `406.83 us` | `820.83 us` | `2.2k ops/s` |
203
+ | `extract()` | ~10KB nested payload | `92.83 us` | `167.75 us` | `10.5k ops/s` |
204
+ | `extract()` | repeated ~10KB throughput | `94.69 us` | `96.21 us` | `10.6k ops/s` |
205
+
206
+ ### Benchmark caveats
207
+
208
+ - The current suite is local, deterministic, and provider-free.
209
+ - Outlier behavior will vary by machine, Python version, and thermal state.
210
+ - The current repo does not yet publish benchmark baselines from CI runners.
211
+ - Instructor, Guardrails, and LangChain comparisons are planned, but not yet implemented in this repository.
212
+
213
+ ### How to run benchmarks
214
+
215
+ ```bash
216
+ python -m pytest benchmarks/test_extract_benchmarks.py
217
+ python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-sort=mean
218
+ python -m pytest benchmarks/test_extract_benchmarks.py --benchmark-json /tmp/confident_extract_benchmarks.json
219
+ ```
220
+
221
+ ## Architecture flow diagram
222
+
223
+ ```text
224
+ raw input text
225
+ |
226
+ v
227
+ preprocess(text)
228
+ |
229
+ v
230
+ repair(preprocessed_text)
231
+ |
232
+ v
233
+ validate_with_msgspec(parsed payload, schema)
234
+ |
235
+ v
236
+ ExtractionResult[T]
237
+ - data
238
+ - repair_applied
239
+ - repair_attempts
240
+ - raw_input
241
+ - repaired_text
242
+ - latency_ms
243
+ ```
244
+
245
+ ## Feature list
246
+
247
+ - Minimal sync API: `extract(text, schema=Invoice)`
248
+ - Conservative preprocessing for markdown fences, whitespace normalization, and escaped JSON
249
+ - Deterministic JSON repair for trailing commas, unterminated containers, single quotes, and bare keys
250
+ - Strict `msgspec.Struct` validation with field-path extraction on failures
251
+ - Frozen, slotted extraction result contract
252
+ - Package-root exports for the public sync API
253
+ - Local benchmark coverage for preprocess, repair, validation, and full extraction
254
+
255
+ ## Roadmap
256
+
257
+ - Stabilize the sync extraction API for `0.1.x`
258
+ - Add the optional pydantic bridge outside the hot path
259
+ - Add async and streaming APIs
260
+ - Add provider adapters for live model integrations
261
+ - Add confidence scoring and retry routing
262
+ - Add reproducible cross-library benchmark comparisons
263
+
264
+ ## Contribution and dev setup
265
+
266
+ `AGENTS.md` is the repo-level implementation contract. Read it before changing the code.
267
+
268
+ Local setup:
269
+
270
+ ```bash
271
+ python -m venv .venv
272
+ source .venv/bin/activate
273
+ python -m pip install --upgrade pip
274
+ python -m pip install -e ".[dev]"
275
+ pre-commit install
276
+ ```
277
+
278
+ Quality gates:
279
+
280
+ ```bash
281
+ python -m ruff check .
282
+ python -m mypy .
283
+ python -m pytest
284
+ ```
285
+
286
+ Benchmark and release checks:
287
+
288
+ ```bash
289
+ python -m pytest benchmarks/test_extract_benchmarks.py
290
+ python -m build
291
+ twine check dist/*
292
+ ```
293
+
294
+ For contributor expectations, issue filing guidance, and release checks, see [CONTRIBUTING.md](https://github.com/hitarthbuilds/confident-extract/blob/master/CONTRIBUTING.md).