extractforms 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extractforms-0.1.0/.gitignore +277 -0
- extractforms-0.1.0/LICENSE +22 -0
- extractforms-0.1.0/PKG-INFO +78 -0
- extractforms-0.1.0/README.md +51 -0
- extractforms-0.1.0/docs/adr/README.md +17 -0
- extractforms-0.1.0/docs/engineering/README.md +7 -0
- extractforms-0.1.0/pyproject.toml +90 -0
- extractforms-0.1.0/src/extractforms/__init__.py +32 -0
- extractforms-0.1.0/src/extractforms/_bootstrap.py +10 -0
- extractforms-0.1.0/src/extractforms/async_runner.py +64 -0
- extractforms-0.1.0/src/extractforms/backends/__init__.py +7 -0
- extractforms-0.1.0/src/extractforms/backends/multimodal_openai.py +254 -0
- extractforms-0.1.0/src/extractforms/backends/ocr_document_intelligence.py +48 -0
- extractforms-0.1.0/src/extractforms/cli.py +150 -0
- extractforms-0.1.0/src/extractforms/dependencies.py +66 -0
- extractforms-0.1.0/src/extractforms/exceptions.py +83 -0
- extractforms-0.1.0/src/extractforms/extractor.py +646 -0
- extractforms-0.1.0/src/extractforms/logging.py +96 -0
- extractforms-0.1.0/src/extractforms/pdf_render.py +88 -0
- extractforms-0.1.0/src/extractforms/pricing.py +27 -0
- extractforms-0.1.0/src/extractforms/prompts.py +96 -0
- extractforms-0.1.0/src/extractforms/schema_store.py +138 -0
- extractforms-0.1.0/src/extractforms/settings.py +522 -0
- extractforms-0.1.0/src/extractforms/typing/__init__.py +32 -0
- extractforms-0.1.0/src/extractforms/typing/enums.py +65 -0
- extractforms-0.1.0/src/extractforms/typing/models.py +205 -0
- extractforms-0.1.0/src/extractforms/typing/protocol.py +59 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
|
|
2
|
+
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,dotenv,linux,python,windows
|
|
3
|
+
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,dotenv,linux,python,windows
|
|
4
|
+
|
|
5
|
+
### dotenv ###
|
|
6
|
+
.env
|
|
7
|
+
|
|
8
|
+
### Linux ###
|
|
9
|
+
*~
|
|
10
|
+
|
|
11
|
+
# temporary files which can be created if a process still has a handle open of a deleted file
|
|
12
|
+
.fuse_hidden*
|
|
13
|
+
|
|
14
|
+
# KDE directory preferences
|
|
15
|
+
.directory
|
|
16
|
+
|
|
17
|
+
# Linux trash folder which might appear on any partition or disk
|
|
18
|
+
.Trash-*
|
|
19
|
+
|
|
20
|
+
# .nfs files are created when an open file is removed but is still being accessed
|
|
21
|
+
.nfs*
|
|
22
|
+
|
|
23
|
+
### macOS ###
|
|
24
|
+
# General
|
|
25
|
+
.DS_Store
|
|
26
|
+
.AppleDouble
|
|
27
|
+
.LSOverride
|
|
28
|
+
|
|
29
|
+
# Icon must end with two \r
|
|
30
|
+
Icon
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Thumbnails
|
|
34
|
+
._*
|
|
35
|
+
|
|
36
|
+
# Files that might appear in the root of a volume
|
|
37
|
+
.DocumentRevisions-V100
|
|
38
|
+
.fseventsd
|
|
39
|
+
.Spotlight-V100
|
|
40
|
+
.TemporaryItems
|
|
41
|
+
.Trashes
|
|
42
|
+
.VolumeIcon.icns
|
|
43
|
+
.com.apple.timemachine.donotpresent
|
|
44
|
+
|
|
45
|
+
# Directories potentially created on remote AFP share
|
|
46
|
+
.AppleDB
|
|
47
|
+
.AppleDesktop
|
|
48
|
+
Network Trash Folder
|
|
49
|
+
Temporary Items
|
|
50
|
+
.apdisk
|
|
51
|
+
|
|
52
|
+
### macOS Patch ###
|
|
53
|
+
# iCloud generated files
|
|
54
|
+
*.icloud
|
|
55
|
+
|
|
56
|
+
### Python ###
|
|
57
|
+
# Byte-compiled / optimized / DLL files
|
|
58
|
+
__pycache__/
|
|
59
|
+
*.py[cod]
|
|
60
|
+
*$py.class
|
|
61
|
+
|
|
62
|
+
# C extensions
|
|
63
|
+
*.so
|
|
64
|
+
|
|
65
|
+
# Distribution / packaging
|
|
66
|
+
.Python
|
|
67
|
+
build/
|
|
68
|
+
develop-eggs/
|
|
69
|
+
dist/
|
|
70
|
+
downloads/
|
|
71
|
+
eggs/
|
|
72
|
+
.eggs/
|
|
73
|
+
lib/
|
|
74
|
+
lib64/
|
|
75
|
+
parts/
|
|
76
|
+
sdist/
|
|
77
|
+
var/
|
|
78
|
+
wheels/
|
|
79
|
+
share/python-wheels/
|
|
80
|
+
*.egg-info/
|
|
81
|
+
.installed.cfg
|
|
82
|
+
*.egg
|
|
83
|
+
MANIFEST
|
|
84
|
+
|
|
85
|
+
# PyInstaller
|
|
86
|
+
# Usually these files are written by a python script from a template
|
|
87
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
88
|
+
*.manifest
|
|
89
|
+
*.spec
|
|
90
|
+
|
|
91
|
+
# Installer logs
|
|
92
|
+
pip-log.txt
|
|
93
|
+
pip-delete-this-directory.txt
|
|
94
|
+
|
|
95
|
+
# Unit test / coverage reports
|
|
96
|
+
htmlcov/
|
|
97
|
+
.tox/
|
|
98
|
+
.nox/
|
|
99
|
+
.coverage
|
|
100
|
+
.coverage.*
|
|
101
|
+
.cache
|
|
102
|
+
nosetests.xml
|
|
103
|
+
coverage.xml
|
|
104
|
+
*.cover
|
|
105
|
+
*.py,cover
|
|
106
|
+
.hypothesis/
|
|
107
|
+
.pytest_cache/
|
|
108
|
+
cover/
|
|
109
|
+
|
|
110
|
+
# Translations
|
|
111
|
+
*.mo
|
|
112
|
+
*.pot
|
|
113
|
+
|
|
114
|
+
# Django stuff:
|
|
115
|
+
*.log
|
|
116
|
+
local_settings.py
|
|
117
|
+
db.sqlite3
|
|
118
|
+
db.sqlite3-journal
|
|
119
|
+
|
|
120
|
+
# Flask stuff:
|
|
121
|
+
instance/
|
|
122
|
+
.webassets-cache
|
|
123
|
+
|
|
124
|
+
# Scrapy stuff:
|
|
125
|
+
.scrapy
|
|
126
|
+
|
|
127
|
+
# Sphinx documentation
|
|
128
|
+
docs/_build/
|
|
129
|
+
|
|
130
|
+
# PyBuilder
|
|
131
|
+
.pybuilder/
|
|
132
|
+
target/
|
|
133
|
+
|
|
134
|
+
# Jupyter Notebook
|
|
135
|
+
.ipynb_checkpoints
|
|
136
|
+
|
|
137
|
+
# IPython
|
|
138
|
+
profile_default/
|
|
139
|
+
ipython_config.py
|
|
140
|
+
|
|
141
|
+
# pyenv
|
|
142
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
143
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
144
|
+
# .python-version
|
|
145
|
+
|
|
146
|
+
# pipenv
|
|
147
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
148
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
149
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
150
|
+
# install all needed dependencies.
|
|
151
|
+
#Pipfile.lock
|
|
152
|
+
|
|
153
|
+
# poetry
|
|
154
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
155
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
156
|
+
# commonly ignored for libraries.
|
|
157
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
158
|
+
#poetry.lock
|
|
159
|
+
|
|
160
|
+
# pdm
|
|
161
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
162
|
+
#pdm.lock
|
|
163
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
164
|
+
# in version control.
|
|
165
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
166
|
+
.pdm.toml
|
|
167
|
+
|
|
168
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
169
|
+
__pypackages__/
|
|
170
|
+
|
|
171
|
+
# Celery stuff
|
|
172
|
+
celerybeat-schedule
|
|
173
|
+
celerybeat.pid
|
|
174
|
+
|
|
175
|
+
# SageMath parsed files
|
|
176
|
+
*.sage.py
|
|
177
|
+
|
|
178
|
+
# Environments
|
|
179
|
+
.venv
|
|
180
|
+
env/
|
|
181
|
+
venv/
|
|
182
|
+
ENV/
|
|
183
|
+
env.bak/
|
|
184
|
+
venv.bak/
|
|
185
|
+
|
|
186
|
+
# Spyder project settings
|
|
187
|
+
.spyderproject
|
|
188
|
+
.spyproject
|
|
189
|
+
|
|
190
|
+
# Rope project settings
|
|
191
|
+
.ropeproject
|
|
192
|
+
|
|
193
|
+
# mkdocs documentation
|
|
194
|
+
/site
|
|
195
|
+
|
|
196
|
+
# mypy
|
|
197
|
+
.mypy_cache/
|
|
198
|
+
.dmypy.json
|
|
199
|
+
dmypy.json
|
|
200
|
+
|
|
201
|
+
# Pyre type checker
|
|
202
|
+
.pyre/
|
|
203
|
+
|
|
204
|
+
# pytype static type analyzer
|
|
205
|
+
.pytype/
|
|
206
|
+
|
|
207
|
+
# Cython debug symbols
|
|
208
|
+
cython_debug/
|
|
209
|
+
|
|
210
|
+
# PyCharm
|
|
211
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
212
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
213
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
214
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
215
|
+
#.idea/
|
|
216
|
+
|
|
217
|
+
### Python Patch ###
|
|
218
|
+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
|
219
|
+
poetry.toml
|
|
220
|
+
|
|
221
|
+
# ruff
|
|
222
|
+
.ruff_cache/
|
|
223
|
+
|
|
224
|
+
# LSP config files
|
|
225
|
+
pyrightconfig.json
|
|
226
|
+
|
|
227
|
+
### VisualStudioCode ###
|
|
228
|
+
.vscode/*
|
|
229
|
+
!.vscode/settings.json
|
|
230
|
+
!.vscode/tasks.json
|
|
231
|
+
!.vscode/launch.json
|
|
232
|
+
!.vscode/extensions.json
|
|
233
|
+
!.vscode/*.code-snippets
|
|
234
|
+
|
|
235
|
+
# Local History for Visual Studio Code
|
|
236
|
+
.history/
|
|
237
|
+
|
|
238
|
+
# Built Visual Studio Code Extensions
|
|
239
|
+
*.vsix
|
|
240
|
+
|
|
241
|
+
### VisualStudioCode Patch ###
|
|
242
|
+
# Ignore all local history of files
|
|
243
|
+
.history
|
|
244
|
+
.ionide
|
|
245
|
+
|
|
246
|
+
### Windows ###
|
|
247
|
+
# Windows thumbnail cache files
|
|
248
|
+
Thumbs.db
|
|
249
|
+
Thumbs.db:encryptable
|
|
250
|
+
ehthumbs.db
|
|
251
|
+
ehthumbs_vista.db
|
|
252
|
+
|
|
253
|
+
# Dump file
|
|
254
|
+
*.stackdump
|
|
255
|
+
|
|
256
|
+
# Folder config file
|
|
257
|
+
[Dd]esktop.ini
|
|
258
|
+
|
|
259
|
+
# Recycle Bin used on file shares
|
|
260
|
+
$RECYCLE.BIN/
|
|
261
|
+
|
|
262
|
+
# Windows Installer files
|
|
263
|
+
*.cab
|
|
264
|
+
*.msi
|
|
265
|
+
*.msix
|
|
266
|
+
*.msm
|
|
267
|
+
*.msp
|
|
268
|
+
|
|
269
|
+
# Windows shortcuts
|
|
270
|
+
*.lnk
|
|
271
|
+
|
|
272
|
+
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,dotenv,linux,python,windows
|
|
273
|
+
|
|
274
|
+
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
|
|
275
|
+
|
|
276
|
+
data/*
|
|
277
|
+
results/*
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright (c) Guillaume Lombardo
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: extractforms
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A python project to turn scanned forms into a list of key-value pairs.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Guillaume-Lombardo/extractforms
|
|
6
|
+
Project-URL: Repository, https://github.com/Guillaume-Lombardo/extractforms
|
|
7
|
+
Project-URL: Issues, https://github.com/Guillaume-Lombardo/extractforms/issues
|
|
8
|
+
Author-email: Guillaume Lombardo <g1lom@later.day>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: automation,cli,package,python
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Python: >=3.13
|
|
18
|
+
Requires-Dist: certifi>=2026.1.4
|
|
19
|
+
Requires-Dist: httpx>=0.28.1
|
|
20
|
+
Requires-Dist: openai>=2.1.0
|
|
21
|
+
Requires-Dist: pydantic-settings>=2.13.0
|
|
22
|
+
Requires-Dist: pydantic>=2.12.0
|
|
23
|
+
Requires-Dist: pymupdf>=1.26.5
|
|
24
|
+
Requires-Dist: python-dotenv>=1.1.0
|
|
25
|
+
Requires-Dist: structlog>=25.5.0
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# ExtractForms
|
|
29
|
+
|
|
30
|
+
`extractforms` is a Python package and CLI to extract key/value fields from PDF forms.
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
uv sync --group dev
|
|
36
|
+
uv run pre-commit install
|
|
37
|
+
uv run ruff format .
|
|
38
|
+
uv run ruff check .
|
|
39
|
+
uv run ty check src tests
|
|
40
|
+
uv run pytest
|
|
41
|
+
uv run pre-commit run --all-files
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## CLI
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
extractforms extract --input form.pdf --output results/result.json --passes 2
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Supported options include:
|
|
51
|
+
- `--no-cache`
|
|
52
|
+
- `--dpi`, `--image-format`, `--page-start`, `--page-end`, `--max-pages`
|
|
53
|
+
- `--chunk-pages`
|
|
54
|
+
- `--extra-instructions`
|
|
55
|
+
- `--schema-id`, `--schema-path`, `--match-schema`
|
|
56
|
+
|
|
57
|
+
## Environment
|
|
58
|
+
|
|
59
|
+
Copy `.env.template` to `.env` and configure:
|
|
60
|
+
- logging (`LOG_LEVEL`, `LOG_JSON`, `LOG_FILE`)
|
|
61
|
+
- enterprise network/TLS (`HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, `NO_PROXY`, `CERT_PATH`)
|
|
62
|
+
- model endpoint (`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`)
|
|
63
|
+
|
|
64
|
+
## Project Layout
|
|
65
|
+
|
|
66
|
+
- `src/extractforms`: package code
|
|
67
|
+
- `tests/unit`: fast default tests
|
|
68
|
+
- `tests/integration`: component-level tests
|
|
69
|
+
- `tests/end2end`: user-facing behavior tests
|
|
70
|
+
- `skills`: AI helper skills for coding workflows
|
|
71
|
+
|
|
72
|
+
## Release
|
|
73
|
+
|
|
74
|
+
1. Bump `version` in `pyproject.toml`.
|
|
75
|
+
2. Create and push a git tag: `vX.Y.Z`.
|
|
76
|
+
3. GitHub Action publishes to PyPI.
|
|
77
|
+
|
|
78
|
+
For manual validation, use workflow dispatch with `publish_target=testpypi`.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# ExtractForms
|
|
2
|
+
|
|
3
|
+
`extractforms` is a Python package and CLI to extract key/value fields from PDF forms.
|
|
4
|
+
|
|
5
|
+
## Quickstart
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv sync --group dev
|
|
9
|
+
uv run pre-commit install
|
|
10
|
+
uv run ruff format .
|
|
11
|
+
uv run ruff check .
|
|
12
|
+
uv run ty check src tests
|
|
13
|
+
uv run pytest
|
|
14
|
+
uv run pre-commit run --all-files
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## CLI
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
extractforms extract --input form.pdf --output results/result.json --passes 2
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Supported options include:
|
|
24
|
+
- `--no-cache`
|
|
25
|
+
- `--dpi`, `--image-format`, `--page-start`, `--page-end`, `--max-pages`
|
|
26
|
+
- `--chunk-pages`
|
|
27
|
+
- `--extra-instructions`
|
|
28
|
+
- `--schema-id`, `--schema-path`, `--match-schema`
|
|
29
|
+
|
|
30
|
+
## Environment
|
|
31
|
+
|
|
32
|
+
Copy `.env.template` to `.env` and configure:
|
|
33
|
+
- logging (`LOG_LEVEL`, `LOG_JSON`, `LOG_FILE`)
|
|
34
|
+
- enterprise network/TLS (`HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, `NO_PROXY`, `CERT_PATH`)
|
|
35
|
+
- model endpoint (`OPENAI_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_MODEL`)
|
|
36
|
+
|
|
37
|
+
## Project Layout
|
|
38
|
+
|
|
39
|
+
- `src/extractforms`: package code
|
|
40
|
+
- `tests/unit`: fast default tests
|
|
41
|
+
- `tests/integration`: component-level tests
|
|
42
|
+
- `tests/end2end`: user-facing behavior tests
|
|
43
|
+
- `skills`: AI helper skills for coding workflows
|
|
44
|
+
|
|
45
|
+
## Release
|
|
46
|
+
|
|
47
|
+
1. Bump `version` in `pyproject.toml`.
|
|
48
|
+
2. Create and push a git tag: `vX.Y.Z`.
|
|
49
|
+
3. GitHub Action publishes to PyPI.
|
|
50
|
+
|
|
51
|
+
For manual validation, use workflow dispatch with `publish_target=testpypi`.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Architecture Decision Records (ADR)
|
|
2
|
+
|
|
3
|
+
## Why ADRs
|
|
4
|
+
|
|
5
|
+
- Track architectural decisions.
|
|
6
|
+
- Preserve context and rejected alternatives.
|
|
7
|
+
- Improve onboarding and future reviews.
|
|
8
|
+
|
|
9
|
+
## Naming convention
|
|
10
|
+
|
|
11
|
+
- `NNNN-short-title.md`
|
|
12
|
+
- Example: `0001-http-client-choice.md`
|
|
13
|
+
|
|
14
|
+
## Process
|
|
15
|
+
|
|
16
|
+
- Create an ADR for any long-term architectural decision.
|
|
17
|
+
- Link the ADR in the corresponding PR.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Engineering Docs
|
|
2
|
+
|
|
3
|
+
- [Definition of Done](./DEFINITION_OF_DONE.md)
|
|
4
|
+
- [Review Runbook](./REVIEW_RUNBOOK.md)
|
|
5
|
+
- [ADR](../adr/README.md)
|
|
6
|
+
- [Review Guide](../../.github/review/REVIEW_GUIDE.md)
|
|
7
|
+
- [Repo Coherence Playbook](../../.github/review/REPO_COHERENCE_PLAYBOOK.md)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "extractforms"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A python project to turn scanned forms into a list of key-value pairs."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.13"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Guillaume Lombardo", email = "g1lom@later.day" },
|
|
9
|
+
]
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
keywords = ["automation", "cli", "package", "python"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"certifi>=2026.1.4",
|
|
21
|
+
"httpx>=0.28.1",
|
|
22
|
+
"openai>=2.1.0",
|
|
23
|
+
"pydantic-settings>=2.13.0",
|
|
24
|
+
"pydantic>=2.12.0",
|
|
25
|
+
"pymupdf>=1.26.5",
|
|
26
|
+
"python-dotenv>=1.1.0",
|
|
27
|
+
"structlog>=25.5.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[dependency-groups]
|
|
31
|
+
dev = [
|
|
32
|
+
"build>=1.3.0",
|
|
33
|
+
"detect-secrets>=1.5.0",
|
|
34
|
+
"pre-commit>=4.5.1",
|
|
35
|
+
"pytest-cov>=7.0.0",
|
|
36
|
+
"pytest-mock>=3.15.1",
|
|
37
|
+
"pytest>=9.0.2",
|
|
38
|
+
"ruff>=0.15.0",
|
|
39
|
+
"twine>=6.2.0",
|
|
40
|
+
"ty>=0.0.15",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/Guillaume-Lombardo/extractforms"
|
|
45
|
+
Repository = "https://github.com/Guillaume-Lombardo/extractforms"
|
|
46
|
+
Issues = "https://github.com/Guillaume-Lombardo/extractforms/issues"
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
extractforms = "extractforms.cli:main"
|
|
50
|
+
|
|
51
|
+
[build-system]
|
|
52
|
+
requires = ["hatchling>=1.27.0"]
|
|
53
|
+
build-backend = "hatchling.build"
|
|
54
|
+
|
|
55
|
+
[tool.hatch.build.targets.wheel]
|
|
56
|
+
packages = ["src/extractforms"]
|
|
57
|
+
|
|
58
|
+
[tool.hatch.build.targets.sdist]
|
|
59
|
+
include = [
|
|
60
|
+
"LICENSE",
|
|
61
|
+
"README.md",
|
|
62
|
+
"pyproject.toml",
|
|
63
|
+
"src/extractforms",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
[tool.pytest.ini_options]
|
|
67
|
+
minversion = "9.0"
|
|
68
|
+
testpaths = ["tests"]
|
|
69
|
+
addopts = "-ra -q -m unit --strict-markers --cov=src --cov-report=term-missing:skip-covered --cov-report=html"
|
|
70
|
+
pythonpath = ["src"]
|
|
71
|
+
markers = [
|
|
72
|
+
"end2end: mark a test as an end-to-end test.",
|
|
73
|
+
"integration: mark a test as an integration test.",
|
|
74
|
+
"unit: mark a test as a unit test.",
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
[tool.coverage.run]
|
|
78
|
+
branch = true
|
|
79
|
+
source = ["src/extractforms"]
|
|
80
|
+
omit = ["*/tests/*"]
|
|
81
|
+
|
|
82
|
+
[tool.coverage.report]
|
|
83
|
+
show_missing = true
|
|
84
|
+
skip_covered = true
|
|
85
|
+
precision = 2
|
|
86
|
+
fail_under = 80
|
|
87
|
+
|
|
88
|
+
[tool.coverage.html]
|
|
89
|
+
directory = "htmlcov"
|
|
90
|
+
title = "ExtractForms coverage"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""ExtractForms package."""
|
|
2
|
+
|
|
3
|
+
from extractforms._bootstrap import logger
|
|
4
|
+
from extractforms.async_runner import run_async
|
|
5
|
+
from extractforms.exceptions import (
|
|
6
|
+
AsyncExecutionError,
|
|
7
|
+
BackendError,
|
|
8
|
+
DependencyError,
|
|
9
|
+
ExtractionError,
|
|
10
|
+
PackageError,
|
|
11
|
+
SettingsError,
|
|
12
|
+
)
|
|
13
|
+
from extractforms.logging import configure_logging, get_logger
|
|
14
|
+
from extractforms.settings import Settings, get_settings
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"AsyncExecutionError",
|
|
20
|
+
"BackendError",
|
|
21
|
+
"DependencyError",
|
|
22
|
+
"ExtractionError",
|
|
23
|
+
"PackageError",
|
|
24
|
+
"Settings",
|
|
25
|
+
"SettingsError",
|
|
26
|
+
"__version__",
|
|
27
|
+
"configure_logging",
|
|
28
|
+
"get_logger",
|
|
29
|
+
"get_settings",
|
|
30
|
+
"logger",
|
|
31
|
+
"run_async",
|
|
32
|
+
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Helpers to run async operations from sync or async contexts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import threading
|
|
7
|
+
from queue import Queue
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
from extractforms.exceptions import AsyncExecutionError
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Coroutine
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _run_in_background_thread[T](coro: Coroutine[Any, Any, T]) -> T:
|
|
17
|
+
"""Run a coroutine in a dedicated thread with its own event loop.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
coro (Coroutine[Any, Any, T]): The coroutine to run.
|
|
21
|
+
|
|
22
|
+
Raises:
|
|
23
|
+
AsyncExecutionError: If the coroutine raises an exception.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
T: The result of the coroutine.
|
|
27
|
+
"""
|
|
28
|
+
output: Queue[T | BaseException] = Queue(maxsize=1)
|
|
29
|
+
|
|
30
|
+
def _runner() -> None:
|
|
31
|
+
try:
|
|
32
|
+
output.put(asyncio.run(coro))
|
|
33
|
+
except BaseException as exc:
|
|
34
|
+
output.put(exc)
|
|
35
|
+
|
|
36
|
+
thread = threading.Thread(target=_runner, daemon=True)
|
|
37
|
+
thread.start()
|
|
38
|
+
thread.join()
|
|
39
|
+
|
|
40
|
+
result = output.get()
|
|
41
|
+
if isinstance(result, BaseException):
|
|
42
|
+
raise AsyncExecutionError(result=result) from result
|
|
43
|
+
return result
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_async[T](coro: Coroutine[Any, Any, T]) -> T:
|
|
47
|
+
"""Run an async coroutine from both sync and async contexts.
|
|
48
|
+
|
|
49
|
+
If called from a sync context, the coroutine will be run in a dedicated thread
|
|
50
|
+
with its own event loop. If called from an async context, the coroutine will
|
|
51
|
+
be awaited directly.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
coro (Coroutine[Any, Any, T]): The coroutine to run.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
T: The result of the coroutine.
|
|
58
|
+
"""
|
|
59
|
+
try:
|
|
60
|
+
asyncio.get_running_loop()
|
|
61
|
+
except RuntimeError:
|
|
62
|
+
return asyncio.run(coro)
|
|
63
|
+
|
|
64
|
+
return _run_in_background_thread(coro)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Extraction backends."""
|
|
2
|
+
|
|
3
|
+
from extractforms.backends.multimodal_openai import MultimodalLLMBackend
|
|
4
|
+
from extractforms.backends.ocr_document_intelligence import OCRBackend
|
|
5
|
+
from extractforms.typing.protocol import ExtractorBackend, PageSource
|
|
6
|
+
|
|
7
|
+
__all__ = ["ExtractorBackend", "MultimodalLLMBackend", "OCRBackend", "PageSource"]
|