extractforms 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
"""ExtractForms package."""

# Importing _bootstrap first matters: it validates runtime dependencies and
# creates the shared package logger as import-time side effects.
from extractforms._bootstrap import logger
from extractforms.async_runner import run_async
from extractforms.exceptions import (
    AsyncExecutionError,
    BackendError,
    DependencyError,
    ExtractionError,
    PackageError,
    SettingsError,
)
from extractforms.logging import configure_logging, get_logger
from extractforms.settings import Settings, get_settings

# Package version; surfaced by the CLI's `--version` flag.
__version__ = "0.1.0"

# Public API surface, kept alphabetically sorted.
__all__ = [
    "AsyncExecutionError",
    "BackendError",
    "DependencyError",
    "ExtractionError",
    "PackageError",
    "Settings",
    "SettingsError",
    "__version__",
    "configure_logging",
    "get_logger",
    "get_settings",
    "logger",
    "run_async",
]
@@ -0,0 +1,10 @@
1
"""Package bootstrap helpers."""

from extractforms.dependencies import ensure_package_dependencies
from extractforms.logging import get_logger

# Import-time side effect: fail fast with DependencyError when required
# runtime dependencies (httpx, openai, certifi) are missing, before any
# other package module is imported.
ensure_package_dependencies()

# Shared package-level logger, re-exported from the package root.
logger = get_logger("extractforms")

__all__ = ["logger"]
@@ -0,0 +1,64 @@
1
+ """Helpers to run async operations from sync or async contexts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import threading
7
+ from queue import Queue
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from extractforms.exceptions import AsyncExecutionError
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Coroutine
14
+
15
+
16
+ def _run_in_background_thread[T](coro: Coroutine[Any, Any, T]) -> T:
17
+ """Run a coroutine in a dedicated thread with its own event loop.
18
+
19
+ Args:
20
+ coro (Coroutine[Any, Any, T]): The coroutine to run.
21
+
22
+ Raises:
23
+ AsyncExecutionError: If the coroutine raises an exception.
24
+
25
+ Returns:
26
+ T: The result of the coroutine.
27
+ """
28
+ output: Queue[T | BaseException] = Queue(maxsize=1)
29
+
30
+ def _runner() -> None:
31
+ try:
32
+ output.put(asyncio.run(coro))
33
+ except BaseException as exc:
34
+ output.put(exc)
35
+
36
+ thread = threading.Thread(target=_runner, daemon=True)
37
+ thread.start()
38
+ thread.join()
39
+
40
+ result = output.get()
41
+ if isinstance(result, BaseException):
42
+ raise AsyncExecutionError(result=result) from result
43
+ return result
44
+
45
+
46
+ def run_async[T](coro: Coroutine[Any, Any, T]) -> T:
47
+ """Run an async coroutine from both sync and async contexts.
48
+
49
+ If called from a sync context, the coroutine will be run in a dedicated thread
50
+ with its own event loop. If called from an async context, the coroutine will
51
+ be awaited directly.
52
+
53
+ Args:
54
+ coro (Coroutine[Any, Any, T]): The coroutine to run.
55
+
56
+ Returns:
57
+ T: The result of the coroutine.
58
+ """
59
+ try:
60
+ asyncio.get_running_loop()
61
+ except RuntimeError:
62
+ return asyncio.run(coro)
63
+
64
+ return _run_in_background_thread(coro)
@@ -0,0 +1,7 @@
1
"""Extraction backends."""

from extractforms.backends.multimodal_openai import MultimodalLLMBackend
from extractforms.backends.ocr_document_intelligence import OCRBackend
from extractforms.typing.protocol import ExtractorBackend, PageSource

# Concrete backends plus the structural protocols they implement.
__all__ = ["ExtractorBackend", "MultimodalLLMBackend", "OCRBackend", "PageSource"]
@@ -0,0 +1,254 @@
1
+ """OpenAI-compatible multimodal backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import TYPE_CHECKING, Any, cast
7
+
8
+ from openai import APIConnectionError, APIStatusError, APITimeoutError, AsyncOpenAI
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from extractforms import logger
12
+ from extractforms.async_runner import run_async
13
+ from extractforms.exceptions import BackendError
14
+ from extractforms.prompts import (
15
+ build_schema_inference_prompt,
16
+ build_values_extraction_prompt,
17
+ schema_response_format,
18
+ )
19
+ from extractforms.typing.models import FieldValue, PricingCall, RenderedPage, SchemaField, SchemaSpec
20
+
21
+ if TYPE_CHECKING:
22
+ from extractforms.settings import Settings
23
+
24
+
25
class _SchemaResponse(BaseModel):
    """Structured-output payload shape for schema-inference responses."""

    # Reject unexpected keys so malformed LLM output fails validation loudly.
    model_config = ConfigDict(extra="forbid")

    name: str
    fields: list[SchemaField]
30
+
31
+
32
class _ValuesResponse(BaseModel):
    """Structured-output payload shape for value-extraction responses."""

    # Reject unexpected keys so malformed LLM output fails validation loudly.
    model_config = ConfigDict(extra="forbid")

    fields: list[FieldValue]
36
+
37
+
38
class MultimodalLLMBackend:
    """Multimodal extraction backend against OpenAI-compatible endpoints."""

    def __init__(self, settings: Settings) -> None:
        """Initialize backend.

        Args:
            settings (Settings): Runtime settings.
        """
        self._settings = settings

    async def _apost_chat_completions(
        self,
        payload: dict[str, Any],
    ) -> tuple[dict[str, Any], PricingCall | None]:
        """Send one async completion request.

        Args:
            payload (dict[str, Any]): Request payload.

        Raises:
            BackendError: If request fails or endpoint is misconfigured.

        Returns:
            tuple[dict[str, Any], PricingCall | None]: Parsed payload and optional pricing call.
        """
        if not self._settings.openai_base_url:
            raise BackendError(message="OPENAI_BASE_URL is required for multimodal backend")
        if not self._settings.openai_api_key:
            raise BackendError(message="OPENAI_API_KEY is required for multimodal backend")

        client = self._settings.select_async_httpx_client(self._settings.openai_base_url)
        if client is None:
            raise BackendError(message="httpx clients are not initialized in settings")
        http_client = cast("Any", client)
        openai_client = AsyncOpenAI(
            api_key=self._settings.openai_api_key,
            base_url=self._settings.openai_base_url,
            http_client=http_client,
        )

        try:
            completion = await openai_client.chat.completions.create(**payload)
            data = completion.model_dump(mode="json")
        except APIStatusError as exc:
            status_code = getattr(exc, "status_code", None)
            raise BackendError(
                message=f"Chat completion request failed with status {status_code}",
            ) from exc
        except APITimeoutError as exc:
            raise BackendError(message="Chat completion request timed out") from exc
        except APIConnectionError as exc:
            raise BackendError(message=f"Chat completion request failed: {exc}") from exc
        except Exception as exc:
            raise BackendError(
                message=f"Chat completion request failed: {exc}",
            ) from exc

        # BUG FIX: model_dump serializes an absent usage object as
        # `"usage": None`, so `data.get("usage", {})` would return None and
        # the `.get(...)` calls below would raise AttributeError. `or {}`
        # normalizes both the missing-key and explicit-None cases.
        usage = data.get("usage") or {}
        pricing = PricingCall(
            provider="openai-compatible",
            model=self._settings.openai_model,
            input_tokens=usage.get("prompt_tokens"),
            output_tokens=usage.get("completion_tokens"),
            total_cost_usd=None,
        )

        return data, pricing

    def _post_chat_completions(self, payload: dict[str, Any]) -> tuple[dict[str, Any], PricingCall | None]:
        """Send one completion request from sync call sites.

        Args:
            payload (dict[str, Any]): Request payload.

        Returns:
            tuple[dict[str, Any], PricingCall | None]: Parsed payload and optional pricing call.
        """
        return run_async(self._apost_chat_completions(payload))

    @staticmethod
    def _image_content(page: RenderedPage) -> dict[str, Any]:
        """Build image content chunk.

        Args:
            page (RenderedPage): Rendered page.

        Returns:
            dict[str, Any]: OpenAI content block.
        """
        return {
            "type": "image_url",
            "image_url": {"url": f"data:{page.mime_type};base64,{page.data_base64}"},
        }

    def _build_chat_payload(
        self,
        prompt: str,
        pages: list[RenderedPage],
        response_format: Any,
    ) -> dict[str, Any]:
        """Build a chat-completions payload from prompt text plus page images.

        Shared by schema inference and value extraction, which previously
        duplicated this construction.

        Args:
            prompt (str): Instruction text placed before the images.
            pages (list[RenderedPage]): Rendered pages to attach as images.
            response_format (Any): Structured-output format spec with a
                pydantic-style ``model_dump``.

        Returns:
            dict[str, Any]: Request payload for the chat completions API.
        """
        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        content.extend(self._image_content(page) for page in pages)
        return {
            "model": self._settings.openai_model,
            "messages": [{"role": "user", "content": content}],
            "response_format": {
                "type": "json_schema",
                "json_schema": response_format.model_dump(mode="json", by_alias=True),
            },
        }

    async def ainfer_schema(self, pages: list[RenderedPage]) -> tuple[SchemaSpec, PricingCall | None]:
        """Infer schema from rendered pages.

        Args:
            pages (list[RenderedPage]): Rendered pages.

        Raises:
            BackendError: If page list is empty.

        Returns:
            tuple[SchemaSpec, PricingCall | None]: Inferred schema and call pricing.
        """
        if not pages:
            raise BackendError(message="Cannot infer schema from empty page list")

        prompt = build_schema_inference_prompt()
        response_format = schema_response_format("schema_response", _SchemaResponse.model_json_schema())
        payload = self._build_chat_payload(prompt, pages, response_format)

        data, pricing = await self._apost_chat_completions(payload)
        content_text = data["choices"][0]["message"]["content"]
        parsed = _SchemaResponse.model_validate(json.loads(content_text))

        # id/fingerprint are filled in by the caller; only name and fields
        # come from the model response.
        schema = SchemaSpec(
            id="",
            name=parsed.name,
            fingerprint="",
            fields=parsed.fields,
        )
        logger.info("Schema inferred", extra={"fields": len(schema.fields)})
        return schema, pricing

    def infer_schema(self, pages: list[RenderedPage]) -> tuple[SchemaSpec, PricingCall | None]:
        """Infer schema from rendered pages (sync wrapper).

        Args:
            pages (list[RenderedPage]): Rendered pages.

        Returns:
            tuple[SchemaSpec, PricingCall | None]: Inferred schema and call pricing.
        """
        return run_async(self.ainfer_schema(pages))

    async def aextract_values(
        self,
        pages: list[RenderedPage],
        keys: list[str],
        *,
        extra_instructions: str | None = None,
    ) -> tuple[list[FieldValue], PricingCall | None]:
        """Extract values for specific keys.

        Args:
            pages (list[RenderedPage]): Rendered pages.
            keys (list[str]): Keys to extract.
            extra_instructions (str | None): Optional prompt augmentation.

        Raises:
            BackendError: If page list is empty.

        Returns:
            tuple[list[FieldValue], PricingCall | None]: Extracted values and pricing.
        """
        if not pages:
            raise BackendError(message="Cannot extract values from empty page list")

        # Build a throwaway runtime schema so the extraction prompt can be
        # generated from the requested keys alone.
        schema = SchemaSpec(
            id="",
            name="runtime",
            fingerprint="",
            fields=[SchemaField(key=k, label=k) for k in keys],
        )
        prompt = build_values_extraction_prompt(schema, extra_instructions=extra_instructions)
        response_format = schema_response_format("values_response", _ValuesResponse.model_json_schema())
        payload = self._build_chat_payload(prompt, pages, response_format)

        data, pricing = await self._apost_chat_completions(payload)
        content_text = data["choices"][0]["message"]["content"]
        parsed = _ValuesResponse.model_validate(json.loads(content_text))
        logger.info("Values extracted", extra={"fields": len(parsed.fields)})
        return parsed.fields, pricing

    def extract_values(
        self,
        pages: list[RenderedPage],
        keys: list[str],
        *,
        extra_instructions: str | None = None,
    ) -> tuple[list[FieldValue], PricingCall | None]:
        """Extract values for specific keys (sync wrapper).

        Args:
            pages (list[RenderedPage]): Rendered pages.
            keys (list[str]): Keys to extract.
            extra_instructions (str | None): Optional prompt augmentation.

        Returns:
            tuple[list[FieldValue], PricingCall | None]: Extracted values and pricing.
        """
        return run_async(self.aextract_values(pages, keys, extra_instructions=extra_instructions))
@@ -0,0 +1,48 @@
1
+ """OCR backend stub for future Document Intelligence integration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from extractforms.exceptions import BackendError
8
+
9
+ if TYPE_CHECKING:
10
+ from extractforms.typing.models import FieldValue, PricingCall, RenderedPage, SchemaSpec
11
+
12
+
13
class OCRBackend:
    """Placeholder OCR backend."""

    @staticmethod
    def infer_schema(
        pages: list[RenderedPage],
    ) -> tuple[SchemaSpec, PricingCall | None]:
        """Infer schema with OCR backend.

        Args:
            pages (list[RenderedPage]): Rendered pages.

        Raises:
            BackendError: Always in MVP stub.

        """
        # Argument accepted only for interface parity with other backends.
        del pages
        raise BackendError(message="OCR backend is not implemented yet")

    @staticmethod
    def extract_values(
        pages: list[RenderedPage],
        keys: list[str],
    ) -> tuple[list[FieldValue], PricingCall | None]:
        """Extract values with OCR backend.

        Args:
            pages (list[RenderedPage]): Rendered pages.
            keys (list[str]): Keys to extract.

        Raises:
            BackendError: Always in MVP stub.

        """
        # Arguments accepted only for interface parity with other backends.
        del pages, keys
        raise BackendError(message="OCR backend is not implemented yet")
extractforms/cli.py ADDED
@@ -0,0 +1,150 @@
1
+ """CLI entry point for ExtractForms."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from extractforms import __version__, logger
9
+ from extractforms.dependencies import ensure_cli_dependencies_for_extract
10
+ from extractforms.exceptions import PackageError
11
+ from extractforms.extractor import persist_result, run_extract
12
+ from extractforms.logging import configure_logging
13
+ from extractforms.settings import get_settings
14
+ from extractforms.typing.enums import PassMode
15
+ from extractforms.typing.models import ExtractRequest
16
+
17
+
18
def _pass_mode_from_cli(value: str) -> PassMode:
    """Convert `--passes` CLI value into pass mode.

    Args:
        value (str): CLI value (`1` or `2`).

    Raises:
        argparse.ArgumentTypeError: If value is not supported.

    Returns:
        PassMode: Selected pass mode.
    """
    if value == "1":
        return PassMode.ONE_PASS
    if value == "2":
        return PassMode.TWO_PASS
    raise argparse.ArgumentTypeError("--passes must be one of: 1, 2")  # noqa: TRY003
37
+
38
+
39
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    Returns:
        argparse.ArgumentParser: The configured argument parser.
    """
    root = argparse.ArgumentParser(prog="extractforms")
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    commands = root.add_subparsers(dest="command")

    extract = commands.add_parser("extract", help="Extract key/value fields from a PDF form")

    # Input/output locations and pass selection.
    extract.add_argument("--input", required=True, type=Path, dest="input_path")
    extract.add_argument(
        "--output",
        type=Path,
        default=Path("results/result.json"),
        dest="output_path",
    )
    extract.add_argument("--passes", default="2", type=_pass_mode_from_cli, dest="mode")
    extract.add_argument("--no-cache", action="store_true", dest="no_cache")

    # Page rendering and page-range selection.
    extract.add_argument("--dpi", type=int, default=200)
    extract.add_argument("--image-format", default="png", dest="image_format")
    extract.add_argument("--page-start", type=int, default=None, dest="page_start")
    extract.add_argument("--page-end", type=int, default=None, dest="page_end")
    extract.add_argument("--max-pages", type=int, default=None, dest="max_pages")

    # Prompting controls.
    extract.add_argument("--chunk-pages", type=int, default=1, dest="chunk_pages")
    extract.add_argument("--extra-instructions", default=None, dest="extra_instructions")

    # Schema reuse.
    extract.add_argument("--schema-id", default=None, dest="schema_id")
    extract.add_argument("--schema-path", type=Path, default=None, dest="schema_path")
    extract.add_argument("--match-schema", action="store_true", dest="match_schema")

    return root
75
+
76
+
77
def _build_extract_request(args: argparse.Namespace) -> ExtractRequest:
    """Build extraction request from CLI arguments.

    Args:
        args (argparse.Namespace): Parsed CLI args.

    Returns:
        ExtractRequest: Request object.
    """
    # An explicit schema reference forces the single schema-driven pass,
    # overriding whatever `--passes` selected.
    uses_known_schema = bool(args.schema_id) or bool(args.schema_path)
    selected_mode = PassMode.ONE_SCHEMA_PASS if uses_known_schema else args.mode

    return ExtractRequest(
        input_path=args.input_path,
        output_path=args.output_path,
        mode=selected_mode,
        use_cache=not args.no_cache,
        dpi=args.dpi,
        image_format=args.image_format,
        page_start=args.page_start,
        page_end=args.page_end,
        max_pages=args.max_pages,
        chunk_pages=args.chunk_pages,
        schema_id=args.schema_id,
        schema_path=args.schema_path,
        match_schema=args.match_schema,
        extra_instructions=args.extra_instructions,
    )
106
+
107
+
108
def main() -> int:
    """Run the CLI.

    Returns:
        int: Exit code (0 for success, 130 on interrupt, 1 for error).
    """
    settings = get_settings()
    configure_logging(settings=settings)

    parser = build_parser()
    args = parser.parse_args()

    if args.command != "extract":
        parser.print_help()
        return 0

    try:
        # FIX: dependency validation and request construction used to run
        # before the try-block, so their failures escaped as raw tracebacks
        # and `close_httpx_clients()` in the finally-clause never ran.
        # Inside the try, they are logged and mapped to exit code 1 like any
        # other extraction failure.
        ensure_cli_dependencies_for_extract()
        request = _build_extract_request(args)
        result = run_extract(request, settings)
    except PackageError:
        logger.exception("Extraction failed")
        return 1
    except KeyboardInterrupt:
        logger.info("Extraction aborted by user")
        return 130
    except Exception:
        logger.exception("Unexpected error during extraction")
        return 1
    finally:
        # Always release pooled HTTP clients, on success and failure alike.
        settings.close_httpx_clients()

    output_path = request.output_path
    if output_path is None:
        # Mirror the parser default for programmatic callers that cleared it.
        output_path = Path("results/result.json")
    persist_result(result, output_path)
    logger.info("Extraction completed", extra={"output_path": str(output_path)})
    return 0
147
+
148
+
149
+ if __name__ == "__main__":
150
+ raise SystemExit(main())
@@ -0,0 +1,66 @@
1
+ """Runtime dependency checks for CLI commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib.util
6
+
7
+ from extractforms.exceptions import DependencyError
8
+
9
+
10
+ def _is_module_available(module_name: str) -> bool:
11
+ """Check whether a module can be imported.
12
+
13
+ Args:
14
+ module_name (str): Python module name.
15
+
16
+ Returns:
17
+ bool: True if import spec exists.
18
+ """
19
+ return importlib.util.find_spec(module_name) is not None
20
+
21
+
22
def _collect_missing_dependencies(modules_by_package: dict[str, str]) -> list[str]:
    """Collect missing packages for a module mapping.

    Args:
        modules_by_package (dict[str, str]): Mapping of package name -> import module.

    Returns:
        list[str]: Missing package names.
    """
    missing: list[str] = []
    for package_name, import_name in modules_by_package.items():
        if not _is_module_available(import_name):
            missing.append(package_name)
    return missing
32
+
33
+
34
def ensure_package_dependencies() -> None:
    """Validate required dependencies at package import time.

    Raises:
        DependencyError: If required runtime dependencies are missing.
    """
    # Package name -> importable module name for the core runtime stack.
    required = {
        "httpx": "httpx",
        "openai": "openai",
        "certifi": "certifi",
    }
    missing = _collect_missing_dependencies(required)
    if not missing:
        return
    raise DependencyError(missing_package=missing, message="package import")
49
+
50
+
51
def ensure_cli_dependencies_for_extract() -> None:
    """Validate required runtime dependencies for `extractforms extract`.

    Raises:
        DependencyError: If one or more required modules are missing.
    """
    # Package name -> importable module name; note pymupdf imports as `fitz`.
    required = {
        "pymupdf": "fitz",
        "httpx": "httpx",
        "openai": "openai",
    }
    missing = _collect_missing_dependencies(required)
    if not missing:
        return
    raise DependencyError(missing_package=missing, message="extract")