localparse 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- localparse/__init__.py +43 -0
- localparse/_errors.py +57 -0
- localparse/_models.py +99 -0
- localparse/client.py +352 -0
- localparse/py.typed +0 -0
- localparse-0.1.0.dist-info/METADATA +95 -0
- localparse-0.1.0.dist-info/RECORD +8 -0
- localparse-0.1.0.dist-info/WHEEL +4 -0
localparse/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""LocalParse — official Python client for the LocalParse document-parsing API.
|
|
2
|
+
|
|
3
|
+
LocalParse is a LlamaParse-compatible parser with a deterministic accuracy layer
|
|
4
|
+
(table-detection recovery + oracle-free financial-identity checks).
|
|
5
|
+
|
|
6
|
+
from localparse import LocalParse
|
|
7
|
+
|
|
8
|
+
client = LocalParse(api_key="lp-...") # or set LOCALPARSE_API_KEY
|
|
9
|
+
result = client.parse("invoice.pdf", result_type="markdown")
|
|
10
|
+
print(result.markdown)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from ._errors import (
|
|
16
|
+
APIError,
|
|
17
|
+
AuthenticationError,
|
|
18
|
+
JobFailedError,
|
|
19
|
+
JobTimeoutError,
|
|
20
|
+
LocalParseError,
|
|
21
|
+
NotFoundError,
|
|
22
|
+
QuotaExceededError,
|
|
23
|
+
RateLimitError,
|
|
24
|
+
)
|
|
25
|
+
from ._models import Job, ParseResult
|
|
26
|
+
from .client import LocalParse
|
|
27
|
+
|
|
28
|
+
# Version string of this client package.
__version__ = "0.1.0"

# Public names re-exported at package level, listed alphabetically.
__all__ = [
    "APIError",
    "AuthenticationError",
    "Job",
    "JobFailedError",
    "JobTimeoutError",
    "LocalParse",
    "LocalParseError",
    "NotFoundError",
    "ParseResult",
    "QuotaExceededError",
    "RateLimitError",
    "__version__",
]
|
localparse/_errors.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LocalParseError(Exception):
    """Root of the LocalParse exception hierarchy.

    Every error raised by the client derives from this class, so a single
    ``except LocalParseError`` handles all of them.
    """
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AuthenticationError(LocalParseError):
    """Raised for HTTP 401/403: the API key is missing, unknown, revoked, or expired."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RateLimitError(LocalParseError):
    """Raised for HTTP 429: the key's per-minute rate limit was exceeded.

    ``retry_after`` holds the server-suggested wait in seconds, or ``None``
    when the server did not provide one.
    """

    def __init__(self, message: str, *, retry_after: float | None = None) -> None:
        # Attribute assignment and base initialisation are independent of each other.
        self.retry_after = retry_after
        super().__init__(message)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class QuotaExceededError(LocalParseError):
    """Raised for HTTP 402: the key's monthly page quota is exhausted."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NotFoundError(LocalParseError):
    """Raised for HTTP 404: the job/case doesn't exist or isn't visible to this key."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class APIError(LocalParseError):
    """Fallback error for a failed request that has no dedicated subclass.

    ``status_code`` is the HTTP status of the response; ``body`` is the
    decoded response text, or ``None`` when no body is available.
    """

    def __init__(self, message: str, *, status_code: int, body: str | None = None) -> None:
        # Keep the raw HTTP details so callers can branch on them.
        self.body = body
        self.status_code = status_code
        super().__init__(message)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class JobFailedError(LocalParseError):
    """Raised when a parse job ends in a terminal non-success state (ERROR/CANCELED).

    ``job_id`` identifies the job and ``status`` is the terminal status it
    ended in.
    """

    def __init__(self, message: str, *, job_id: str, status: str) -> None:
        self.status = status
        self.job_id = job_id
        super().__init__(message)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class JobTimeoutError(LocalParseError):
    """Raised when a job has not reached a terminal state within ``max_wait`` seconds.

    ``job_id`` identifies the job that was still unfinished.
    """

    def __init__(self, message: str, *, job_id: str) -> None:
        self.job_id = job_id
        super().__init__(message)
|
localparse/_models.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
#: Job states that are final: once reached, the job does not transition again.
TERMINAL_STATUSES = frozenset(("SUCCESS", "ERROR", "CANCELED"))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Job:
    """A parse job handle returned by ``upload`` / ``get_job``.

    Mirrors ``GET /api/parsing/job/{id}``: ``{"id", "status", "error_message"}``
    where ``status`` is one of ``PENDING`` / ``SUCCESS`` / ``ERROR`` / ``CANCELED``.
    ``raw`` keeps the payload the job was built from.
    """

    id: str
    status: str
    error_message: str | None = None
    raw: dict[str, Any] = field(default_factory=dict)

    @property
    def is_done(self) -> bool:
        """Whether ``status`` is one of the terminal statuses."""
        return self.status in TERMINAL_STATUSES

    @property
    def is_success(self) -> bool:
        """Whether the job finished with status ``SUCCESS``."""
        return self.status == "SUCCESS"

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> Job:
        """Build a :class:`Job` from a decoded API payload.

        ``id`` and ``status`` are coerced to ``str``. A missing key or an
        explicit ``None`` (JSON ``null``) becomes ``""`` rather than the
        literal string ``"None"``. The payload itself is stored, not copied,
        as ``raw``.
        """
        job_id = payload.get("id")
        status = payload.get("status")
        return cls(
            # str(None) would yield "None", which looks like a real identifier.
            id="" if job_id is None else str(job_id),
            status="" if status is None else str(status),
            error_message=payload.get("error_message"),
            raw=payload,
        )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ParseResult:
    """Outcome of a successful parse, in the flavour that was requested.

    ``data`` is the raw response from ``GET /api/parsing/job/{id}/result/{type}``.
    For ``markdown``/``text`` the string is available as ``.markdown``/``.text``;
    for ``json``/``structured`` the structured payload is under ``.data``/``.pages``.
    Every flavour also exposes ``job_metadata`` and the LocalParse accuracy
    extensions ``identity_check`` / ``recovered_tables`` / ``persisted``.

    Each accessor returns a fallback when the value is missing or has an
    unexpected type.
    """

    job_id: str
    result_type: str
    data: dict[str, Any]

    @staticmethod
    def _pick(source: dict[str, Any], key: str, expected: type, fallback: Any) -> Any:
        """Return ``source[key]`` if it is an ``expected`` instance, else ``fallback``."""
        candidate = source.get(key)
        if isinstance(candidate, expected):
            return candidate
        return fallback

    @property
    def markdown(self) -> str | None:
        """The ``markdown`` string from ``data``, or ``None``."""
        return self._pick(self.data, "markdown", str, None)

    @property
    def text(self) -> str | None:
        """The ``text`` string from ``data``, or ``None``."""
        return self._pick(self.data, "text", str, None)

    @property
    def pages(self) -> list[dict[str, Any]]:
        """The ``pages`` list from ``data``, or a new empty list."""
        return self._pick(self.data, "pages", list, [])

    @property
    def job_metadata(self) -> dict[str, Any]:
        """The ``job_metadata`` mapping from ``data``, or a new empty dict."""
        return self._pick(self.data, "job_metadata", dict, {})

    @property
    def identity_check(self) -> dict[str, Any] | None:
        """Oracle-free financial-identity summary (Totals that don't reconcile)."""
        return self._pick(self.job_metadata, "identity_check", dict, None)

    @property
    def recovered_tables(self) -> int:
        """Tables recovered by the detection-completeness (``detect_repair``) pass."""
        return self._pick(self.job_metadata, "recovered_tables", int, 0)

    @property
    def persisted(self) -> dict[str, Any] | None:
        """Where the doc landed when a ``case_id`` was supplied (else ``None``)."""
        return self._pick(self.job_metadata, "persisted", dict, None)

    def __str__(self) -> str:
        """Prefer the markdown text, then the plain text, then ``str(data)``."""
        rendered = self.markdown
        if rendered is None:
            rendered = self.text
        if rendered is None:
            return str(self.data)
        return rendered
|
localparse/client.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import Callable, Iterable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import IO, Any, Union
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
|
|
12
|
+
from ._errors import (
|
|
13
|
+
APIError,
|
|
14
|
+
AuthenticationError,
|
|
15
|
+
JobFailedError,
|
|
16
|
+
JobTimeoutError,
|
|
17
|
+
NotFoundError,
|
|
18
|
+
QuotaExceededError,
|
|
19
|
+
RateLimitError,
|
|
20
|
+
)
|
|
21
|
+
from ._models import Job, ParseResult
|
|
22
|
+
|
|
23
|
+
__all__ = ["LocalParse"]
|
|
24
|
+
|
|
25
|
+
DEFAULT_BASE_URL = "https://api.localparse.com"

# Accepted upload inputs: a filesystem path, or a (filename, binary file-like)
# pair for bytes/streams that are already in memory.
FileInput = Union[
    str,
    "os.PathLike[str]",
    "tuple[str, IO[bytes]]",
]

#: Result flavours that can be requested from ``/result/{type}``.
_RESULT_TYPES = frozenset(
    ("markdown", "md", "text", "txt", "json", "structured", "document")
)

#: Case-manifest statuses that count as "already ingested" and are skipped on resume.
_RESUMABLE_STATUSES = frozenset(("ok", "needs_review"))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class LocalParse:
    """Client for the LocalParse document-parsing API.

    LocalParse is a drop-in LlamaParse-compatible parser with a deterministic
    accuracy layer (table-detection recovery + oracle-free financial-identity
    checks). Typical use is the one-shot :meth:`parse`::

        from localparse import LocalParse

        client = LocalParse(api_key="lp-...")  # or set LOCALPARSE_API_KEY
        result = client.parse("invoice.pdf", result_type="markdown")
        print(result.markdown)

    For long batches use :meth:`parse_folder` with a ``case_id`` (incremental
    re-ingest skips unchanged files); for full control use the async-job
    primitives :meth:`upload` / :meth:`get_job` / :meth:`get_result`.
    """

    def __init__(
        self,
        api_key: str | None = None,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 60.0,
        poll_interval: float = 2.0,
        max_wait: float = 900.0,
        session: requests.Session | None = None,
    ) -> None:
        """Configure the client.

        Args:
            api_key: Bearer token. Falls back to the ``LOCALPARSE_API_KEY``
                environment variable when empty or ``None``.
            base_url: API root; trailing ``/`` characters are stripped.
            timeout: Default ``timeout`` passed to every HTTP request.
            poll_interval: Default delay between status polls in :meth:`wait`.
            max_wait: Default total time :meth:`wait` allows before timing out.
            session: Optional ``requests.Session`` to reuse. Its headers are
                updated in place with ``Authorization`` and ``User-Agent``.

        Raises:
            AuthenticationError: If no API key is supplied or found in the
                environment.
        """
        key = api_key or os.environ.get("LOCALPARSE_API_KEY")
        if not key:
            raise AuthenticationError(
                "An API key is required: pass api_key=... or set LOCALPARSE_API_KEY."
            )
        self.api_key = key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.poll_interval = poll_interval
        self.max_wait = max_wait
        self._session = session or requests.Session()
        self._session.headers.update(
            {
                "Authorization": f"Bearer {self.api_key}",
                "User-Agent": "localparse-python/0.1.0",
            }
        )

    # -- low-level HTTP ----------------------------------------------------- #

    def _url(self, path: str) -> str:
        """Append ``path`` to the configured base URL."""
        return f"{self.base_url}{path}"

    def _request(self, method: str, path: str, **kwargs: Any) -> Any:
        """Send one request and decode its response.

        Returns ``None`` for an empty body, the decoded JSON when the body is
        valid JSON, and the raw response text otherwise.

        Raises:
            APIError: With ``status_code=0`` when the request itself fails
                (``requests.RequestException``), or for an unclassified
                status >= 400.
            AuthenticationError, QuotaExceededError, NotFoundError,
            RateLimitError: For the matching HTTP status codes.
        """
        kwargs.setdefault("timeout", self.timeout)
        try:
            resp = self._session.request(method, self._url(path), **kwargs)
        except requests.RequestException as exc:  # network/DNS/TLS failure
            raise APIError(f"Request to {path} failed: {exc}", status_code=0) from exc
        self._raise_for_status(resp)
        if not resp.content:
            return None
        try:
            return resp.json()
        except ValueError:
            return resp.text

    @staticmethod
    def _parse_retry_after(value: str | None) -> float | None:
        """Convert a ``Retry-After`` header value to a number of seconds.

        The header may hold either a number of seconds or an HTTP-date.
        Returns ``None`` when the header is absent, empty, or unparseable, so
        a malformed header can never replace the ``RateLimitError`` with a
        ``ValueError``.
        """
        if not value:
            return None
        try:
            return float(value)
        except ValueError:
            pass
        # Imported here because only the HTTP-date form needs them.
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            when = parsedate_to_datetime(value)
        except (TypeError, ValueError):  # TypeError on older Pythons
            return None
        if when.tzinfo is None:
            # A naive result is treated as UTC.
            when = when.replace(tzinfo=timezone.utc)
        return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())

    @staticmethod
    def _raise_for_status(resp: requests.Response) -> None:
        """Raise the typed error matching ``resp.status_code``; no-op below 400."""
        if resp.status_code < 400:
            return
        detail = _detail(resp)
        if resp.status_code in (401, 403):
            raise AuthenticationError(detail)
        if resp.status_code == 402:
            raise QuotaExceededError(detail)
        if resp.status_code == 404:
            raise NotFoundError(detail)
        if resp.status_code == 429:
            raise RateLimitError(
                detail,
                retry_after=LocalParse._parse_retry_after(resp.headers.get("Retry-After")),
            )
        raise APIError(detail, status_code=resp.status_code, body=resp.text)

    @staticmethod
    def _job_from_payload(payload: Any) -> Job:
        """Build a :class:`Job`, treating any non-dict payload as empty.

        ``_request`` can return ``None``, raw text, or a non-object JSON
        value; only a dict has the ``.get`` that ``Job.from_payload`` needs.
        """
        return Job.from_payload(payload if isinstance(payload, dict) else {})

    # -- async-job primitives ---------------------------------------------- #

    def upload(
        self,
        file: FileInput,
        *,
        result_type: str = "markdown",
        case_id: str | None = None,
        source_path: str | None = None,
        language: str | Iterable[str] | None = None,
        target_pages: str | None = None,
        max_pages: int | None = None,
        detect_repair: bool | None = None,
        extra: dict[str, Any] | None = None,
    ) -> Job:
        """Submit a document and return a :class:`Job` (non-blocking).

        ``case_id`` persists the doc into a durable per-case store; ``source_path``
        makes that identity folder-aware (so a re-ingest supersedes its own prior
        version). ``extra`` passes through any other LlamaParse form field verbatim.

        ``file`` is either a path, which is opened and closed here, or a
        ``(filename, stream)`` pair whose stream is left open for the caller.
        Options left as ``None`` are omitted from the form. ``language`` may be
        one string or an iterable that is joined with commas. Entries in
        ``extra`` are stringified and override same-named fields.
        """
        data: dict[str, str] = {"result_type": result_type}
        if case_id is not None:
            data["case_id"] = case_id
        if source_path is not None:
            data["source_path"] = source_path
        if language is not None:
            data["language"] = language if isinstance(language, str) else ",".join(language)
        if target_pages is not None:
            data["target_pages"] = target_pages
        if max_pages is not None:
            data["max_pages"] = str(max_pages)
        if detect_repair is not None:
            data["detect_repair"] = "true" if detect_repair else "false"
        if extra:
            data.update({k: str(v) for k, v in extra.items()})

        # Only a handle opened here is closed here; caller streams stay open.
        opened: IO[bytes] | None = None
        try:
            if isinstance(file, tuple):
                filename, stream = file
                files = {"file": (filename, stream)}
            else:
                path = Path(file)
                opened = path.open("rb")
                files = {"file": (path.name, opened)}
            payload = self._request(
                "POST", "/api/parsing/upload", data=data, files=files
            )
        finally:
            if opened is not None:
                opened.close()
        return self._job_from_payload(payload)

    def get_job(self, job_id: str) -> Job:
        """Poll a job's current status."""
        payload = self._request("GET", f"/api/parsing/job/{job_id}")
        return self._job_from_payload(payload)

    def cancel(self, job_id: str) -> Job:
        """Cancel a still-``PENDING`` job so a worker never spends the pod on it."""
        payload = self._request("DELETE", f"/api/parsing/job/{job_id}")
        return self._job_from_payload(payload)

    def get_result(self, job_id: str, result_type: str = "markdown") -> ParseResult:
        """Fetch a finished job's result in the given flavour.

        ``result_type`` ∈ ``markdown``/``md``, ``text``/``txt``, ``json``,
        ``structured``/``document``. A response that is not a JSON object is
        wrapped as ``{"result": payload}``.

        Raises:
            ValueError: If ``result_type`` is not a known flavour.
        """
        if result_type not in _RESULT_TYPES:
            raise ValueError(
                f"Unknown result_type {result_type!r}; expected one of {sorted(_RESULT_TYPES)}."
            )
        payload = self._request("GET", f"/api/parsing/job/{job_id}/result/{result_type}")
        data = payload if isinstance(payload, dict) else {"result": payload}
        return ParseResult(job_id=job_id, result_type=result_type, data=data)

    def wait(
        self,
        job_id: str,
        *,
        poll_interval: float | None = None,
        max_wait: float | None = None,
    ) -> Job:
        """Block until a job reaches a terminal state (or ``max_wait`` elapses).

        ``poll_interval`` and ``max_wait`` default to the values given to the
        constructor. The job is polled at least once. Each sleep is capped at
        the time left, so the final poll happens at the deadline rather than
        up to one full interval after it.

        Raises:
            JobTimeoutError: If the job is still not terminal at the deadline.
        """
        interval = poll_interval if poll_interval is not None else self.poll_interval
        budget = max_wait if max_wait is not None else self.max_wait
        deadline = time.monotonic() + budget
        while True:
            job = self.get_job(job_id)
            if job.is_done:
                return job
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise JobTimeoutError(
                    f"Job {job_id} did not finish within the timeout.", job_id=job_id
                )
            time.sleep(min(interval, remaining))

    # -- one-shot convenience ---------------------------------------------- #

    def parse(
        self,
        file: FileInput,
        *,
        result_type: str = "markdown",
        case_id: str | None = None,
        source_path: str | None = None,
        language: str | Iterable[str] | None = None,
        target_pages: str | None = None,
        max_pages: int | None = None,
        detect_repair: bool | None = None,
        poll_interval: float | None = None,
        max_wait: float | None = None,
        extra: dict[str, Any] | None = None,
    ) -> ParseResult:
        """Upload, wait for completion, and return the parsed result.

        Raises :class:`~localparse.JobFailedError` if the parse ends in ERROR/
        CANCELED, and :class:`~localparse.JobTimeoutError` if it doesn't finish
        within ``max_wait``.
        """
        job = self.upload(
            file,
            result_type=result_type,
            case_id=case_id,
            source_path=source_path,
            language=language,
            target_pages=target_pages,
            max_pages=max_pages,
            detect_repair=detect_repair,
            extra=extra,
        )
        job = self.wait(job.id, poll_interval=poll_interval, max_wait=max_wait)
        if not job.is_success:
            raise JobFailedError(
                f"Job {job.id} ended as {job.status}: {job.error_message}",
                job_id=job.id,
                status=job.status,
            )
        return self.get_result(job.id, result_type=result_type)

    # -- case / batch ------------------------------------------------------- #

    def case_manifest(self, case_id: str) -> dict[str, Any]:
        """List a case's already-ingested docs (content hash + status).

        Returns an empty dict when the response is not a JSON object.
        """
        payload = self._request("GET", f"/api/parsing/case/{case_id}/manifest")
        return payload if isinstance(payload, dict) else {}

    def case_failures(self, case_id: str, *, include_resolved: bool = False) -> dict[str, Any]:
        """List a case's failed documents (for monitoring / reingest).

        ``include_resolved`` adds ``include_resolved=1`` to the query string.
        Returns an empty dict when the response is not a JSON object.
        """
        params = {"include_resolved": "1"} if include_resolved else None
        payload = self._request(
            "GET", f"/api/parsing/case/{case_id}/failures", params=params
        )
        return payload if isinstance(payload, dict) else {}

    def parse_folder(
        self,
        folder: str | os.PathLike[str],
        *,
        case_id: str,
        result_type: str = "markdown",
        resume: bool = True,
        extensions: Iterable[str] | None = None,
        detect_repair: bool | None = None,
        language: str | Iterable[str] | None = None,
        on_progress: Callable[[str, ParseResult | None], None] | None = None,
    ) -> list[ParseResult]:
        """Ingest every file in ``folder`` into ``case_id``, skipping unchanged ones.

        Walks ``folder`` recursively, persisting each file under ``case_id`` with a
        folder-relative ``source_path``. When ``resume`` is set (default), files
        whose sha256 already appears in the case manifest with an ``ok``/
        ``needs_review`` status are skipped, so a re-run only parses the delta.
        ``on_progress(relative_path, result_or_None)`` is called per file (result
        is ``None`` for a skipped file). Returns the results that were parsed.

        ``extensions`` limits the walk to matching suffixes, compared
        case-insensitively and with or without a leading dot. A single string
        such as ``"pdf"`` is treated as one extension. Files are processed in
        sorted path order, and an error from any file propagates and stops
        the run.

        Raises:
            ValueError: If ``folder`` is not a directory.
        """
        root = Path(folder)
        if not root.is_dir():
            raise ValueError(f"{folder!r} is not a directory.")

        if isinstance(extensions, str):
            # A bare string would otherwise be iterated one character at a time.
            extensions = [extensions]
        exts = {e.lower().lstrip(".") for e in extensions} if extensions else None

        already: set[str] = set()
        if resume:
            manifest = self.case_manifest(case_id)
            for doc in manifest.get("documents", []) or []:
                if isinstance(doc, dict) and doc.get("status") in _RESUMABLE_STATUSES:
                    file_hash = doc.get("file_hash")
                    if isinstance(file_hash, str):
                        already.add(file_hash)

        results: list[ParseResult] = []
        for path in sorted(p for p in root.rglob("*") if p.is_file()):
            if exts is not None and path.suffix.lower().lstrip(".") not in exts:
                continue
            rel = path.relative_to(root).as_posix()
            # `already` is only populated when resuming; skip hashing if empty.
            if already and _sha256(path) in already:
                if on_progress:
                    on_progress(rel, None)
                continue
            result = self.parse(
                path,
                result_type=result_type,
                case_id=case_id,
                source_path=rel,
                detect_repair=detect_repair,
                language=language,
            )
            results.append(result)
            if on_progress:
                on_progress(rel, result)
        return results
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _detail(resp: requests.Response) -> str:
    """Build a short human-readable message for an error response.

    The message always starts with ``HTTP <status>``. For a JSON object body,
    the first truthy value among ``detail``, ``message`` and ``error`` is
    appended. For a body that is not valid JSON, the first 200 characters of
    the text are appended, if there is any text.
    """
    prefix = f"HTTP {resp.status_code}"
    try:
        parsed = resp.json()
    except ValueError:
        raw = resp.text
        if raw:
            return f"{prefix}: {raw[:200]}"
        return prefix
    if not isinstance(parsed, dict):
        return prefix
    for key in ("detail", "message", "error"):
        found = parsed.get(key)
        if found:
            return f"{prefix}: {found}"
    return prefix
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it is never held in memory whole.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(1 << 20)
            if not piece:
                break
            hasher.update(piece)
    return hasher.hexdigest()
|
localparse/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: localparse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Official Python client for the LocalParse document-parsing API (LlamaParse-compatible).
|
|
5
|
+
Project-URL: Homepage, https://localparse.com
|
|
6
|
+
Project-URL: Documentation, https://localparse.com/docs
|
|
7
|
+
Project-URL: Source, https://github.com/stevencoveta/Agent-ingestor
|
|
8
|
+
Author: LocalParse
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: document parsing,llamaparse,localparse,ocr,pdf,tables
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Text Processing
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: requests>=2.28
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# LocalParse — Python client
|
|
20
|
+
|
|
21
|
+
Official Python client for the [LocalParse](https://localparse.com) document-parsing
|
|
22
|
+
API. LocalParse is a **drop-in LlamaParse-compatible** parser with a deterministic
|
|
23
|
+
accuracy layer: table-detection recovery (catches tables a layout model misses)
|
|
24
|
+
and oracle-free **financial-identity checks** (flags `Total`s that don't reconcile).
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install localparse
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from localparse import LocalParse
|
|
36
|
+
|
|
37
|
+
client = LocalParse(api_key="lp-...") # or set LOCALPARSE_API_KEY
|
|
38
|
+
|
|
39
|
+
result = client.parse("invoice.pdf", result_type="markdown")
|
|
40
|
+
print(result.markdown)
|
|
41
|
+
|
|
42
|
+
# Accuracy signal that plain OCR/LLM parsers don't give you:
|
|
43
|
+
print(result.identity_check) # {tables_checked, violations, ...} or None
|
|
44
|
+
print(result.recovered_tables) # tables recovered by detect_repair
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Fetch JSON or the ingestion-ready structured contract instead:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
result = client.parse("10k.pdf", result_type="json")
|
|
51
|
+
for page in result.pages:
|
|
52
|
+
...
|
|
53
|
+
|
|
54
|
+
structured = client.parse("10k.pdf", result_type="structured")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Persist a whole folder (incremental)
|
|
58
|
+
|
|
59
|
+
Ingest a data room into a named **case**; re-runs only parse new/changed files
|
|
60
|
+
(unchanged files are skipped by content hash):
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
results = client.parse_folder(
|
|
64
|
+
"./data-room",
|
|
65
|
+
case_id="acme",
|
|
66
|
+
resume=True,
|
|
67
|
+
on_progress=lambda path, res: print("parsed" if res else "skipped", path),
|
|
68
|
+
)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Full control (async jobs)
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
job = client.upload("big.pdf", result_type="json", case_id="acme")
|
|
75
|
+
job = client.wait(job.id)
|
|
76
|
+
if job.is_success:
|
|
77
|
+
result = client.get_result(job.id, "json")
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Configuration
|
|
81
|
+
|
|
82
|
+
| Argument | Default | Meaning |
|
|
83
|
+
|---|---|---|
|
|
84
|
+
| `api_key` | `LOCALPARSE_API_KEY` env | Bearer token for the API. |
|
|
85
|
+
| `base_url` | `https://api.localparse.com` | Point at a self-hosted instance if needed. |
|
|
86
|
+
| `timeout` | `60` | Per-request HTTP timeout (seconds). |
|
|
87
|
+
| `poll_interval` | `2.0` | Seconds between status polls in `parse`/`wait`. |
|
|
88
|
+
| `max_wait` | `900` | Max seconds to wait for a job before `JobTimeoutError`. |
|
|
89
|
+
|
|
90
|
+
## Errors
|
|
91
|
+
|
|
92
|
+
`AuthenticationError` (401/403), `QuotaExceededError` (402), `NotFoundError` (404),
|
|
93
|
+
`RateLimitError` (429, with `.retry_after`), `APIError` (any other 4xx/5xx response, or a network failure, reported with `status_code == 0`),
|
|
94
|
+
`JobFailedError` (job ended ERROR/CANCELED), `JobTimeoutError` — all subclass
|
|
95
|
+
`LocalParseError`.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
localparse/__init__.py,sha256=YBQvmZkziMqwIyziHLwY7JDVamKbQgU_BW8SCU1VfHE,1019
|
|
2
|
+
localparse/_errors.py,sha256=Uv6TSayiDfeKUREV8n4uN4VNtSXoWZfvPyfI12Lmx5U,1739
|
|
3
|
+
localparse/_models.py,sha256=-Kzb1gos1LInHaKDbSGTgAA-D_ekou7RN6pdaF3ItfE,3225
|
|
4
|
+
localparse/client.py,sha256=SiX8TejX3xWs9eEmjj8J5Y7HOkovyGoeQvUlOPRIlAI,13512
|
|
5
|
+
localparse/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
localparse-0.1.0.dist-info/METADATA,sha256=NfxevZDQzjOOhFfhGRTNVHihTD0_AMugFWVKnw3k9n0,3037
|
|
7
|
+
localparse-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
8
|
+
localparse-0.1.0.dist-info/RECORD,,
|