lfx-paddle 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx_paddle/__init__.py +11 -0
- lfx_paddle/components/paddle/__init__.py +10 -0
- lfx_paddle/components/paddle/paddleocr.py +502 -0
- lfx_paddle/extension.json +16 -0
- lfx_paddle-0.1.0.dist-info/METADATA +54 -0
- lfx_paddle-0.1.0.dist-info/RECORD +8 -0
- lfx_paddle-0.1.0.dist-info/WHEEL +4 -0
- lfx_paddle-0.1.0.dist-info/entry_points.txt +2 -0
lfx_paddle/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""lfx-paddle: PaddleOCR bundle.
|
|
2
|
+
|
|
3
|
+
Distribution unit ``lfx-paddle``. At runtime Langflow's loader discovers
|
|
4
|
+
``extension.json`` shipped alongside this ``__init__.py`` and registers the
|
|
5
|
+
bundle's component under the namespaced ID
|
|
6
|
+
``ext:paddle:PaddleOCRComponent@official``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from lfx_paddle.components.paddle.paddleocr import PaddleOCRComponent
|
|
10
|
+
|
|
11
|
+
__all__ = ["PaddleOCRComponent"]
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Component re-exports for the ``paddle`` bundle.
|
|
2
|
+
|
|
3
|
+
Saved-flow migration entries that target ``lfx.components.paddle.<Class>``
|
|
4
|
+
resolve through this package, so the moved Component class(es) must be
|
|
5
|
+
importable from here by name.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .paddleocr import PaddleOCRComponent
|
|
9
|
+
|
|
10
|
+
__all__ = ["PaddleOCRComponent"]
|
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
from lfx.base.data.base_file import BaseFileComponent
|
|
9
|
+
from lfx.inputs.inputs import BoolInput, DropdownInput, FloatInput, IntInput, MessageTextInput, SecretStrInput
|
|
10
|
+
from lfx.schema.data import Data
|
|
11
|
+
from lfx.utils.ssrf_protection import is_ssrf_protection_enabled, validate_and_resolve_url
|
|
12
|
+
from lfx.utils.ssrf_transport import create_ssrf_protected_sync_client
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PaddleOCRComponent(BaseFileComponent):
    """File component that runs uploaded documents through a PaddleOCR job API.

    Each file is submitted as a job, the job status is polled until it is done
    or failed, and the fetched result is converted into ``Data``: Markdown for
    the ``document_parsing`` task type, plain text for the ``ocr`` task type.
    """

    # Component metadata.
    display_name = "PaddleOCR"
    description = "Use PaddleOCR for either layout-aware document parsing into Markdown or plain OCR text recognition."
    documentation = "https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/paddleocr_and_ppstructure.html"
    icon = "file-search"
    name = "PaddleOCR"

    # File extensions this component declares.
    # NOTE(review): presumably read by BaseFileComponent to filter uploads -- confirm.
    VALID_EXTENSIONS = ["png", "jpg", "jpeg", "bmp", "tiff", "webp", "pdf"]
    # Service root used when the ``base_url`` input is left empty.
    DEFAULT_BASE_URL = "https://paddleocr.aistudio-app.com"
    # Path appended to the base URL for job submission; job status lives at ``<API_PATH>/<job_id>``.
    API_PATH = "/api/v2/ocr/jobs"
    # Timeout value handed to the HTTP client and to individual requests.
    REQUEST_TIMEOUT = 300.0
    # Polling back-off: first sleep, growth factor applied after each poll, and the cap on the sleep.
    INITIAL_POLL_INTERVAL = 3.0
    POLL_MULTIPLIER = 1.5
    MAX_POLL_INTERVAL = 15.0

    inputs = [
        *BaseFileComponent.get_base_inputs(),
        # Credentials and endpoint.
        SecretStrInput(
            name="access_token",
            display_name="AI Studio Access Token",
            required=True,
            info="AI Studio access token. Get it from https://aistudio.baidu.com/account/accessToken.",
        ),
        MessageTextInput(
            name="base_url",
            display_name="Base URL",
            required=False,
            value="",
            info="Optional PaddleOCR service root URL. Leave empty to use the official default service.",
            advanced=True,
        ),
        # Task and model selection; ``update_build_config`` swaps the model options when the task changes.
        DropdownInput(
            name="task_type",
            display_name="Task Type",
            options=["document_parsing", "ocr"],
            value="document_parsing",
            info=(
                "document_parsing: preserves reading order and layout as Markdown — "
                "best when you need structure-aware text (PDFs, scanned documents, tables).\n"
                "ocr: extracts text regions in scan order — best for images with simple text content."
            ),
            real_time_refresh=True,
        ),
        DropdownInput(
            name="model",
            display_name="Model",
            options=["PP-StructureV3", "PaddleOCR-VL-1.6"],
            value="PP-StructureV3",
            info="PaddleOCR model to use for the selected task type.",
        ),
        # Overall budget, in seconds, for waiting on a submitted job.
        IntInput(
            name="poll_timeout",
            display_name="Timeout (s)",
            value=600,
            info="Maximum time to wait for the PaddleOCR job to complete.",
            advanced=True,
        ),
        # Options forwarded for both task types.
        BoolInput(
            name="use_doc_orientation_classify",
            display_name="Document Orientation Classification",
            value=False,
            advanced=True,
            info="OCR/document parsing option. Enable document orientation classification.",
        ),
        BoolInput(
            name="use_doc_unwarping",
            display_name="Document Unwarping",
            value=False,
            advanced=True,
            info="OCR/document parsing option. Enable document unwarping.",
        ),
        # Options forwarded only for the ``ocr`` task type.
        BoolInput(
            name="use_textline_orientation",
            display_name="Text Line Orientation",
            value=False,
            advanced=True,
            info="OCR option. Enable text line orientation detection.",
        ),
        FloatInput(
            name="text_det_thresh",
            display_name="Text Detection Threshold",
            required=False,
            advanced=True,
            info="OCR option. Text detection threshold.",
        ),
        FloatInput(
            name="text_det_box_thresh",
            display_name="Text Detection Box Threshold",
            required=False,
            advanced=True,
            info="OCR option. Text detection box threshold.",
        ),
        FloatInput(
            name="text_det_unclip_ratio",
            display_name="Text Detection Unclip Ratio",
            required=False,
            advanced=True,
            info="OCR option. Text detection unclip ratio.",
        ),
        FloatInput(
            name="text_rec_score_thresh",
            display_name="Text Recognition Score Threshold",
            required=False,
            advanced=True,
            info="OCR option. Text recognition score threshold.",
        ),
        # Options forwarded only for the ``document_parsing`` task type.
        BoolInput(
            name="use_table_recognition",
            display_name="Table Recognition",
            value=True,
            advanced=True,
            info="Document parsing option. Enable table recognition.",
        ),
        BoolInput(
            name="use_formula_recognition",
            display_name="Formula Recognition",
            value=False,
            advanced=True,
            info="Document parsing option. Enable formula recognition.",
        ),
        BoolInput(
            name="use_chart_recognition",
            display_name="Chart Recognition",
            value=False,
            advanced=True,
            info="Document parsing option. Enable chart recognition.",
        ),
        BoolInput(
            name="use_seal_recognition",
            display_name="Seal Recognition",
            value=False,
            advanced=True,
            info="Document parsing option. Enable seal recognition.",
        ),
        BoolInput(
            name="prettify_markdown",
            display_name="Prettify Markdown",
            value=True,
            advanced=True,
            info="Document parsing option. Return prettier Markdown when supported.",
        ),
        FloatInput(
            name="temperature",
            display_name="Temperature",
            required=False,
            advanced=True,
            info="PaddleOCR-VL option. Sampling temperature.",
        ),
        FloatInput(
            name="top_p",
            display_name="Top P",
            required=False,
            advanced=True,
            info="PaddleOCR-VL option. Nucleus sampling top_p.",
        ),
        BoolInput(
            name="visualize",
            display_name="Visualize",
            value=False,
            advanced=True,
            info="Document parsing option. Generate visualization outputs when supported.",
        ),
    ]

    # No outputs beyond those the base file component provides.
    outputs = [*BaseFileComponent.get_base_outputs()]
|
|
183
|
+
|
|
184
|
+
def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:
|
|
185
|
+
if field_name == "task_type":
|
|
186
|
+
if field_value == "ocr":
|
|
187
|
+
build_config["model"]["options"] = ["PP-OCRv6", "PP-OCRv5"]
|
|
188
|
+
build_config["model"]["value"] = "PP-OCRv6"
|
|
189
|
+
else:
|
|
190
|
+
build_config["model"]["options"] = ["PP-StructureV3", "PaddleOCR-VL-1.6"]
|
|
191
|
+
build_config["model"]["value"] = "PP-StructureV3"
|
|
192
|
+
|
|
193
|
+
return build_config
|
|
194
|
+
|
|
195
|
+
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
    """Run every file through PaddleOCR and attach the result to ``file.data``.

    Args:
        file_list: Files to process; each entry's ``data`` attribute is overwritten.

    Returns:
        The same ``file_list`` that was passed in.

    Raises:
        ValueError: If the access token is empty after stripping whitespace.
        RuntimeError: If processing any file fails; the message is the formatted
            PaddleOCR error and the original exception is chained as the cause.
    """
    if not file_list:
        self.log("No files to process.")
        return file_list

    token = str(self.access_token or "").strip()
    if not token:
        msg = "AI Studio Access Token is required."
        raise ValueError(msg)

    configured_url = str(self.base_url or "").strip()
    service_url = (configured_url or self.DEFAULT_BASE_URL).rstrip("/")
    request_headers = {"Authorization": f"Bearer {token}", "Client-Platform": "langflow"}
    timeout_seconds = int(self.poll_timeout or 600)

    # The base URL is operator-configurable and the submit/poll requests carry
    # the bearer token plus the uploaded file, so the URL is validated for SSRF
    # once, up front, and the resolved IPs are reused (DNS-pinned) for the rest
    # of the run. The source notes this is a no-op when SSRF protection is
    # disabled (the default).
    _validated_url, pinned_ips = validate_and_resolve_url(service_url)

    try:
        for entry in file_list:
            entry.data = self._process_file(entry.path, service_url, pinned_ips, request_headers, timeout_seconds)
    except Exception as exc:
        failure = self._format_paddleocr_error(exc)
        self.log(failure)
        raise RuntimeError(failure) from exc

    return file_list
|
|
228
|
+
|
|
229
|
+
def _process_file(
    self, file_path: Path, base_url: str, base_ips: list[str], headers: dict[str, str], poll_timeout: int
) -> Data:
    """Submit one file, wait for its job, and convert the result.

    Args:
        file_path: File to upload.
        base_url: Service root URL (no trailing slash).
        base_ips: IPs the base URL was validated against, passed on to the HTTP client builder.
        headers: Request headers sent with the submit and poll requests.
        poll_timeout: Seconds to wait for the job before giving up.

    Returns:
        The converted result: OCR conversion for the ``ocr`` task type,
        document conversion otherwise.
    """
    if self.task_type == "ocr":
        request_options = self._build_ocr_options()
    else:
        request_options = self._build_document_parsing_options()

    submitted_id = self._submit_job(
        base_url=base_url,
        base_ips=base_ips,
        headers=headers,
        file_path=file_path,
        options=request_options,
    )
    result_lines = self._poll_job(
        base_url=base_url,
        base_ips=base_ips,
        headers=headers,
        job_id=submitted_id,
        poll_timeout=poll_timeout,
    )

    convert = self._ocr_result_to_data if self.task_type == "ocr" else self._document_result_to_data
    return convert(submitted_id, result_lines, file_path)
|
|
243
|
+
|
|
244
|
+
def _submit_job(
    self, *, base_url: str, base_ips: list[str], headers: dict[str, str], file_path: Path, options: dict[str, Any]
) -> str:
    """Upload a file as a new job and return the job ID from the response.

    Args:
        base_url: Service root URL; ``API_PATH`` is appended to it.
        base_ips: Validated IPs handed to ``_build_client``.
        headers: Request headers for the POST.
        file_path: File to upload as the ``file`` multipart field.
        options: Task options, sent JSON-encoded in the ``optionalPayload`` form field.

    Returns:
        The job ID, read from ``data.jobId`` or, failing that, top-level ``jobId``.

    Raises:
        ValueError: If the response carries no job ID.
    """
    endpoint = f"{base_url}{self.API_PATH}"
    form_fields = {"model": self.model, "optionalPayload": json.dumps(options)}

    # Open the file first and build the client second, so the client is closed
    # before the file handle is.
    with file_path.open("rb") as upload:
        with self._build_client(endpoint, base_ips) as http:
            reply = http.post(
                endpoint,
                data=form_fields,
                files={"file": (file_path.name, upload)},
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )

    reply.raise_for_status()
    body = reply.json()
    nested = body.get("data") or {}
    job_id = nested.get("jobId") or body.get("jobId")
    if not job_id:
        msg = f"PaddleOCR job ID not found in response: {body}"
        raise ValueError(msg)
    return job_id
|
|
267
|
+
|
|
268
|
+
def _poll_job(
    self,
    *,
    base_url: str,
    base_ips: list[str],
    headers: dict[str, str],
    job_id: str,
    poll_timeout: int,
) -> list[dict[str, Any]]:
    """Poll the job status endpoint until the job is done, has failed, or times out.

    Args:
        base_url: Service root URL; the status URL is ``<base_url><API_PATH>/<job_id>``.
        base_ips: Validated IPs handed to ``_build_client``.
        headers: Request headers for each status request.
        job_id: ID of the job to poll.
        poll_timeout: Total number of seconds allowed for polling.

    Returns:
        The parsed result records returned by ``_fetch_result`` once the job is done.

    Raises:
        TimeoutError: If the deadline passes before the job reaches ``done`` or ``failed``.
        ValueError: If the job is ``done`` but the response carries no result URL.
        RuntimeError: If the job state is ``failed``.
    """
    status_url = f"{base_url}{self.API_PATH}/{job_id}"
    # Monotonic clock so wall-clock adjustments cannot stretch or shrink the budget.
    deadline = time.monotonic() + poll_timeout
    interval = self.INITIAL_POLL_INTERVAL

    with self._build_client(status_url, base_ips) as client:
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                msg = f"PaddleOCR job {job_id} timed out."
                raise TimeoutError(msg)

            # Bound each request by the remaining budget so a hung poll cannot
            # overrun ``poll_timeout`` by up to ``REQUEST_TIMEOUT``.
            response = client.get(status_url, headers=headers, timeout=min(self.REQUEST_TIMEOUT, remaining))
            response.raise_for_status()
            payload = response.json()
            # The state is read from ``data.state`` first, then from top-level ``state``.
            data = payload.get("data") or {}
            state = data.get("state") or payload.get("state")

            if state == "done":
                # Two response shapes are accepted for the result location.
                result_url = data.get("resultJsonUrl") or (data.get("resultUrl") or {}).get("jsonUrl")
                if not result_url:
                    msg = f"PaddleOCR result URL not found in response: {payload}"
                    raise ValueError(msg)
                return self._fetch_result(result_url)

            if state == "failed":
                msg = f"PaddleOCR job failed: {payload}"
                raise RuntimeError(msg)

            # Any other state keeps polling: sleep for the current interval (never
            # past the deadline), then grow the interval up to ``MAX_POLL_INTERVAL``.
            time.sleep(min(interval, max(deadline - time.monotonic(), 0)))
            interval = min(interval * self.POLL_MULTIPLIER, self.MAX_POLL_INTERVAL)
|
|
309
|
+
|
|
310
|
+
def _fetch_result(self, result_url: str) -> list[dict[str, Any]]:
    """Download the job result and normalize it to a list of records.

    Args:
        result_url: URL of the result document, as reported by the job status response.

    Returns:
        An empty list for an empty body; the decoded list for a JSON array;
        a one-element list for a JSON object; one decoded value per non-blank
        line when the body is not a single JSON document (JSON Lines); an
        empty list for any other JSON value.
    """
    # The result URL is supplied by the remote job-status response rather than
    # by the operator, so it is validated for SSRF before being fetched: a
    # compromised or rogue endpoint could otherwise steer the worker at internal
    # or cloud-metadata addresses. Per the source's note, validation is a no-op
    # (no pinned IPs) while SSRF protection is disabled -- the default -- and,
    # when enabled, blocks internal targets and pins DNS to the validated IPs,
    # mirroring ``lfx.components.data_source.api_request``.
    _validated_url, pinned_ips = validate_and_resolve_url(result_url)
    with self._build_client(result_url, pinned_ips) as http:
        reply = http.get(result_url)
    reply.raise_for_status()

    body_text = reply.text.strip()
    if not body_text:
        return []

    try:
        decoded = reply.json()
    except ValueError:
        # Not a single JSON document: treat the body as JSON Lines.
        return [json.loads(row) for row in body_text.splitlines() if row.strip()]

    if isinstance(decoded, list):
        return decoded
    return [decoded] if isinstance(decoded, dict) else []
|
|
338
|
+
|
|
339
|
+
def _build_client(self, url: str, validated_ips: list[str]) -> httpx.Client:
    """Return the HTTP client to use for ``url``.

    Shared by the submit, poll, and result-fetch requests. When SSRF
    protection is enabled, ``validated_ips`` is non-empty, and a host can be
    extracted from ``url``, the client is created through
    ``create_ssrf_protected_sync_client`` so DNS stays pinned to the validated
    IPs (preventing rebinding). In every other case a plain ``httpx.Client``
    is returned.
    """
    pinning_applies = is_ssrf_protection_enabled() and validated_ips
    target_host = httpx.URL(url).host if pinning_applies else None
    if not target_host:
        return httpx.Client(timeout=self.REQUEST_TIMEOUT)
    return create_ssrf_protected_sync_client(
        hostname=target_host,
        validated_ips=validated_ips,
        timeout=self.REQUEST_TIMEOUT,
    )
|
|
355
|
+
|
|
356
|
+
def _build_ocr_options(self) -> dict[str, Any]:
    """Collect the option values that are sent with an ``ocr`` job."""
    ocr_option_names = [
        "use_doc_orientation_classify",
        "use_doc_unwarping",
        "use_textline_orientation",
        "text_det_thresh",
        "text_det_box_thresh",
        "text_det_unclip_ratio",
        "text_rec_score_thresh",
    ]
    return self._collect_options(ocr_option_names)
|
|
368
|
+
|
|
369
|
+
def _build_document_parsing_options(self) -> dict[str, Any]:
    """Collect the option values that are sent with a ``document_parsing`` job."""
    parsing_option_names = [
        "use_doc_orientation_classify",
        "use_doc_unwarping",
        "use_table_recognition",
        "use_formula_recognition",
        "use_chart_recognition",
        "use_seal_recognition",
        "prettify_markdown",
        "temperature",
        "top_p",
        "visualize",
    ]
    return self._collect_options(parsing_option_names)
|
|
384
|
+
|
|
385
|
+
def _collect_options(self, option_names: list[str]) -> dict[str, Any]:
    """Map each named attribute of this component to its value, skipping unset ones.

    Args:
        option_names: Attribute names to read from ``self``.

    Returns:
        A dict, in ``option_names`` order, of every attribute whose value is
        not ``None``. Missing attributes are treated as ``None``; falsy values
        such as ``False`` and ``0`` are kept.
    """
    candidates = ((option, getattr(self, option, None)) for option in option_names)
    return {option: setting for option, setting in candidates if setting is not None}
|
|
392
|
+
|
|
393
|
+
def _ocr_result_to_data(self, job_id: str, jsonl_data: list[dict[str, Any]], file_path: Path) -> Data:
    """Convert the result records of an ``ocr`` job into a plain-text ``Data``.

    Args:
        job_id: ID of the finished job, recorded in the output.
        jsonl_data: Result records; each either wraps its payload under
            ``"result"`` or is the payload itself.
        file_path: Source file, recorded under ``SERVER_FILE_PATH_FIELDNAME``.

    Returns:
        ``Data`` whose text joins each page's recognized lines with newlines
        and the pages with blank lines, plus per-page details under ``"pages"``.
    """
    pages_payload: list[dict[str, Any]] = []
    text_parts: list[str] = []

    for line_obj in jsonl_data:
        # A JSON ``"result": null`` would otherwise crash on ``.get`` below;
        # treat it like a missing key and read the record itself, matching the
        # null-tolerant ``or`` guards used for every other lookup here.
        result = line_obj.get("result")
        if result is None:
            result = line_obj

        for item in result.get("ocrResults") or []:
            pruned_result = item.get("prunedResult") or {}
            rec_texts = pruned_result.get("rec_texts") or []
            if rec_texts:
                text_parts.append("\n".join(str(text) for text in rec_texts))
            # Every item yields a page entry, even when it has no recognized text.
            pages_payload.append(
                {
                    "pruned_result": pruned_result,
                    "ocr_image_url": item.get("ocrImage"),
                }
            )

    text = "\n\n".join(part for part in text_parts if part)
    return Data(
        text=text,
        data={
            self.SERVER_FILE_PATH_FIELDNAME: str(file_path),
            "text": text,
            "task_type": "ocr",
            "output_format": "plain_text",
            "model": self.model,
            "job_id": job_id,
            "pages": pages_payload,
        },
    )
|
|
424
|
+
|
|
425
|
+
def _document_result_to_data(self, job_id: str, jsonl_data: list[dict[str, Any]], file_path: Path) -> Data:
    """Convert the result records of a ``document_parsing`` job into a Markdown ``Data``.

    Args:
        job_id: ID of the finished job, recorded in the output.
        jsonl_data: Result records; each either wraps its payload under
            ``"result"`` or is the payload itself.
        file_path: Source file, recorded under ``SERVER_FILE_PATH_FIELDNAME``.

    Returns:
        ``Data`` whose text joins the per-page Markdown with blank lines, plus
        per-page details under ``"pages"``.
    """
    pages_payload: list[dict[str, Any]] = []
    text_parts: list[str] = []

    for line_obj in jsonl_data:
        # A JSON ``"result": null`` would otherwise crash on ``.get`` below;
        # treat it like a missing key and read the record itself, matching the
        # null-tolerant ``or`` guards used for every other lookup here.
        result = line_obj.get("result")
        if result is None:
            result = line_obj

        layout_results = result.get("layoutParsingResults") or []
        if layout_results:
            self._append_layout_results(layout_results, pages_payload, text_parts)
            continue
        # No layout results in this record: fall back to its plain OCR results.
        self._append_ocr_fallback_results(result.get("ocrResults") or [], pages_payload, text_parts)

    markdown_text = "\n\n".join(part for part in text_parts if part)
    return Data(
        text=markdown_text,
        data={
            self.SERVER_FILE_PATH_FIELDNAME: str(file_path),
            "text": markdown_text,
            "task_type": "document_parsing",
            "output_format": "markdown",
            "model": self.model,
            "job_id": job_id,
            "pages": pages_payload,
        },
    )
|
|
450
|
+
|
|
451
|
+
def _append_layout_results(
    self,
    layout_results: list[dict[str, Any]],
    pages_payload: list[dict[str, Any]],
    text_parts: list[str],
) -> None:
    """Append one page entry per layout result, and its Markdown when non-empty.

    Args:
        layout_results: Layout parsing results to convert.
        pages_payload: Output list; receives one dict per result.
        text_parts: Output list; receives the Markdown text of results that have any.
    """
    for entry in layout_results:
        md_block = entry.get("markdown", {}) or {}
        # Prefer the nested Markdown text, then the flat ``markdown_text`` key.
        page_markdown = md_block.get("text") or entry.get("markdown_text") or ""
        if page_markdown:
            text_parts.append(str(page_markdown))

        page_entry = {
            "markdown_text": page_markdown,
            "markdown_images": md_block.get("images", {}) or {},
            "output_images": entry.get("outputImages", {}) or {},
        }
        pages_payload.append(page_entry)
|
|
469
|
+
|
|
470
|
+
def _append_ocr_fallback_results(
    self,
    ocr_results: list[dict[str, Any]],
    pages_payload: list[dict[str, Any]],
    text_parts: list[str],
) -> None:
    """Append plain OCR results in the same page shape used for layout results.

    Args:
        ocr_results: OCR results to convert.
        pages_payload: Output list; receives one dict per result.
        text_parts: Output list; receives the joined text of results that have any.
    """
    for entry in ocr_results:
        pruned = entry.get("prunedResult", {}) or {}
        recognized = pruned.get("rec_texts", []) or []
        page_text = "\n".join(str(fragment) for fragment in recognized)
        if page_text:
            text_parts.append(page_text)

        # Image maps are left empty so the entry matches the layout-result page keys.
        page_entry = {
            "markdown_text": page_text,
            "markdown_images": {},
            "output_images": {},
            "pruned_result": pruned,
            "ocr_image_url": entry.get("ocrImage"),
        }
        pages_payload.append(page_entry)
|
|
491
|
+
|
|
492
|
+
def _format_paddleocr_error(self, error: Exception) -> str:
    """Turn an exception raised while processing into a user-facing message.

    HTTP status errors are checked first (401/403 get an authentication hint),
    then timeouts, then any other ``httpx`` error; everything else gets a
    generic message that includes the exception text.
    """
    if isinstance(error, httpx.HTTPStatusError):
        reply = error.response
        if reply.status_code in {401, 403}:
            return "PaddleOCR authentication failed. Please check the AI Studio Access Token."
        return f"PaddleOCR API error ({reply.status_code}): {reply.text}"

    # Covers both transport-level timeouts and the polling deadline.
    if isinstance(error, (httpx.TimeoutException, TimeoutError)):
        return "PaddleOCR job timed out. Increase the timeout or try again later."

    if isinstance(error, httpx.HTTPError):
        return f"PaddleOCR network error: {error}"

    return f"PaddleOCR failed: {error}"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://schemas.langflow.org/extension/v1.json",
|
|
3
|
+
"id": "lfx-paddle",
|
|
4
|
+
"version": "0.1.0",
|
|
5
|
+
"name": "PaddleOCR",
|
|
6
|
+
"description": "PaddleOCR component (OCR and layout-aware document parsing via the AI Studio async Job API) as a standalone Langflow Extension Bundle.",
|
|
7
|
+
"lfx": {
|
|
8
|
+
"compat": ["1"]
|
|
9
|
+
},
|
|
10
|
+
"bundles": [
|
|
11
|
+
{
|
|
12
|
+
"name": "paddle",
|
|
13
|
+
"path": "components/paddle"
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lfx-paddle
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PaddleOCR component (OCR and layout-aware document parsing via the AI Studio async Job API) as a standalone Langflow Extension Bundle.
|
|
5
|
+
Project-URL: Homepage, https://github.com/langflow-ai/langflow
|
|
6
|
+
Project-URL: Documentation, https://docs.langflow.org/extensions
|
|
7
|
+
Project-URL: Repository, https://github.com/langflow-ai/langflow
|
|
8
|
+
Author-email: Langflow <contact@langflow.org>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: bundle,extension,langflow,lfx,ocr,paddle,paddleocr
|
|
11
|
+
Requires-Python: <3.15,>=3.10
|
|
12
|
+
Requires-Dist: httpx<1.0.0,>=0.24.0
|
|
13
|
+
Requires-Dist: lfx<2.0.0,>=1.11.0.dev0
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# lfx-paddle
|
|
17
|
+
|
|
18
|
+
PaddleOCR as a standalone Langflow Extension Bundle.
|
|
19
|
+
|
|
20
|
+
Ships the **PaddleOCR** component, which performs either layout-aware document
|
|
21
|
+
parsing into Markdown (`PP-StructureV3`, `PaddleOCR-VL-1.6`) or plain OCR text
|
|
22
|
+
recognition (`PP-OCRv5`, `PP-OCRv6`). It talks to the PaddleOCR
|
|
23
|
+
[AI Studio async Job HTTP API](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/paddleocr_and_ppstructure.html)
|
|
24
|
+
(`submit -> poll -> fetch`) directly via `httpx`, so it does **not** require the
|
|
25
|
+
`paddleocr` Python SDK (whose transitive `pyyaml` constraint conflicts with
|
|
26
|
+
Langflow's dependency tree).
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install lfx-paddle
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The bundle is registered automatically via the `langflow.extensions`
|
|
35
|
+
entry-point. After install, restart your Langflow server; the component will
|
|
36
|
+
appear in the palette under the `paddle` group.
|
|
37
|
+
|
|
38
|
+
You will need an AI Studio access token
|
|
39
|
+
(<https://aistudio.baidu.com/account/accessToken>) to run the component.
|
|
40
|
+
|
|
41
|
+
## Develop
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
cd src/bundles/paddle
|
|
45
|
+
pip install -e .
|
|
46
|
+
lfx extension validate src/lfx_paddle
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Migration
|
|
50
|
+
|
|
51
|
+
Saved flows referencing the legacy class name or the old import paths under
|
|
52
|
+
`lfx.components.paddle.*` are rewritten to the new namespaced ID
|
|
53
|
+
`ext:paddle:PaddleOCRComponent@official` by the migration table in
|
|
54
|
+
`src/lfx/src/lfx/extension/migration/migration_table.json`.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
lfx_paddle/__init__.py,sha256=WjKSh_Oi7xqE2lGfnxrt3P9etv3ps1rEN2DZ46XjNp0,380
|
|
2
|
+
lfx_paddle/extension.json,sha256=6-XV3PlmbFgKLRhaUM0Txi-tkJFOZb1xzK8zcujKhkg,414
|
|
3
|
+
lfx_paddle/components/paddle/__init__.py,sha256=c73ZHRS90szobz4AArj3l9tmNeh759ZYf0bRZd-e_8M,309
|
|
4
|
+
lfx_paddle/components/paddle/paddleocr.py,sha256=Njvs-9x4AKfYZBv3GjkSOpkmNGy3v2aotCtjC7bilus,20044
|
|
5
|
+
lfx_paddle-0.1.0.dist-info/METADATA,sha256=vmyn7kFSGuqar41TrW3ZwgonS9qXw-mukt9PA420Fuw,2000
|
|
6
|
+
lfx_paddle-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
7
|
+
lfx_paddle-0.1.0.dist-info/entry_points.txt,sha256=YWCXw3eNS9iNZ9Y8BtDYK_zoIOxk6wQf--pLSZDB05Y,46
|
|
8
|
+
lfx_paddle-0.1.0.dist-info/RECORD,,
|