deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,538 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import base64
17
+ import json
18
+ import logging
19
+ import os
20
+ import shutil
21
+ import tempfile
22
+ import time
23
+ import traceback
24
+ import types
25
+ import zipfile
26
+ from datetime import datetime
27
+ from io import BytesIO
28
+ from os import PathLike
29
+ from pathlib import Path
30
+ from typing import Any, Callable, Optional
31
+
32
+ import requests
33
+ from tencentcloud.common import credential
34
+ from tencentcloud.common.profile.client_profile import ClientProfile
35
+ from tencentcloud.common.profile.http_profile import HttpProfile
36
+ from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
37
+ from tencentcloud.lkeap.v20240522 import lkeap_client, models
38
+
39
+ from ..common.config_utils import get_base_config
40
+ from deepdoc.config import PdfModelConfig, TokenizerConfig
41
+ from deepdoc.parser.pdf_parser import RAGFlowPdfParser
42
+
43
+
44
class TencentCloudAPIClient:
    """Thin wrapper around the official Tencent Cloud SDK ``LkeapClient``.

    The client targets the ``lkeap.tencentcloudapi.com`` endpoint and offers
    two operations: submitting a document to ``ReconstructDocumentSSE`` and
    downloading the result archive referenced by the final response.
    """

    def __init__(self, secret_id, secret_key, region):
        """Build the SDK credential, HTTP/client profiles and the LKEAP client.

        Args:
            secret_id: Tencent Cloud API secret id.
            secret_key: Tencent Cloud API secret key.
            region: Region string handed to ``LkeapClient``.
        """
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.region = region
        self.outlines = []

        # Credentials used to sign every request.
        self.cred = credential.Credential(secret_id, secret_key)

        # HTTP profile: pin the service endpoint.
        self.httpProfile = HttpProfile()
        self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"

        # Client profile carrying the HTTP profile.
        self.clientProfile = ClientProfile()
        self.clientProfile.httpProfile = self.httpProfile

        self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)

    def reconstruct_document_sse(self, file_type, file_url=None, file_base64=None, file_start_page=1, file_end_page=1000, config=None):
        """Call the document parsing API through the official SDK.

        Args:
            file_type: Value sent as ``FileType``.
            file_url: Document URL; takes precedence over ``file_base64``.
            file_base64: Base64-encoded document, used when no URL is given.
            file_start_page: Value sent as ``FileStartPageNumber``.
            file_end_page: Value sent as ``FileEndPageNumber``.
            config: Optional mapping sent as ``Config``.

        Returns:
            For a streaming response: the decoded event whose ``Progress`` is
            ``"100"``, or ``{}`` if the stream ends without one. For a
            non-streaming response: the decoded ``resp.data``. ``None`` on any
            error -- every exception (including the ``ValueError`` for a
            missing file argument) is logged and swallowed here.
        """
        try:
            req = models.ReconstructDocumentSSERequest()

            params = {
                "FileType": file_type,
                "FileStartPageNumber": file_start_page,
                "FileEndPageNumber": file_end_page,
            }

            # Either FileUrl or FileBase64 must be supplied; when both are
            # given only FileUrl is sent.
            if file_url:
                params["FileUrl"] = file_url
                logging.info(f"[TCADP] Using file URL: {file_url}")
            elif file_base64:
                params["FileBase64"] = file_base64
                logging.info(f"[TCADP] Using Base64 data, length: {len(file_base64)} characters")
            else:
                raise ValueError("Must provide either FileUrl or FileBase64 parameter")

            if config:
                params["Config"] = config

            req.from_json_string(json.dumps(params))
            resp = self.client.ReconstructDocumentSSE(req)

            if isinstance(resp, types.GeneratorType):
                logging.info("[TCADP] Detected streaming response")
                return self._consume_stream(resp)

            logging.info("[TCADP] Detected non-streaming response")
            if not (hasattr(resp, 'data') and resp.data):
                logging.error("[TCADP] No data in response")
                return None
            try:
                parser_result = json.loads(resp.data)
            except json.JSONDecodeError as e:
                logging.error(f"[TCADP] JSON parsing failed: {e}")
                return None
            logging.info(f"[TCADP] JSON parsing successful: {parser_result}")
            return parser_result

        except TencentCloudSDKException as err:
            logging.error(f"[TCADP] Tencent Cloud SDK error: {err}")
            return None
        except Exception as e:
            logging.error(f"[TCADP] Unknown error: {e}")
            logging.error(f"[TCADP] Error stack trace: {traceback.format_exc()}")
            return None

    def _consume_stream(self, events):
        """Read streamed events until the one reporting ``Progress == "100"``.

        Events without ``data`` and events whose ``data`` is not valid JSON
        are logged and skipped. Returns the final decoded event, or ``{}`` if
        the stream is exhausted first.
        """
        for event in events:
            logging.info(f"[TCADP] Received event: {event}")
            if not event.get('data'):
                logging.info(f"[TCADP] Event without data: {event}")
                continue

            try:
                data_dict = json.loads(event['data'])
            except json.JSONDecodeError as e:
                logging.error(f"[TCADP] Failed to parse JSON data: {e}")
                logging.error(f"[TCADP] Raw data: {event.get('data')}")
                continue
            logging.info(f"[TCADP] Parsed data: {data_dict}")

            if data_dict.get('Progress') == "100":
                self._log_completion(data_dict)
                return data_dict  # final result found; stop reading

            progress = data_dict.get("Progress", "0")
            logging.info(f"[TCADP] Progress: {progress}%")
        return {}

    @staticmethod
    def _log_completion(data_dict):
        """Log the summary fields of the final parsing event."""
        logging.info("[TCADP] Document parsing completed!")
        logging.info(f"[TCADP] Task ID: {data_dict.get('TaskId')}")
        logging.info(f"[TCADP] Success pages: {data_dict.get('SuccessPageNum')}")
        logging.info(f"[TCADP] Failed pages: {data_dict.get('FailPageNum')}")

        failed_pages = data_dict.get("FailedPages", [])
        if failed_pages:
            logging.warning("[TCADP] Failed parsing pages:")
            for page in failed_pages:
                logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")

        download_url = data_dict.get("DocumentRecognizeResultUrl")
        if download_url:
            logging.info(f"[TCADP] Got download link: {download_url}")
        else:
            logging.warning("[TCADP] No download link obtained")

    def download_result_file(self, download_url, output_dir, timeout=(10, 300)):
        """Download the parsing result archive into ``output_dir``.

        Args:
            download_url: URL of the result file; falsy means nothing to fetch.
            output_dir: Directory to write into (created if missing).
            timeout: Forwarded to ``requests.get``; a ``(connect, read)`` tuple
                or a single number of seconds. Without it a stalled server
                would block the caller indefinitely.

        Returns:
            Path of the saved ``tcadp_result_<timestamp>.zip`` file, or ``None``
            when there is no URL or the HTTP request fails.
        """
        if not download_url:
            logging.warning("[TCADP] No downloadable result file")
            return None

        try:
            response = requests.get(download_url, timeout=timeout)
            response.raise_for_status()

            os.makedirs(output_dir, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_path = os.path.join(output_dir, f"tcadp_result_{timestamp}.zip")

            with open(file_path, "wb") as f:
                f.write(response.content)

            logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
            return file_path

        except requests.exceptions.RequestException as e:
            logging.error(f"[TCADP] Failed to download file: {e}")
            return None
193
+
194
+
195
class TCADPParser(RAGFlowPdfParser):
    """PDF parser that delegates document reconstruction to Tencent Cloud.

    The document is sent as Base64 through ``TencentCloudAPIClient``, the
    result archive is downloaded and unpacked, and its JSON/Markdown content
    is converted into ``(text, position_tag)`` sections and HTML tables.
    """

    def __init__(
        self,
        model_cfg: PdfModelConfig,
        tokenizer_cfg: TokenizerConfig,
        secret_id: Optional[str] = None,
        secret_key: Optional[str] = None,
        region: str = "ap-guangzhou",
        table_result_type: Optional[str] = None,
        markdown_image_response_type: Optional[str] = None,
    ):
        """Resolve credentials and API options from arguments and ``tcadp_config``.

        Explicit arguments win over values from ``tcadp_config``; the two
        result-type options fall back to ``"1"``.

        Raises:
            ValueError: if no secret id or secret key could be resolved.
        """
        super().__init__(model_cfg=model_cfg, tokenizer_cfg=tokenizer_cfg)

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")

        file_cfg = self._load_file_config()

        self.secret_id = secret_id or file_cfg.get("secret_id")
        self.secret_key = secret_key or file_cfg.get("secret_key")
        # NOTE(review): ``region`` defaults to a non-empty string, so the
        # configured region is only used when the caller passes None/"".
        self.region = region or file_cfg.get("region", "ap-guangzhou")
        self.table_result_type = table_result_type if table_result_type is not None else file_cfg.get("table_result_type", "1")
        self.markdown_image_response_type = (
            markdown_image_response_type if markdown_image_response_type is not None else file_cfg.get("markdown_image_response_type", "1")
        )

        self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")

        if not self.secret_id or not self.secret_key:
            raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")

    def _load_file_config(self) -> dict:
        """Return the ``tcadp_config`` mapping, or ``{}`` when it is missing or unusable."""
        try:
            tcadp_parser = get_base_config("tcadp_config", {})
        except ImportError:
            self.logger.info("[TCADP] Configuration module import failed")
            return {}
        if isinstance(tcadp_parser, dict) and tcadp_parser:
            return tcadp_parser
        self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
        return {}

    def check_installation(self) -> bool:
        """Return True when credentials are set and an API client can be constructed."""
        try:
            if not self.secret_id or not self.secret_key:
                self.logger.error("[TCADP] Tencent Cloud API configuration incomplete")
                return False

            # Constructing the client is the only verification performed.
            TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)
            self.logger.info("[TCADP] Tencent Cloud API configuration check passed")
            return True
        except Exception as e:
            self.logger.error(f"[TCADP] Tencent Cloud API configuration check failed: {e}")
            return False

    def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
        """Return the Base64 text of ``binary`` if given, else of the file at ``file_path``."""
        if isinstance(binary, BytesIO):
            binary = binary.getvalue()
        if not binary:
            with open(file_path, 'rb') as f:
                binary = f.read()
        return base64.b64encode(binary).decode('utf-8')

    def _extract_content_from_zip(self, zip_path: str) -> list[dict[str, Any]]:
        """Collect content blocks from the ``.json`` and ``.md`` members of a ZIP.

        JSON lists are flattened into the result, other JSON values are
        appended as-is, and each Markdown file becomes one ``text`` block.
        Any failure is logged and whatever was gathered so far is returned.
        """
        results = []

        try:
            with zipfile.ZipFile(zip_path, "r") as zip_file:
                names = zip_file.namelist()

                for json_file in (n for n in names if n.endswith(".json")):
                    with zip_file.open(json_file) as f:
                        data = json.load(f)
                    if isinstance(data, list):
                        results.extend(data)
                    else:
                        results.append(data)

                for md_file in (n for n in names if n.endswith(".md")):
                    with zip_file.open(md_file) as f:
                        content = f.read().decode("utf-8")
                    results.append({"type": "text", "content": content, "file": md_file})

        except Exception as e:
            self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")

        return results

    def _parse_content_to_sections(self, content_data: list[dict[str, Any]]) -> list[tuple[str, str]]:
        """Convert content blocks into ``(text, position_tag)`` sections.

        Blocks that are not dicts, have empty ``content`` or produce only
        whitespace are skipped. Every section gets the same placeholder tag.
        """
        sections = []

        for item in content_data:
            if not isinstance(item, dict):
                continue  # the archive's JSON may hold arbitrary values

            content_type = item.get("type", "text")
            content = item.get("content", "")
            if not content:
                continue

            if content_type == "table":
                table_data = item.get("table_data", {})
                if isinstance(table_data, dict):
                    rows = table_data.get("rows", [])
                    # Cells may be numbers/None in JSON; stringify before joining.
                    section_text = "\n".join(" | ".join(str(cell) for cell in row) for row in rows)
                else:
                    section_text = str(table_data)
            elif content_type == "image":
                caption = item.get("caption", "")
                section_text = f"[Image] {caption}" if caption else "[Image]"
            elif content_type == "equation":
                section_text = f"$${content}$$"
            else:
                # "text", "paragraph" and any unknown type pass through.
                section_text = content if isinstance(content, str) else str(content)

            if section_text.strip():
                # Simplified, constant position tag.
                position_tag = "@@1\t0.0\t1000.0\t0.0\t100.0##"
                sections.append((section_text, position_tag))

        return sections

    def _parse_content_to_tables(self, content_data: list[dict[str, Any]]) -> list:
        """Render every ``table`` block with rows as an HTML ``<table>`` string.

        The first row is emitted with ``<th>`` cells, the rest with ``<td>``.
        """
        tables = []

        for item in content_data:
            if not isinstance(item, dict) or item.get("type") != "table":
                continue
            table_data = item.get("table_data", {})
            if not isinstance(table_data, dict):
                continue
            rows = table_data.get("rows", [])
            if not rows:
                continue

            # NOTE(review): cell text is inserted without HTML escaping.
            parts = ["<table>\n"]
            for i, row in enumerate(rows):
                tag = "th" if i == 0 else "td"
                parts.append(" <tr>\n")
                parts.extend(f" <{tag}>{cell}</{tag}>\n" for cell in row)
                parts.append(" </tr>\n")
            parts.append("</table>")
            tables.append("".join(parts))

        return tables

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes,
        callback: Optional[Callable] = None,
        *,
        output_dir: Optional[str] = None,
        file_type: str = "PDF",
        file_start_page: Optional[int] = 1,
        file_end_page: Optional[int] = 1000,
        delete_output: Optional[bool] = True,
        max_retries: Optional[int] = 1,
    ) -> tuple:
        """Parse a PDF through the Tencent Cloud API.

        Args:
            filepath: Path of the PDF; only read when ``binary`` is empty.
            binary: PDF content as bytes or ``BytesIO``.
            callback: Optional ``callback(progress, message)``; ``-1`` marks failure.
            output_dir: Directory for the downloaded archive; a temporary
                directory is created when omitted.
            file_type: Forwarded to the API as ``FileType``.
            file_start_page: First page to parse.
            file_end_page: Last page to parse.
            delete_output: Remove the temporary output directory afterwards
                (a caller-supplied ``output_dir`` is never removed).
            max_retries: Number of API attempts; ``None`` or values below 1
                are treated as one attempt.

        Returns:
            ``(sections, tables)``.

        Raises:
            FileNotFoundError: no ``binary`` given and ``filepath`` is missing.
            RuntimeError: the API returned no result, no download link, or
                the download failed.
        """
        temp_path = None
        out_dir = None
        created_tmp_dir = False

        # The signature accepts BytesIO, but file.write() and b64encode()
        # both need a bytes-like object.
        if isinstance(binary, BytesIO):
            binary = binary.getvalue()

        attempts = 1 if max_retries is None else max(1, int(max_retries))

        try:
            if binary:
                tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
                temp_path = tmp.name  # recorded first so cleanup runs even if write fails
                with tmp:
                    tmp.write(binary)
                file_path = temp_path
                self.logger.info(f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
                if callback:
                    callback(0.1, f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
            else:
                file_path = str(filepath)
                if not os.path.exists(file_path):
                    if callback:
                        callback(-1, f"[TCADP] PDF file does not exist: {file_path}")
                    raise FileNotFoundError(f"[TCADP] PDF file does not exist: {file_path}")

            if callback:
                callback(0.2, "[TCADP] Converting file to Base64 format")

            file_base64 = self._file_to_base64(file_path, binary)
            if callback:
                callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")

            client = TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)

            if callback:
                callback(0.3, "[TCADP] Starting to call Tencent Cloud document parsing API")

            config = {
                "TableResultType": self.table_result_type,
                "MarkdownImageResponseType": self.markdown_image_response_type
            }

            result = None
            for attempt in range(attempts):
                try:
                    if attempt > 0:
                        self.logger.info(f"[TCADP] Retry attempt {attempt + 1}")
                        if callback:
                            callback(0.3 + attempt * 0.1, f"[TCADP] Retry attempt {attempt + 1}")
                        time.sleep(2 ** attempt)  # exponential backoff

                    self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")

                    result = client.reconstruct_document_sse(
                        file_type=file_type,
                        file_base64=file_base64,
                        file_start_page=file_start_page,
                        file_end_page=file_end_page,
                        config=config
                    )

                    if result:
                        self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
                        break
                    self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")

                except Exception as e:
                    self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
                    if attempt == attempts - 1:
                        raise

            if not result:
                error_msg = f"[TCADP] Document parsing failed, retried {attempts} times"
                self.logger.error(error_msg)
                if callback:
                    callback(-1, error_msg)
                raise RuntimeError(error_msg)

            download_url = result.get("DocumentRecognizeResultUrl")
            if not download_url:
                if callback:
                    callback(-1, "[TCADP] No parsing result download link obtained")
                raise RuntimeError("[TCADP] No parsing result download link obtained")

            if callback:
                callback(0.6, f"[TCADP] Parsing result download link: {download_url}")

            if output_dir:
                out_dir = Path(output_dir)
                out_dir.mkdir(parents=True, exist_ok=True)
            else:
                out_dir = Path(tempfile.mkdtemp(prefix="adp_pdf_"))
                created_tmp_dir = True

            zip_path = client.download_result_file(download_url, str(out_dir))
            if not zip_path:
                if callback:
                    callback(-1, "[TCADP] Failed to download parsing result")
                raise RuntimeError("[TCADP] Failed to download parsing result")

            if callback:
                # Report only the file name, not the full path.
                zip_filename = os.path.basename(zip_path)
                callback(0.8, f"[TCADP] Parsing result downloaded: {zip_filename}")

            content_data = self._extract_content_from_zip(zip_path)
            self.logger.info(f"[TCADP] Extracted {len(content_data)} content blocks")

            if callback:
                callback(0.9, f"[TCADP] Extracted {len(content_data)} content blocks")

            sections = self._parse_content_to_sections(content_data)
            tables = self._parse_content_to_tables(content_data)

            self.logger.info(f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")

            if callback:
                callback(1.0, f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")

            return sections, tables

        finally:
            # Best-effort cleanup: failures are logged, never raised.
            if temp_path and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except Exception as e:
                    self.logger.debug(f"[TCADP] Could not remove temporary file: {e}")

            if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
                try:
                    shutil.rmtree(out_dir)
                except Exception as e:
                    self.logger.debug(f"[TCADP] Could not remove temporary directory: {e}")
522
+
523
+
524
if __name__ == "__main__":
    # Manual smoke test: build the parser from environment-derived configs
    # and report whether the Tencent Cloud configuration is usable.
    model_cfg = PdfModelConfig.from_env()
    tokenizer_cfg = TokenizerConfig.from_env()
    parser = TCADPParser(model_cfg=model_cfg, tokenizer_cfg=tokenizer_cfg)
    print("ADP available:", parser.check_installation())

    # Set this to a local PDF path to exercise an actual parse.
    filepath = ""
    if filepath and os.path.exists(filepath):
        with open(filepath, "rb") as file:
            pdf_bytes = file.read()
        sections, tables = parser.parse_pdf(filepath=filepath, binary=pdf_bytes)
        print(f"Parsing result: {len(sections)} sections, {len(tables)} tables")
        # Preview at most the first three sections.
        for number, (section, tag) in enumerate(sections[:3], start=1):
            print(f"Section {number}: {section[:100]}...")
@@ -0,0 +1,64 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import re
18
+
19
+ from deepdoc.parser.utils import get_text
20
+ from ..common.token_utils import num_tokens_from_string
21
+
22
+
23
class RAGFlowTxtParser:
    """Split plain text into chunks bounded by a token budget."""

    def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Load text via ``get_text(fnm, binary)`` and chunk it with ``parser_txt``."""
        txt = get_text(fnm, binary)
        return self.parser_txt(txt, chunk_token_num, delimiter)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Split ``txt`` on delimiters and pack the pieces into chunks.

        Args:
            txt: Text to split.
            chunk_token_num: Once the current chunk's token count exceeds
                this value, the next piece starts a new chunk.
            delimiter: Each character is a delimiter; a run wrapped in
                backticks (e.g. ``"`##`"``) is one multi-character delimiter.

        Returns:
            A list of ``[chunk_text, ""]`` pairs. Delimiters themselves are
            not included in the chunk text.

        Raises:
            TypeError: if ``txt`` is not a ``str``.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be str!")
        cks = [""]
        tk_nums = [0]
        # Interpret backslash escapes written literally in the delimiter
        # string; the latin1/utf-8 round trip restores non-ASCII characters.
        delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')

        def add_chunk(t):
            tnum = num_tokens_from_string(t)
            if tk_nums[-1] > chunk_token_num:
                cks.append(t)
                tk_nums.append(tnum)
            else:
                cks[-1] += t
                tk_nums[-1] += tnum

        dels = []
        s = 0
        for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
            f, t = m.span()
            # The backticked token goes before the single characters that
            # precede it, so it is tried first in the alternation.
            dels.append(m.group(1))
            dels.extend(list(delimiter[s: f]))
            s = t
        if s < len(delimiter):
            dels.extend(list(delimiter[s:]))
        dels = "|".join(re.escape(d) for d in dels if d)

        # "^a|b$" would anchor only the first and last alternatives because
        # "|" has the lowest precedence; fullmatch tests the whole piece.
        delimiter_re = re.compile(dels)
        for sec in re.split(r"(%s)" % dels, txt):
            if delimiter_re.fullmatch(sec):
                continue
            add_chunk(sec)

        return [[c, ""] for c in cks]
@@ -0,0 +1,33 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from ..depend.find_codec import find_codec
18
+
19
+
20
def get_text(fnm: str, binary=None) -> str:
    """Return text decoded from ``binary`` or, if that is empty, read from ``fnm``.

    A non-empty ``binary`` is decoded with the codec chosen by ``find_codec``,
    ignoring undecodable bytes. Otherwise the file at ``fnm`` is opened in
    text mode with the default encoding and returned in full.
    """
    if binary:
        codec = find_codec(binary)
        return binary.decode(codec, errors="ignore")

    with open(fnm, "r") as handle:
        return handle.read()
33
+