deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,469 @@
1
+ import base64
2
+ import io
3
+ import json
4
+ import os
5
+ import uuid
6
+ from abc import ABC
7
+ from io import BytesIO
8
+ from urllib.parse import urljoin
9
+
10
+ import requests
11
+ from openai import OpenAI
12
+ from openai.lib.azure import AzureOpenAI
13
+ from PIL import Image
14
+ from zhipuai import ZhipuAI
15
+
16
+ # 可选导入 ollama
17
+ try:
18
+ from ollama import Client
19
+ OLLAMA_AVAILABLE = True
20
+ except ImportError:
21
+ OLLAMA_AVAILABLE = False
22
+ Client = None
23
+
24
+ # 修复导入路径问题
25
+ try:
26
+ from .prompts import vision_llm_describe_prompt
27
+ except ImportError:
28
+ try:
29
+ from ..depend.prompts import vision_llm_describe_prompt
30
+ except ImportError:
31
+ # 如果都失败,提供默认提示词
32
+ vision_llm_describe_prompt = """请详细描述这张图片的内容,包括:
33
+ 1. 图片中的主要对象和场景
34
+ 2. 任何可见的文字内容
35
+ 3. 图片的布局和结构
36
+ 4. 如果有表格,请描述表格的结构和内容
37
+ 5. 如果有图表,请描述图表的数据和趋势
38
+
39
+ 请用中文回答,描述要准确、详细。"""
40
+
41
+
42
class Base(ABC):
    """Shared state and helpers for the vision-model clients in this module.

    Stores the API key, the model name and the prompt language, and provides
    image encoding plus default-prompt selection.
    """

    def __init__(self, key, model_name, lang="Chinese"):
        self.key = key
        self.model_name = model_name
        self.lang = lang

    def image2base64(self, binary):
        """Encode raw image bytes as a base64 text string.

        Args:
            binary: a ``BytesIO`` (read from its beginning), ``bytes`` or
                ``bytearray`` holding the encoded image.

        Returns:
            The base64 representation decoded as UTF-8 text.

        Raises:
            ValueError: if ``binary`` is any other type.
        """
        if isinstance(binary, BytesIO):
            binary.seek(0)
            img_data = binary.read()
        elif isinstance(binary, (bytes, bytearray)):
            img_data = binary
        else:
            raise ValueError("binary must be BytesIO or bytes")

        return base64.b64encode(img_data).decode('utf-8')

    def prompt(self, image):
        """Return the default prompt text; delegates to ``vision_llm_prompt``."""
        return self.vision_llm_prompt(image)

    def vision_llm_prompt(self, image):
        """Return the default description prompt for the configured language.

        ``image`` is accepted for interface compatibility and is not used.
        """
        if self.lang.lower() == "chinese":
            # The module's import fallback binds this name to a plain string,
            # while this method historically invoked it as a function.
            # Support both so the fallback path no longer raises TypeError.
            template = vision_llm_describe_prompt
            prompt_text = template() if callable(template) else template
        else:
            prompt_text = "Please describe this image in detail."

        return prompt_text
72
+
73
+
74
class GptV4(Base):
    """Vision client for OpenAI chat-completions models."""

    _FACTORY_NAME = "OpenAI"

    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", **kwargs):
        """Create the OpenAI client.

        Args:
            key: API key.
            model_name: chat model used for image description.
            lang: prompt language selector (see ``Base.vision_llm_prompt``).
            **kwargs: optional ``base_url`` pointing at an OpenAI-compatible
                endpoint; other keys are ignored.
        """
        super().__init__(key, model_name, lang)
        # The factory forwards a configured base_url; previously it was
        # dropped here. None keeps the SDK's default endpoint.
        self.client = OpenAI(api_key=key, base_url=kwargs.get("base_url") or None)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured OpenAI model.

        Args:
            image: ``BytesIO`` or ``bytes`` of the image; it is sent as a
                ``data:image/jpeg`` URL.
            prompt: instruction text; the default prompt is used when falsy.

        Returns:
            The stripped text of the first choice, or a string starting with
            ``**ERROR**: `` if anything raised.
        """
        try:
            base64_image = self.image2base64(image)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt or self.prompt(image)
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=1000
            )

            return response.choices[0].message.content.strip()
        except Exception as e:
            # Errors are reported in-band as text rather than raised.
            return "**ERROR**: " + str(e)
113
+
114
+
115
class QWenCV(Base):
    """Vision client for Tongyi-Qianwen via its OpenAI-compatible endpoint."""

    _FACTORY_NAME = "Tongyi-Qianwen"

    # Endpoint used when the caller does not supply a base_url.
    _DEFAULT_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

    def __init__(self, key, model_name="qwen-vl-max", lang="Chinese", **kwargs):
        """Create the client in OpenAI-compatible mode.

        Args:
            key: API key.
            model_name: vision model name.
            lang: prompt language selector.
            **kwargs: optional ``base_url`` overriding the default
                compatible-mode endpoint; other keys are ignored.
        """
        super().__init__(key, model_name, lang)
        # Honour a configured base_url instead of always using the default.
        self.client = OpenAI(
            api_key=key,
            base_url=kwargs.get("base_url") or self._DEFAULT_BASE_URL,
        )

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured Qwen vision model.

        Args:
            image: ``BytesIO`` or ``bytes`` of the image; it is sent as a
                ``data:image/jpeg`` URL.
            prompt: instruction text; the default prompt is used when falsy.

        Returns:
            The stripped text of the first choice, or a string starting with
            ``**ERROR**: `` if anything raised.
        """
        try:
            base64_image = self.image2base64(image)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt or self.prompt(image)
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=1000
            )

            return response.choices[0].message.content.strip()
        except Exception as e:
            # Errors are reported in-band as text rather than raised.
            return "**ERROR**: " + str(e)
158
+
159
+
160
class Zhipu4V(Base):
    """Vision client backed by the ZhipuAI SDK."""

    _FACTORY_NAME = "ZhipuAI"

    def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
        super().__init__(key, model_name, lang)
        self.client = ZhipuAI(api_key=key)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured ZhipuAI vision model.

        Returns the stripped text of the first choice, or a string starting
        with ``**ERROR**: `` if anything raised.
        """
        try:
            encoded = self.image2base64(image)

            text_part = {
                "type": "text",
                "text": prompt or self.prompt(image),
            }
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
            }
            conversation = [{"role": "user", "content": [text_part, image_part]}]

            reply = self.client.chat.completions.create(
                model=self.model_name,
                messages=conversation,
                max_tokens=1000,
            )
            first_choice = reply.choices[0]
            return first_choice.message.content.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
197
+
198
+
199
class OllamaCV(Base):
    """Vision client for a local or remote Ollama server.

    Unlike the other clients in this module, the describe methods return a
    ``(text, token_count)`` tuple; that shape is kept for compatibility.
    """

    _FACTORY_NAME = "Ollama"

    def __init__(self, key, model_name, lang="Chinese", **kwargs):
        """Create the Ollama client.

        Args:
            key: stored for interface parity; it is not passed to the client.
            model_name: name of the Ollama model to run.
            lang: prompt language selector.
            **kwargs: optional ``base_url`` of the Ollama host
                (default ``http://localhost:11434``).

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        if not OLLAMA_AVAILABLE:
            raise ImportError("Ollama客户端未安装,请运行: pip install ollama")
        # Initialise the shared fields through Base so self.key exists too.
        super().__init__(key, model_name, lang)
        self.client = Client(host=kwargs.get("base_url", "http://localhost:11434"))

    @staticmethod
    def _prompt_text(prompt):
        """Return the plain prompt text from a string or a message list.

        ``Base`` produces a plain string. The message-list shape
        (``[{"content": [_, {"text": ...}]}]``) that this class previously
        indexed into is still accepted.
        """
        if isinstance(prompt, str):
            return prompt
        return prompt[0]["content"][1]["text"]

    @staticmethod
    def _build_options(gen_conf):
        """Copy the supported sampling settings from ``gen_conf``."""
        options = {}
        # top_p used to be written into top_k, which is a different
        # (integer) Ollama option; each key now maps onto itself.
        for name in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
            if name in gen_conf:
                options[name] = gen_conf[name]
        return options

    def _generate(self, prompt, image):
        """Run one ``generate`` call and return ``(text, 128)`` or ``(error, 0)``."""
        try:
            response = self.client.generate(
                model=self.model_name,
                prompt=self._prompt_text(prompt),
                images=[image],
            )
            ans = response["response"].strip()
            # 128 is the fixed token count the original code reported.
            return ans, 128
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def describe(self, image):
        """Describe ``image`` with the default prompt.

        Returns:
            ``(text, 128)`` on success, ``("**ERROR**: ...", 0)`` on failure.
        """
        return self._generate(self.prompt(""), image)

    def describe_with_prompt(self, image, prompt=None):
        """Describe ``image`` with ``prompt``, or the default prompt when falsy.

        Returns:
            ``(text, 128)`` on success, ``("**ERROR**: ...", 0)`` on failure.
        """
        # Base.vision_llm_prompt takes no prompt argument, so a caller-supplied
        # prompt is used directly instead of being passed through it.
        return self._generate(prompt if prompt else self.vision_llm_prompt(""), image)

    def chat(self, system, history, gen_conf, image=""):
        """Send ``history`` to the model and return ``(answer, token_count)``.

        ``history`` is modified in place: the system text is merged into the
        last message and ``image`` is attached to every user message.
        On failure returns ``("**ERROR**: ...", 0)``.
        """
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        try:
            for his in history:
                if his["role"] == "user":
                    his["images"] = [image]
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                options=self._build_options(gen_conf),
                keep_alive=-1,
            )

            ans = response["message"]["content"].strip()
            return ans, response["eval_count"] + response.get("prompt_eval_count", 0)
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        """Stream a chat answer.

        Yields the accumulated answer text after each chunk. When a chunk is
        marked done, the token count is yielded before that chunk's text.
        An error is yielded as text appended to the partial answer. A final
        ``0`` is always yielded last. ``history`` is modified in place as in
        ``chat``.
        """
        if system:
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        for his in history:
            if his["role"] == "user":
                his["images"] = [image]
        options = self._build_options(gen_conf)
        ans = ""
        try:
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                stream=True,
                options=options,
                keep_alive=-1,
            )
            for resp in response:
                if resp["done"]:
                    yield resp.get("prompt_eval_count", 0) + resp.get("eval_count", 0)
                ans += resp["message"]["content"]
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)
        yield 0
297
+
298
+
299
class GeminiCV(Base):
    """Vision client backed by ``google.generativeai``."""

    _FACTORY_NAME = "Google-Gemini"

    def __init__(self, key, model_name="gemini-pro-vision", lang="Chinese", **kwargs):
        super().__init__(key, model_name, lang)
        # Imported here so the package is only needed when this client is used.
        import google.generativeai as genai
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(model_name)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured Gemini model.

        ``image`` must be ``BytesIO`` or ``bytes``; it is opened as a PIL
        image and sent together with the prompt text. Returns the stripped
        response text, or a string starting with ``**ERROR**: `` if anything
        raised (including the type check).
        """
        try:
            if isinstance(image, bytes):
                raw = image
            elif isinstance(image, BytesIO):
                image.seek(0)
                raw = image.read()
            else:
                raise ValueError("image must be BytesIO or bytes")

            picture = Image.open(BytesIO(raw))
            instruction = prompt or self.prompt(image)
            reply = self.model.generate_content([instruction, picture])
            return reply.text.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
329
+
330
+
331
class AnthropicCV(Base):
    """Vision client backed by the Anthropic SDK."""

    _FACTORY_NAME = "Anthropic"

    def __init__(self, key, model_name="claude-3-sonnet-20240229", lang="Chinese", **kwargs):
        super().__init__(key, model_name, lang)
        # Imported here so the package is only needed when this client is used.
        import anthropic
        self.client = anthropic.Anthropic(api_key=key)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured Claude model.

        The image is sent as base64 with media type ``image/jpeg``. Returns
        the stripped text of the first content item, or a string starting
        with ``**ERROR**: `` if anything raised.
        """
        try:
            encoded = self.image2base64(image)

            text_part = {
                "type": "text",
                "text": prompt or self.prompt(image),
            }
            image_part = {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": encoded,
                },
            }

            reply = self.client.messages.create(
                model=self.model_name,
                max_tokens=1000,
                messages=[{"role": "user", "content": [text_part, image_part]}],
            )
            first_item = reply.content[0]
            return first_item.text.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
371
+
372
+
373
class VisionModelFactory:
    """Factory that builds a vision-model client from a configuration mapping."""

    _PROVIDERS = {
        "openai": GptV4,
        "qwen": QWenCV,
        "zhipu": Zhipu4V,
        "ollama": OllamaCV,
        "gemini": GeminiCV,
        "anthropic": AnthropicCV,
    }

    # Model used when the configuration leaves model_name empty. Ollama has
    # no entry: whatever model_name was configured is passed through as is.
    _DEFAULT_MODELS = {
        "openai": "gpt-4-vision-preview",
        "qwen": "qwen-vl-max",
        "zhipu": "glm-4v",
        "gemini": "gemini-pro-vision",
        "anthropic": "claude-3-sonnet-20240229",
    }

    @classmethod
    def create_model(cls, config):
        """Create a client instance from a configuration mapping.

        Args:
            config: mapping with optional keys ``provider`` (default
                ``"openai"``), ``model_name``, ``api_key``, ``lang``
                (default ``"Chinese"``) and ``base_url``.

        Returns:
            An instance of the class registered for the provider.

        Raises:
            ValueError: if the provider is not registered.
        """
        # A key present with value None (e.g. "provider: null" in YAML) is
        # treated like a missing key instead of crashing on .lower().
        provider = config.get("provider")
        provider = "openai" if provider is None else provider.lower()
        model_name = config.get("model_name", "")
        api_key = config.get("api_key", "")
        lang = config.get("lang")
        if lang is None:
            lang = "Chinese"
        base_url = config.get("base_url", "")

        if provider not in cls._PROVIDERS:
            raise ValueError(f"不支持的提供商: {provider}")

        model_class = cls._PROVIDERS[provider]

        # Fall back to the provider's default model name when none is given.
        if not model_name:
            model_name = cls._DEFAULT_MODELS.get(provider, model_name)

        kwargs = {"lang": lang}
        if base_url:
            kwargs["base_url"] = base_url

        return model_class(api_key, model_name, **kwargs)

    @classmethod
    def create_from_env(cls):
        """Create a client from the ``DEEPDOC_VISION_*`` environment variables."""
        config = {
            "provider": os.getenv("DEEPDOC_VISION_PROVIDER", "openai"),
            "model_name": os.getenv("DEEPDOC_VISION_MODEL", ""),
            "api_key": os.getenv("DEEPDOC_VISION_API_KEY", ""),
            "lang": os.getenv("DEEPDOC_VISION_LANG", "Chinese"),
            "base_url": os.getenv("DEEPDOC_VISION_BASE_URL", ""),
        }
        return cls.create_model(config)

    @classmethod
    def create_from_config_file(cls, config_file="deepdoc_config.yaml"):
        """Create a client from the ``vision_model`` section of a YAML file.

        Raises:
            ValueError: for any failure while importing ``yaml``, reading or
                parsing the file, or creating the model. The original error
                is attached as ``__cause__``.
        """
        try:
            import yaml
            with open(config_file, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
            vision_config = config.get("vision_model", {})
            return cls.create_model(vision_config)
        except Exception as e:
            raise ValueError(f"Failed to load config from {config_file}: {e}") from e
441
+
442
+
443
def create_vision_model(config=None):
    """Create a vision model from one of several configuration forms.

    Args:
        config: one of
            * ``None`` - read the configuration from environment variables;
            * ``str`` - a registered provider name (case-insensitive),
              otherwise a path to a configuration file;
            * ``dict`` - a configuration mapping passed to the factory.

    Returns:
        The client instance built by ``VisionModelFactory``.

    Raises:
        ValueError: for an unsupported ``config`` type, or for a string that
            is neither a provider name nor a loadable configuration file.
    """
    if config is None:
        # Form 1: environment variables.
        return VisionModelFactory.create_from_env()

    if isinstance(config, str):
        # Form 2: a provider name ...
        if config.lower() in VisionModelFactory._PROVIDERS:
            return VisionModelFactory.create_model({"provider": config.lower()})
        # ... or an explicit configuration file path, whose loading errors
        # propagate unchanged.
        if config.endswith(('.yaml', '.yml', '.json')):
            return VisionModelFactory.create_from_config_file(config)
        # Anything else is tried as a file path. Catch only Exception so that
        # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
        try:
            return VisionModelFactory.create_from_config_file(config)
        except Exception as e:
            raise ValueError(f"无效的配置: {config}。支持的提供商: {list(VisionModelFactory._PROVIDERS.keys())}") from e

    if isinstance(config, dict):
        # Form 3: configuration mapping.
        return VisionModelFactory.create_model(config)

    raise ValueError(f"不支持的配置类型: {type(config)}")
@@ -0,0 +1,91 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """
18
+ Simple Chinese surname detection utility for resume parsing.
19
+ """
20
+
21
+ # Common Chinese surnames (most frequent ones)
22
CHINESE_SURNAMES = set(
    # Single-character surnames: every character below is one entry.
    "王李张刘陈杨赵黄周吴"
    "徐孙胡朱高林何郭马罗"
    "梁宋郑谢韩唐冯于董萧"
    "程曹袁邓许傅沈曾彭吕"
    "苏卢蒋蔡贾丁魏薛叶阎"
    "余潘杜戴夏钟汪田任姜"
    "范方石姚谭廖邹熊金陆"
    "郝孔白崔康毛邱秦江史"
    "顾侯邵孟龙万段漕钱汤"
    "尹黎易常武乔贺赖龚文"
) | {
    # Two-character compound surnames; entries the original literal listed
    # twice appear once here, which yields the same set.
    '欧阳', '太史', '端木', '上官', '司马', '东方', '公孙', '万俟', '闻人',
    '夏侯', '诸葛', '尉迟', '公西', '澹台', '赫连', '皇甫', '宗政', '濮阳',
    '公冶', '太叔', '申屠', '慕容', '仲孙', '钟离', '长孙', '司徒',
    '鲜于', '司空', '宇文',
}
41
+
42
+
43
class SurnameChecker:
    """Checks whether a string begins with a known Chinese surname."""

    def __init__(self):
        self.surnames = CHINESE_SURNAMES

    def isit(self, text: str) -> bool:
        """
        Tell whether ``text`` starts with a surname from ``self.surnames``.

        Args:
            text: Candidate string; surrounding whitespace is ignored

        Returns:
            bool: True when the first one or two characters form a known
            surname; False for empty or non-string input
        """
        if not isinstance(text, str) or not text:
            return False

        candidate = text.strip()

        # Try the one-character prefix first, then the two-character one,
        # skipping any prefix longer than the stripped text.
        prefixes = (candidate[:size] for size in (1, 2) if len(candidate) >= size)
        return any(prefix in self.surnames for prefix in prefixes)
75
+
76
+
77
+ # Global instance for backward compatibility
78
surname = SurnameChecker()


def is_chinese_surname(text: str) -> bool:
    """
    Report whether ``text`` begins with a Chinese surname.

    Thin wrapper around the module-level ``surname`` checker.

    Args:
        text: String to inspect

    Returns:
        bool: Result of ``surname.isit(text)``
    """
    verdict = surname.isit(text)
    return verdict
@@ -0,0 +1,73 @@
1
+ import asyncio
2
+ import queue
3
+ import trio
4
+ import threading
5
+ from functools import wraps
6
+ from typing import Any, Callable, Coroutine, Optional, Type, Union
7
+
8
+ TimeoutException = Union[Type[BaseException], BaseException]
9
+ OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
10
+
11
+ def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
12
+ def decorator(func):
13
+ @wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ result_queue = queue.Queue(maxsize=1)
16
+
17
+ def target():
18
+ try:
19
+ result = func(*args, **kwargs)
20
+ result_queue.put(result)
21
+ except Exception as e:
22
+ result_queue.put(e)
23
+
24
+ thread = threading.Thread(target=target)
25
+ thread.daemon = True
26
+ thread.start()
27
+
28
+ for a in range(attempts):
29
+ try:
30
+ result = result_queue.get(timeout=seconds)
31
+ if isinstance(result, Exception):
32
+ raise result
33
+ return result
34
+ except queue.Empty:
35
+ pass
36
+ raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")
37
+
38
+ @wraps(func)
39
+ async def async_wrapper(*args, **kwargs) -> Any:
40
+ if seconds is None:
41
+ return await func(*args, **kwargs)
42
+
43
+ for a in range(attempts):
44
+ try:
45
+ with trio.fail_after(seconds):
46
+ return await func(*args, **kwargs)
47
+ except trio.TooSlowError:
48
+ if a < attempts - 1:
49
+ continue
50
+ if on_timeout is not None:
51
+ if callable(on_timeout):
52
+ result = on_timeout()
53
+ if isinstance(result, Coroutine):
54
+ return await result
55
+ return result
56
+ return on_timeout
57
+
58
+ if exception is None:
59
+ raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
60
+
61
+ if isinstance(exception, BaseException):
62
+ raise exception
63
+
64
+ if isinstance(exception, type) and issubclass(exception, BaseException):
65
+ raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
66
+
67
+ raise RuntimeError("Invalid exception type provided")
68
+
69
+ if asyncio.iscoroutinefunction(func):
70
+ return async_wrapper
71
+ return wrapper
72
+
73
+ return decorator
@@ -0,0 +1,35 @@
1
+ import io
2
+ import re
3
+
4
def clean_markdown_block(text):
    """Strip a surrounding Markdown code fence from model output.

    An opening fence is removed when it is bare (three backticks) or tagged
    ``markdown`` / ``md`` in any letter case. Fences with another language
    tag are left in place. A closing fence at the end of the text is always
    removed, and the result is stripped of surrounding whitespace.

    Args:
        text: raw text returned by the model.

    Returns:
        The text without the wrapping fence.
    """
    # The lookahead keeps a tag such as ```python from matching as a bare fence.
    text = re.sub(r'^\s*```(?:markdown|md)?(?=\s|$)\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\n?\s*```\s*$', '', text)
    return text.strip()
8
+
9
def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    """
    A simple wrapper to process image to markdown texts via VLM.

    Args:
        binary: image object exposing ``save(fp, format=...)`` (and, when
            conversion is needed, ``mode`` and ``convert``), e.g. a PIL image.
        vision_model: object exposing ``describe_with_prompt(bytes, prompt)``.
        prompt: optional prompt text forwarded to the model.
        callback: optional ``callback(progress, message)``; called with
            ``-1`` and the error text when processing fails.

    Returns:
        Simple markdown texts generated by VLM, prefixed with a newline, or
        an empty string when anything failed.
    """
    callback = callback or (lambda prog, msg: None)

    try:
        img = binary
        # JPEG cannot store alpha or palette modes; convert those to RGB
        # first so that saving does not fail.
        if getattr(img, "mode", "RGB") not in ("L", "RGB", "CMYK"):
            img = img.convert("RGB")

        img_binary = io.BytesIO()
        img.save(img_binary, format='JPEG')

        result = vision_model.describe_with_prompt(img_binary.getvalue(), prompt)
        # Some clients return (text, token_count) instead of plain text.
        if isinstance(result, tuple):
            result = result[0]

        return "\n" + clean_markdown_block(result)

    except Exception as e:
        # Best effort: report the failure and fall through to "".
        callback(-1, str(e))

    return ""
deepdoc/dict/README.md ADDED
@@ -0,0 +1,19 @@
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ ### Model Loading
6
+ ```python
7
+ import xgboost as xgb
8
+ import torch
9
+
10
+ model = xgb.Booster()
11
+ if torch.cuda.is_available():
12
+ model.set_param({"device": "cuda"})
13
+ model.load_model('InfiniFlow/text_concat_xgb_v1.0')
14
+ ```
15
+
16
+ ### Prediction
17
+ ```python
18
+ model.predict(xgb.DMatrix([feature]))[0]
19
+ ```