deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import uuid
|
|
6
|
+
from abc import ABC
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from urllib.parse import urljoin
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
from openai import OpenAI
|
|
12
|
+
from openai.lib.azure import AzureOpenAI
|
|
13
|
+
from PIL import Image
|
|
14
|
+
from zhipuai import ZhipuAI
|
|
15
|
+
|
|
16
|
+
# 可选导入 ollama
|
|
17
|
+
try:
|
|
18
|
+
from ollama import Client
|
|
19
|
+
OLLAMA_AVAILABLE = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
OLLAMA_AVAILABLE = False
|
|
22
|
+
Client = None
|
|
23
|
+
|
|
24
|
+
# 修复导入路径问题
|
|
25
|
+
try:
|
|
26
|
+
from .prompts import vision_llm_describe_prompt
|
|
27
|
+
except ImportError:
|
|
28
|
+
try:
|
|
29
|
+
from ..depend.prompts import vision_llm_describe_prompt
|
|
30
|
+
except ImportError:
|
|
31
|
+
# 如果都失败,提供默认提示词
|
|
32
|
+
vision_llm_describe_prompt = """请详细描述这张图片的内容,包括:
|
|
33
|
+
1. 图片中的主要对象和场景
|
|
34
|
+
2. 任何可见的文字内容
|
|
35
|
+
3. 图片的布局和结构
|
|
36
|
+
4. 如果有表格,请描述表格的结构和内容
|
|
37
|
+
5. 如果有图表,请描述图表的数据和趋势
|
|
38
|
+
|
|
39
|
+
请用中文回答,描述要准确、详细。"""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Base(ABC):
    """Shared state and helpers for the vision-LLM provider wrappers.

    Subclasses create a provider client and implement
    ``describe_with_prompt``; this class only stores the configuration and
    provides image encoding and default-prompt selection.
    """

    def __init__(self, key, model_name, lang="Chinese"):
        """Store the provider configuration.

        Args:
            key: API key for the provider.
            model_name: Provider-specific model identifier.
            lang: Prompt language. "chinese" (case-insensitive) selects the
                detailed describe prompt; anything else selects the short
                English prompt.
        """
        self.key = key
        self.model_name = model_name
        self.lang = lang

    def image2base64(self, binary):
        """Encode image data as a base64 text string.

        Args:
            binary: A ``BytesIO`` (rewound to the start before reading) or a
                bytes-like object (``bytes``, ``bytearray``, ``memoryview``).

        Returns:
            The base64 encoding of the data, decoded as UTF-8 text.

        Raises:
            ValueError: If ``binary`` is none of the accepted types.
        """
        if isinstance(binary, BytesIO):
            binary.seek(0)
            img_data = binary.read()
        elif isinstance(binary, (bytes, bytearray, memoryview)):
            img_data = bytes(binary)
        else:
            raise ValueError("binary must be BytesIO or bytes")

        return base64.b64encode(img_data).decode('utf-8')

    def prompt(self, image):
        """Return the default prompt text; ``image`` is currently unused."""
        return self.vision_llm_prompt(image)

    def vision_llm_prompt(self, image):
        """Return the describe prompt for the configured language.

        Args:
            image: Unused; kept for interface compatibility.

        Returns:
            The prompt text as a string.
        """
        if (self.lang or "").lower() == "chinese":
            # The module-level import fallback binds this name to a plain
            # string, while the imported object is called like a function
            # (presumably a template loader - confirm in prompts.py).
            # Support both so the fallback path does not raise TypeError.
            template = vision_llm_describe_prompt
            prompt_text = template() if callable(template) else template
        else:
            prompt_text = "Please describe this image in detail."

        return prompt_text
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class GptV4(Base):
    """Vision wrapper around the OpenAI chat-completions API."""

    _FACTORY_NAME = "OpenAI"

    def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", **kwargs):
        """Create the OpenAI client.

        Args:
            key: OpenAI API key.
            model_name: Model identifier sent with every request.
            lang: Prompt language (see ``Base``).
            **kwargs: Optional ``base_url`` overriding the API endpoint, as
                forwarded by ``VisionModelFactory``; other keys are ignored.
        """
        super().__init__(key, model_name, lang)
        # An empty or missing base_url becomes None so the SDK keeps its own
        # default endpoint.
        self.client = OpenAI(api_key=key, base_url=kwargs.get("base_url") or None)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured model.

        Args:
            image: ``BytesIO`` or ``bytes`` image data; sent as a base64
                data URL labelled ``image/jpeg``.
            prompt: Optional prompt text; the language default is used when
                it is falsy.

        Returns:
            The stripped model reply, or a string starting with
            ``"**ERROR**: "`` if anything raised.
        """
        try:
            base64_image = self.image2base64(image)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt or self.prompt(image)
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=1000
            )

            # content can be None; treat it as an empty reply instead of
            # surfacing an AttributeError as the error text.
            return (response.choices[0].message.content or "").strip()
        except Exception as e:
            return "**ERROR**: " + str(e)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class QWenCV(Base):
    """Vision wrapper for Tongyi-Qianwen via its OpenAI-compatible endpoint."""

    _FACTORY_NAME = "Tongyi-Qianwen"

    def __init__(self, key, model_name="qwen-vl-max", lang="Chinese", **kwargs):
        """Create an OpenAI-compatible client pointed at DashScope.

        Args:
            key: DashScope API key.
            model_name: Model identifier sent with every request.
            lang: Prompt language (see ``Base``).
            **kwargs: Optional ``base_url`` overriding the default DashScope
                compatible-mode endpoint; other keys are ignored.
        """
        super().__init__(key, model_name, lang)
        # The service is reached through the OpenAI SDK in compatible mode.
        self.client = OpenAI(
            api_key=key,
            base_url=kwargs.get("base_url") or "https://dashscope.aliyuncs.com/compatible-mode/v1",
        )

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured model.

        Args:
            image: ``BytesIO`` or ``bytes`` image data; sent as a base64
                data URL labelled ``image/jpeg``.
            prompt: Optional prompt text; the language default is used when
                it is falsy.

        Returns:
            The stripped model reply, or a string starting with
            ``"**ERROR**: "`` if anything raised.
        """
        try:
            base64_image = self.image2base64(image)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt or self.prompt(image)
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=1000
            )

            # content can be None; treat it as an empty reply.
            return (response.choices[0].message.content or "").strip()
        except Exception as e:
            return "**ERROR**: " + str(e)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class Zhipu4V(Base):
    """Vision wrapper around the ZhipuAI chat-completions client."""

    _FACTORY_NAME = "ZhipuAI"

    def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
        """Store the configuration and create the ZhipuAI client.

        Extra keyword arguments are accepted but not used.
        """
        super().__init__(key, model_name, lang)
        self.client = ZhipuAI(api_key=key)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured ZhipuAI model.

        Args:
            image: ``BytesIO`` or ``bytes`` image data; sent as a base64
                data URL labelled ``image/jpeg``.
            prompt: Optional prompt text; the language default is used when
                it is falsy.

        Returns:
            The stripped model reply, or a string starting with
            ``"**ERROR**: "`` if anything raised.
        """
        try:
            encoded = self.image2base64(image)
            text_part = {"type": "text", "text": prompt or self.prompt(image)}
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
            }
            reply = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": [text_part, image_part]}],
                max_tokens=1000,
            )
            return reply.choices[0].message.content.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class OllamaCV(Base):
    """Vision wrapper around the optional ``ollama`` client.

    Unlike the sibling providers, the describe/chat methods here return a
    ``(text, token_count)`` tuple rather than a bare string.
    """

    _FACTORY_NAME = "Ollama"

    # Generation settings copied verbatim from ``gen_conf`` into Ollama options.
    _OPTION_KEYS = ("temperature", "top_p", "presence_penalty", "frequency_penalty")

    def __init__(self, key, model_name, lang="Chinese", **kwargs):
        """Create the Ollama client.

        Args:
            key: Stored on the instance; not passed to the Ollama client.
            model_name: Name of the model to run.
            lang: Prompt language (see ``Base``).
            **kwargs: Optional ``base_url`` of the Ollama server; defaults
                to ``http://localhost:11434``.

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        if not OLLAMA_AVAILABLE:
            raise ImportError("Ollama客户端未安装，请运行: pip install ollama")
        # Initialise through Base so ``self.key`` exists like on every other
        # provider.
        super().__init__(key, model_name, lang)
        self.client = Client(host=kwargs.get("base_url") or "http://localhost:11434")

    @classmethod
    def _build_options(cls, gen_conf):
        """Select the supported generation settings from ``gen_conf``."""
        # top_p is forwarded as top_p; it used to be written into top_k,
        # which is a different sampling parameter.
        return {name: gen_conf[name] for name in cls._OPTION_KEYS if name in gen_conf}

    @staticmethod
    def _attach_image(history, image):
        """Attach ``image`` to every user message in ``history`` (in place)."""
        for his in history:
            if his["role"] == "user":
                his["images"] = [image]

    def _generate(self, image, prompt=None):
        """Run one generate call and return ``(text, token_count)``."""
        try:
            response = self.client.generate(
                model=self.model_name,
                # Base.prompt returns plain text, so it is passed straight
                # through instead of being indexed as a message list.
                prompt=prompt or self.prompt(image),
                images=[image],
            )
            ans = response["response"].strip()
            # NOTE(review): 128 is a fixed placeholder token count carried
            # over from the original code, not a measured value.
            return ans, 128
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def describe(self, image):
        """Describe ``image`` with the default prompt.

        Returns:
            ``(text, 128)`` on success, ``("**ERROR**: ...", 0)`` on failure.
        """
        return self._generate(image)

    def describe_with_prompt(self, image, prompt=None):
        """Describe ``image`` with ``prompt``, or the default prompt if falsy.

        Returns:
            ``(text, 128)`` on success, ``("**ERROR**: ...", 0)`` on failure.
        """
        return self._generate(image, prompt)

    def chat(self, system, history, gen_conf, image=""):
        """Run a non-streaming chat with ``image`` attached to user turns.

        Args:
            system: Optional system text merged into the last message.
            history: List of message dicts; modified in place.
            gen_conf: Generation settings; see ``_OPTION_KEYS``.
            image: Image attached to every user message.

        Returns:
            ``(text, token_count)`` on success, ``("**ERROR**: ...", 0)`` on
            failure.
        """
        if system:
            # NOTE(review): the last message's content appears twice in the
            # result; kept exactly as in the original implementation.
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        try:
            self._attach_image(history, image)
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                options=self._build_options(gen_conf),
                keep_alive=-1,
            )

            ans = response["message"]["content"].strip()
            return ans, response["eval_count"] + response.get("prompt_eval_count", 0)
        except Exception as e:
            return "**ERROR**: " + str(e), 0

    def chat_streamly(self, system, history, gen_conf, image=""):
        """Stream a chat reply.

        Yields the accumulated answer text after every chunk. When a chunk
        is marked done, its token count is yielded just before that chunk's
        text. On failure the text so far plus an error marker is yielded.
        The generator always ends by yielding ``0``.
        """
        if system:
            # NOTE(review): same duplicated-content merge as in ``chat``.
            history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]

        self._attach_image(history, image)
        options = self._build_options(gen_conf)
        ans = ""
        try:
            response = self.client.chat(
                model=self.model_name,
                messages=history,
                stream=True,
                options=options,
                keep_alive=-1,
            )
            for resp in response:
                if resp["done"]:
                    yield resp.get("prompt_eval_count", 0) + resp.get("eval_count", 0)
                ans += resp["message"]["content"]
                yield ans
        except Exception as e:
            yield ans + "\n**ERROR**: " + str(e)
        yield 0
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class GeminiCV(Base):
    """Vision wrapper around ``google.generativeai``."""

    _FACTORY_NAME = "Google-Gemini"

    def __init__(self, key, model_name="gemini-pro-vision", lang="Chinese", **kwargs):
        """Configure the SDK with ``key`` and build the generative model.

        The SDK is imported here, at construction time. Extra keyword
        arguments are accepted but not used.
        """
        super().__init__(key, model_name, lang)
        import google.generativeai as genai
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(model_name)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured Gemini model.

        Args:
            image: ``BytesIO`` (rewound before reading) or ``bytes``.
            prompt: Optional prompt text; the language default is used when
                it is falsy.

        Returns:
            The stripped model reply, or a string starting with
            ``"**ERROR**: "`` if anything raised, including an unsupported
            ``image`` type.
        """
        try:
            if isinstance(image, BytesIO):
                image.seek(0)
                raw = image.read()
            elif isinstance(image, bytes):
                raw = image
            else:
                raise ValueError("image must be BytesIO or bytes")

            picture = Image.open(BytesIO(raw))
            request_parts = [prompt or self.prompt(image), picture]
            reply = self.model.generate_content(request_parts)
            return reply.text.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class AnthropicCV(Base):
    """Vision wrapper around the Anthropic messages API."""

    _FACTORY_NAME = "Anthropic"

    def __init__(self, key, model_name="claude-3-sonnet-20240229", lang="Chinese", **kwargs):
        """Store the configuration and create the Anthropic client.

        The SDK is imported here, at construction time. Extra keyword
        arguments are accepted but not used.
        """
        super().__init__(key, model_name, lang)
        import anthropic
        self.client = anthropic.Anthropic(api_key=key)

    def describe_with_prompt(self, image, prompt=None):
        """Describe an image with the configured Claude model.

        Args:
            image: ``BytesIO`` or ``bytes`` image data; sent base64-encoded
                with media type ``image/jpeg``.
            prompt: Optional prompt text; the language default is used when
                it is falsy.

        Returns:
            The stripped text of the first content item of the reply, or a
            string starting with ``"**ERROR**: "`` if anything raised.
        """
        try:
            encoded = self.image2base64(image)
            text_part = {"type": "text", "text": prompt or self.prompt(image)}
            image_part = {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": encoded,
                },
            }
            reply = self.client.messages.create(
                model=self.model_name,
                max_tokens=1000,
                messages=[{"role": "user", "content": [text_part, image_part]}],
            )
            return reply.content[0].text.strip()
        except Exception as err:
            return "**ERROR**: " + str(err)
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
class VisionModelFactory:
    """Factory that builds vision-model wrappers from configuration."""

    # Provider key (lower-case) -> wrapper class.
    _PROVIDERS = {
        "openai": GptV4,
        "qwen": QWenCV,
        "zhipu": Zhipu4V,
        "ollama": OllamaCV,
        "gemini": GeminiCV,
        "anthropic": AnthropicCV,
    }

    # Model used when the configuration names none. "ollama" has no entry,
    # so an empty model name is passed through for it.
    _DEFAULT_MODELS = {
        "openai": "gpt-4-vision-preview",
        "qwen": "qwen-vl-max",
        "zhipu": "glm-4v",
        "gemini": "gemini-pro-vision",
        "anthropic": "claude-3-sonnet-20240229",
    }

    @classmethod
    def create_model(cls, config):
        """Create a vision-model instance from a configuration mapping.

        Args:
            config: Mapping with optional keys ``provider`` (default
                "openai"), ``model_name``, ``api_key``, ``lang`` (default
                "Chinese") and ``base_url``. ``None`` is treated as empty.

        Returns:
            An instance of the wrapper class registered for the provider.

        Raises:
            ValueError: If the provider is not registered.
        """
        config = config or {}
        # `or` also covers keys that are present but null or empty.
        provider = (config.get("provider") or "openai").lower()
        model_name = config.get("model_name", "")
        api_key = config.get("api_key", "")
        lang = config.get("lang", "Chinese")
        base_url = config.get("base_url", "")

        if provider not in cls._PROVIDERS:
            raise ValueError(f"不支持的提供商: {provider}")

        model_class = cls._PROVIDERS[provider]

        # Fall back to the provider's default model name.
        if not model_name:
            model_name = cls._DEFAULT_MODELS.get(provider, model_name)

        kwargs = {"lang": lang}
        if base_url:
            kwargs["base_url"] = base_url

        return model_class(api_key, model_name, **kwargs)

    @classmethod
    def create_from_env(cls):
        """Create a model from the ``DEEPDOC_VISION_*`` environment variables.

        Reads ``DEEPDOC_VISION_PROVIDER``, ``_MODEL``, ``_API_KEY``,
        ``_LANG`` and ``_BASE_URL``.
        """
        config = {
            "provider": os.getenv("DEEPDOC_VISION_PROVIDER", "openai"),
            "model_name": os.getenv("DEEPDOC_VISION_MODEL", ""),
            "api_key": os.getenv("DEEPDOC_VISION_API_KEY", ""),
            "lang": os.getenv("DEEPDOC_VISION_LANG", "Chinese"),
            "base_url": os.getenv("DEEPDOC_VISION_BASE_URL", ""),
        }
        return cls.create_model(config)

    @classmethod
    def create_from_config_file(cls, config_file="deepdoc_config.yaml"):
        """Create a model from the ``vision_model`` section of a YAML file.

        Args:
            config_file: Path of the YAML configuration file.

        Raises:
            ValueError: If reading, parsing or model creation fails; the
                original exception is chained as the cause.
        """
        try:
            import yaml
            with open(config_file, 'r', encoding='utf-8') as f:
                # safe_load returns None for an empty document.
                config = yaml.safe_load(f) or {}
            vision_config = config.get("vision_model", {})
            return cls.create_model(vision_config)
        except Exception as e:
            raise ValueError(f"Failed to load config from {config_file}: {e}") from e
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def create_vision_model(config=None):
    """Create a vision model from one of several configuration forms.

    Args:
        config: One of
            * ``None`` - configuration is read from environment variables;
            * ``str`` - a registered provider name (case-insensitive), or
              otherwise a configuration file path;
            * ``dict`` - configuration passed to
              ``VisionModelFactory.create_model``.

    Returns:
        The created vision-model instance.

    Raises:
        ValueError: If ``config`` has an unsupported type, or is a string
            that is neither a provider name nor a loadable config file.
    """
    if config is None:
        # Form 1: environment variables.
        return VisionModelFactory.create_from_env()

    elif isinstance(config, str):
        # Form 2: string configuration.
        if config.lower() in VisionModelFactory._PROVIDERS:
            # The string names a provider.
            return VisionModelFactory.create_model({"provider": config.lower()})
        elif config.endswith(('.yaml', '.yml', '.json')):
            # The string is a configuration file path.
            return VisionModelFactory.create_from_config_file(config)
        else:
            # Last resort: try it as a configuration file path anyway.
            try:
                return VisionModelFactory.create_from_config_file(config)
            except Exception as e:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit propagate; the cause is kept for debugging.
                raise ValueError(f"无效的配置: {config}。支持的提供商: {list(VisionModelFactory._PROVIDERS.keys())}") from e

    elif isinstance(config, dict):
        # Form 3: dictionary configuration.
        return VisionModelFactory.create_model(config)

    else:
        raise ValueError(f"不支持的配置类型: {type(config)}")
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
Simple Chinese surname detection utility for resume parsing.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
# Surnames recognised by the surname checker: frequent single-character
# surnames plus common two-character compound surnames.
CHINESE_SURNAMES = set(
    # Single-character surnames; iterating the string yields one entry per
    # character.
    # NOTE(review): '漕' looks unusual for a most-frequent list - confirm it
    # is intended.
    "王李张刘陈杨赵黄周吴"
    "徐孙胡朱高林何郭马罗"
    "梁宋郑谢韩唐冯于董萧"
    "程曹袁邓许傅沈曾彭吕"
    "苏卢蒋蔡贾丁魏薛叶阎"
    "余潘杜戴夏钟汪田任姜"
    "范方石姚谭廖邹熊金陆"
    "郝孔白崔康毛邱秦江史"
    "顾侯邵孟龙万段漕钱汤"
    "尹黎易常武乔贺赖龚文"
) | {
    # Two-character compound surnames, each listed once.
    "欧阳", "太史", "端木", "上官", "司马", "东方", "公孙", "万俟", "闻人",
    "夏侯", "诸葛", "尉迟", "公西", "澹台", "赫连", "皇甫", "宗政", "濮阳",
    "公冶", "太叔", "申屠", "慕容", "仲孙", "钟离", "长孙", "司徒",
    "鲜于", "司空", "宇文",
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SurnameChecker:
    """Checks whether text begins with a known Chinese surname."""

    def __init__(self):
        # Uses the module-level surname set directly, without copying it.
        self.surnames = CHINESE_SURNAMES

    def isit(self, text: str) -> bool:
        """Report whether ``text`` starts with a surname from the set.

        Surrounding whitespace is stripped first; then the first character
        and the first two characters are each looked up.

        Args:
            text: Text to check.

        Returns:
            bool: True if the stripped text starts with a known surname;
            False for empty, whitespace-only or non-string input.
        """
        if not (text and isinstance(text, str)):
            return False

        candidate = text.strip()
        if not candidate:
            return False

        # A one-character prefix covers single surnames, a two-character
        # prefix covers compound surnames.
        return any(candidate[:width] in self.surnames for width in (1, 2))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Module-level checker instance, kept for backward compatibility.
surname = SurnameChecker()


def is_chinese_surname(text: str) -> bool:
    """Report whether ``text`` starts with a Chinese surname.

    Delegates to the module-level ``surname`` checker.

    Args:
        text: Text to check.

    Returns:
        bool: True if ``text`` starts with a Chinese surname.
    """
    starts_with_surname = surname.isit(text)
    return starts_with_surname
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import queue
|
|
3
|
+
import trio
|
|
4
|
+
import threading
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import Any, Callable, Coroutine, Optional, Type, Union
|
|
7
|
+
|
|
8
|
+
TimeoutException = Union[Type[BaseException], BaseException]
|
|
9
|
+
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
|
|
10
|
+
|
|
11
|
+
def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
|
|
12
|
+
def decorator(func):
|
|
13
|
+
@wraps(func)
|
|
14
|
+
def wrapper(*args, **kwargs):
|
|
15
|
+
result_queue = queue.Queue(maxsize=1)
|
|
16
|
+
|
|
17
|
+
def target():
|
|
18
|
+
try:
|
|
19
|
+
result = func(*args, **kwargs)
|
|
20
|
+
result_queue.put(result)
|
|
21
|
+
except Exception as e:
|
|
22
|
+
result_queue.put(e)
|
|
23
|
+
|
|
24
|
+
thread = threading.Thread(target=target)
|
|
25
|
+
thread.daemon = True
|
|
26
|
+
thread.start()
|
|
27
|
+
|
|
28
|
+
for a in range(attempts):
|
|
29
|
+
try:
|
|
30
|
+
result = result_queue.get(timeout=seconds)
|
|
31
|
+
if isinstance(result, Exception):
|
|
32
|
+
raise result
|
|
33
|
+
return result
|
|
34
|
+
except queue.Empty:
|
|
35
|
+
pass
|
|
36
|
+
raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")
|
|
37
|
+
|
|
38
|
+
@wraps(func)
|
|
39
|
+
async def async_wrapper(*args, **kwargs) -> Any:
|
|
40
|
+
if seconds is None:
|
|
41
|
+
return await func(*args, **kwargs)
|
|
42
|
+
|
|
43
|
+
for a in range(attempts):
|
|
44
|
+
try:
|
|
45
|
+
with trio.fail_after(seconds):
|
|
46
|
+
return await func(*args, **kwargs)
|
|
47
|
+
except trio.TooSlowError:
|
|
48
|
+
if a < attempts - 1:
|
|
49
|
+
continue
|
|
50
|
+
if on_timeout is not None:
|
|
51
|
+
if callable(on_timeout):
|
|
52
|
+
result = on_timeout()
|
|
53
|
+
if isinstance(result, Coroutine):
|
|
54
|
+
return await result
|
|
55
|
+
return result
|
|
56
|
+
return on_timeout
|
|
57
|
+
|
|
58
|
+
if exception is None:
|
|
59
|
+
raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
|
|
60
|
+
|
|
61
|
+
if isinstance(exception, BaseException):
|
|
62
|
+
raise exception
|
|
63
|
+
|
|
64
|
+
if isinstance(exception, type) and issubclass(exception, BaseException):
|
|
65
|
+
raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
|
|
66
|
+
|
|
67
|
+
raise RuntimeError("Invalid exception type provided")
|
|
68
|
+
|
|
69
|
+
if asyncio.iscoroutinefunction(func):
|
|
70
|
+
return async_wrapper
|
|
71
|
+
return wrapper
|
|
72
|
+
|
|
73
|
+
return decorator
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
def clean_markdown_block(text):
    """Remove a wrapping Markdown code fence from ``text``.

    Strips a leading "```markdown" fence and a trailing "```" fence, where
    present, then trims surrounding whitespace.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # Opening fence first, then closing fence.
    for fence_pattern in (r'^\s*```markdown\s*\n?', r'\n?\s*```\s*$'):
        text = re.sub(fence_pattern, '', text)
    return text.strip()
|
|
8
|
+
|
|
9
|
+
def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    """
    A simple wrapper to process image to markdown texts via VLM.

    Args:
        binary: Image object exposing ``save(fp, format=...)``. Despite the
            name this is not raw bytes (presumably a PIL image - confirm
            against callers).
        vision_model: Object with ``describe_with_prompt(image_bytes, prompt)``
            returning the description text, or a tuple whose first item is
            that text.
        prompt: Optional prompt forwarded to the model.
        callback: Optional ``callback(progress, message)``; called with
            ``-1`` and the error text on failure.

    Returns:
        Simple markdown texts generated by VLM, prefixed with a newline, or
        an empty string on failure.
    """
    callback = callback or (lambda prog, msg: None)

    try:
        img = binary
        # Alpha-channel and palette images cannot be written as JPEG, so
        # convert them first. Objects without a ``mode`` attribute are left
        # untouched.
        if getattr(img, "mode", None) in ("RGBA", "LA", "P", "PA"):
            img = img.convert("RGB")

        img_binary = io.BytesIO()
        img.save(img_binary, format='JPEG')

        described = vision_model.describe_with_prompt(img_binary.getvalue(), prompt)
        # Some providers return (text, token_count); keep only the text.
        if isinstance(described, tuple):
            described = described[0]

        return "\n" + clean_markdown_block(described)

    except Exception as e:
        callback(-1, str(e))

    return ""
|
deepdoc/dict/README.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
---
|
|
2
|
+
license: apache-2.0
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
### Model Loading
|
|
6
|
+
```python
|
|
7
|
+
import xgboost as xgb
|
|
8
|
+
import torch
|
|
9
|
+
|
|
10
|
+
model = xgb.Booster()
|
|
11
|
+
if torch.cuda.is_available():
|
|
12
|
+
model.set_param({"device": "cuda"})
|
|
13
|
+
model.load_model('InfiniFlow/text_concat_xgb_v1.0')
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### Prediction
|
|
17
|
+
```python
|
|
18
|
+
model.predict(xgb.DMatrix([feature]))[0]
|
|
19
|
+
```
|