deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
deepdoc/README.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
English | [简体中文](./README_zh.md)
|
|
2
|
+
|
|
3
|
+
# *Deep*Doc
|
|
4
|
+
|
|
5
|
+
- [1. Introduction](#1)
|
|
6
|
+
- [2. Vision](#2)
|
|
7
|
+
- [3. Parser](#3)
|
|
8
|
+
|
|
9
|
+
<a name="1"></a>
|
|
10
|
+
## 1. Introduction
|
|
11
|
+
|
|
12
|
+
With a bunch of documents from various domains with various formats and along with diverse retrieval requirements,
|
|
13
|
+
an accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
|
|
14
|
+
There are 2 parts in *Deep*Doc so far: vision and parser.
|
|
15
|
+
You can run the following test programs if you're interested in our results of OCR, layout recognition and TSR.
|
|
16
|
+
```bash
|
|
17
|
+
python deepdoc/vision/t_ocr.py -h
|
|
18
|
+
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
|
|
19
|
+
|
|
20
|
+
options:
|
|
21
|
+
-h, --help show this help message and exit
|
|
22
|
+
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
|
|
23
|
+
--output_dir OUTPUT_DIR
|
|
24
|
+
Directory where to store the output images. Default: './ocr_outputs'
|
|
25
|
+
```
|
|
26
|
+
```bash
|
|
27
|
+
python deepdoc/vision/t_recognizer.py -h
|
|
28
|
+
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
|
|
29
|
+
|
|
30
|
+
options:
|
|
31
|
+
-h, --help show this help message and exit
|
|
32
|
+
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
|
|
33
|
+
--output_dir OUTPUT_DIR
|
|
34
|
+
Directory where to store the output images. Default: './layouts_outputs'
|
|
35
|
+
--threshold THRESHOLD
|
|
36
|
+
A threshold to filter out detections. Default: 0.5
|
|
37
|
+
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!!
|
|
41
|
+
```bash
|
|
42
|
+
export HF_ENDPOINT=https://hf-mirror.com
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
<a name="2"></a>
|
|
46
|
+
## 2. Vision
|
|
47
|
+
|
|
48
|
+
We use visual information to solve problems, just as human beings do.
|
|
49
|
+
- OCR. Since a lot of documents are presented as images, or can at least be converted to images,
|
|
50
|
+
OCR is an essential, fundamental and even universal solution for text extraction.
|
|
51
|
+
```bash
|
|
52
|
+
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
|
|
53
|
+
```
|
|
54
|
+
The input can be a directory of images or PDFs, or a single image or PDF.
|
|
55
|
+
You can look into the folder 'path_to_store_result', which contains images that show the positions of the results and
|
|
56
|
+
txt files which contain the OCR text.
|
|
57
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
58
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
|
|
59
|
+
</div>
|
|
60
|
+
|
|
61
|
+
- Layout recognition. Documents from different domains may have various layouts,
|
|
62
|
+
for example, newspapers, magazines, books and résumés are distinct in terms of layout.
|
|
63
|
+
Only when a machine has an accurate layout analysis can it decide whether these text parts are successive or not,
|
|
64
|
+
or this part needs Table Structure Recognition(TSR) to process, or this part is a figure and described with this caption.
|
|
65
|
+
We have 10 basic layout components which cover most cases:
|
|
66
|
+
- Text
|
|
67
|
+
- Title
|
|
68
|
+
- Figure
|
|
69
|
+
- Figure caption
|
|
70
|
+
- Table
|
|
71
|
+
- Table caption
|
|
72
|
+
- Header
|
|
73
|
+
- Footer
|
|
74
|
+
- Reference
|
|
75
|
+
- Equation
|
|
76
|
+
|
|
77
|
+
Try the following command to see the layout detection results.
|
|
78
|
+
```bash
|
|
79
|
+
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
|
|
80
|
+
```
|
|
81
|
+
The input can be a directory of images or PDFs, or a single image or PDF.
|
|
82
|
+
You can look into the folder 'path_to_store_result', which contains images that show the detection results, as follows:
|
|
83
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
84
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
|
|
85
|
+
</div>
|
|
86
|
+
|
|
87
|
+
- Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, including numbers or text.
|
|
88
|
+
The structure of a table might be very complex, with hierarchical headers, spanning cells and projected row headers.
|
|
89
|
+
Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM.
|
|
90
|
+
We have five labels for TSR task:
|
|
91
|
+
- Column
|
|
92
|
+
- Row
|
|
93
|
+
- Column header
|
|
94
|
+
- Projected row header
|
|
95
|
+
- Spanning cell
|
|
96
|
+
|
|
97
|
+
Try the following command to see the table structure recognition results.
|
|
98
|
+
```bash
|
|
99
|
+
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
|
|
100
|
+
```
|
|
101
|
+
The input can be a directory of images or PDFs, or a single image or PDF.
|
|
102
|
+
You can look into the folder 'path_to_store_result', which contains both images and HTML pages that show the detection results, as follows:
|
|
103
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
104
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
|
|
105
|
+
</div>
|
|
106
|
+
|
|
107
|
+
<a name="3"></a>
|
|
108
|
+
## 3. Parser
|
|
109
|
+
|
|
110
|
+
Four kinds of document formats (PDF, DOCX, EXCEL and PPT) each have a corresponding parser.
|
|
111
|
+
The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
|
|
112
|
+
- Text chunks with their own positions in PDF(page number and rectangular positions).
|
|
113
|
+
- Tables with a cropped image from the PDF, and content which has already been translated into natural language sentences.
|
|
114
|
+
- Figures with caption and text in the figures.
|
|
115
|
+
|
|
116
|
+
### Résumé
|
|
117
|
+
|
|
118
|
+
The résumé is a very complicated kind of document. A résumé which is composed of unstructured text
|
|
119
|
+
with various layouts can be resolved into structured data composed of nearly a hundred fields.
|
|
120
|
+
We haven't opened the parser yet, as we open the processing method after parsing procedure.
|
|
121
|
+
|
|
122
|
+
|
deepdoc/README_zh.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
[English](./README.md) | 简体中文
|
|
2
|
+
|
|
3
|
+
# *Deep*Doc
|
|
4
|
+
|
|
5
|
+
- [*Deep*Doc](#deepdoc)
|
|
6
|
+
- [1. 介绍](#1-介绍)
|
|
7
|
+
- [2. 视觉处理](#2-视觉处理)
|
|
8
|
+
- [3. 解析器](#3-解析器)
|
|
9
|
+
- [简历](#简历)
|
|
10
|
+
|
|
11
|
+
<a name="1"></a>
|
|
12
|
+
## 1. 介绍
|
|
13
|
+
|
|
14
|
+
对于来自不同领域、具有不同格式和不同检索要求的大量文档,准确的分析成为一项极具挑战性的任务。*Deep*Doc 就是为了这个目的而诞生的。到目前为止,*Deep*Doc 中有两个组成部分:视觉处理和解析器。如果您对我们的OCR、布局识别和TSR结果感兴趣,您可以运行下面的测试程序。
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
python deepdoc/vision/t_ocr.py -h
|
|
18
|
+
usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
|
|
19
|
+
|
|
20
|
+
options:
|
|
21
|
+
-h, --help show this help message and exit
|
|
22
|
+
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
|
|
23
|
+
--output_dir OUTPUT_DIR
|
|
24
|
+
Directory where to store the output images. Default: './ocr_outputs'
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
python deepdoc/vision/t_recognizer.py -h
|
|
29
|
+
usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
|
|
30
|
+
|
|
31
|
+
options:
|
|
32
|
+
-h, --help show this help message and exit
|
|
33
|
+
--inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
|
|
34
|
+
--output_dir OUTPUT_DIR
|
|
35
|
+
Directory where to store the output images. Default: './layouts_outputs'
|
|
36
|
+
--threshold THRESHOLD
|
|
37
|
+
A threshold to filter out detections. Default: 0.5
|
|
38
|
+
--mode {layout,tsr} Task mode: layout recognition or table structure recognition
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
HuggingFace为我们的模型提供服务。如果你在下载HuggingFace模型时遇到问题,这可能会有所帮助!!
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
export HF_ENDPOINT=https://hf-mirror.com
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
<a name="2"></a>
|
|
48
|
+
## 2. 视觉处理
|
|
49
|
+
|
|
50
|
+
作为人类,我们使用视觉信息来解决问题。
|
|
51
|
+
|
|
52
|
+
- **OCR(Optical Character Recognition,光学字符识别)**。由于许多文档都是以图像形式呈现的,或者至少能够转换为图像,因此OCR是文本提取的一个非常重要、基本,甚至通用的解决方案。
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有演示结果位置的图像,以及包含OCR文本的txt文件。
|
|
59
|
+
|
|
60
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
61
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
|
|
62
|
+
</div>
|
|
63
|
+
|
|
64
|
+
- 布局识别(Layout recognition)。来自不同领域的文件可能有不同的布局,如报纸、杂志、书籍和简历在布局方面是不同的。只有当机器有准确的布局分析时,它才能决定这些文本部分是连续的还是不连续的,或者这个部分需要表结构识别(Table Structure Recognition,TSR)来处理,或者这个部件是一个图形并用这个标题来描述。我们有10个基本布局组件,涵盖了大多数情况:
|
|
65
|
+
- 文本
|
|
66
|
+
- 标题
|
|
67
|
+
- 配图
|
|
68
|
+
- 配图标题
|
|
69
|
+
- 表格
|
|
70
|
+
- 表格标题
|
|
71
|
+
- 页头
|
|
72
|
+
- 页尾
|
|
73
|
+
- 参考引用
|
|
74
|
+
- 公式
|
|
75
|
+
|
|
76
|
+
请尝试以下命令以查看布局检测结果。
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有显示检测结果的图像,如下所示:
|
|
83
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
84
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
|
|
85
|
+
</div>
|
|
86
|
+
|
|
87
|
+
- **TSR(Table Structure Recognition,表结构识别)**。数据表是一种常用的结构,用于表示包括数字或文本在内的数据。表的结构可能非常复杂,比如层次结构标题、跨单元格和投影行标题。除了TSR,我们还将内容重新组合成LLM可以很好理解的句子。TSR任务有五个标签:
|
|
88
|
+
- 列
|
|
89
|
+
- 行
|
|
90
|
+
- 列标题
|
|
91
|
+
- 行标题
|
|
92
|
+
- 合并单元格
|
|
93
|
+
|
|
94
|
+
请尝试以下命令以查看表结构识别结果。
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中包含图像和html页面,这些页面展示了以下检测结果:
|
|
101
|
+
|
|
102
|
+
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
|
103
|
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
|
|
104
|
+
</div>
|
|
105
|
+
|
|
106
|
+
<a name="3"></a>
|
|
107
|
+
## 3. 解析器
|
|
108
|
+
|
|
109
|
+
PDF、DOCX、EXCEL和PPT四种文档格式都有相应的解析器。最复杂的是PDF解析器,因为PDF具有灵活性。PDF解析器的输出包括:
|
|
110
|
+
- 在PDF中有自己位置的文本块(页码和矩形位置)。
|
|
111
|
+
- 带有PDF裁剪图像的表格,以及已经翻译成自然语言句子的内容。
|
|
112
|
+
- 图中带标题和文字的图。
|
|
113
|
+
|
|
114
|
+
### 简历
|
|
115
|
+
|
|
116
|
+
简历是一种非常复杂的文档。由各种格式的非结构化文本构成的简历可以被解析为包含近百个字段的结构化数据。我们还没有启用解析器,因为在解析过程之后才会启动处理方法。
|
deepdoc/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
from beartype.claw import beartype_this_package
|
|
18
|
+
beartype_this_package()
|
|
19
|
+
|
|
20
|
+
from .parser import *
|
|
21
|
+
from .depend.simple_cv_model import *
|
|
22
|
+
from .config import PdfModelConfig, TokenizerConfig, ParserRuntimeConfig
|
|
23
|
+
from .llm_adapter import LLMAdapter, LLMType, vision_llm_chunk
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"PdfParser",
|
|
27
|
+
"PlainParser",
|
|
28
|
+
"DocxParser",
|
|
29
|
+
"DoclingParser",
|
|
30
|
+
"ExcelParser",
|
|
31
|
+
"PptParser",
|
|
32
|
+
"HtmlParser",
|
|
33
|
+
"JsonParser",
|
|
34
|
+
"MarkdownParser",
|
|
35
|
+
"TxtParser",
|
|
36
|
+
"TokenizerConfig",
|
|
37
|
+
"PdfModelConfig",
|
|
38
|
+
"ParserRuntimeConfig",
|
|
39
|
+
# LLM Adapter exports
|
|
40
|
+
"LLMAdapter",
|
|
41
|
+
"LLMType",
|
|
42
|
+
"vision_llm_chunk",
|
|
43
|
+
]
|
deepdoc/_version.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
TYPE_CHECKING = False
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
20
|
+
else:
|
|
21
|
+
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
23
|
+
|
|
24
|
+
version: str
|
|
25
|
+
__version__: str
|
|
26
|
+
__version_tuple__: VERSION_TUPLE
|
|
27
|
+
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
30
|
+
|
|
31
|
+
__version__ = version = '0.2.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 2, 0)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = None
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common utilities for DeepDoc independent library.
|
|
3
|
+
|
|
4
|
+
This module provides common utilities that were previously imported from RAGFlow.
|
|
5
|
+
These are simplified versions suitable for an independent library.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .file_utils import get_project_base_directory, traversal_files
|
|
9
|
+
from .token_utils import num_tokens_from_string, total_token_count_from_response, truncate
|
|
10
|
+
from .misc_utils import pip_install_torch
|
|
11
|
+
from .connection_utils import timeout
|
|
12
|
+
from .config_utils import get_base_config, get_config_value
|
|
13
|
+
from .settings import PARALLEL_DEVICES, check_and_install_torch
|
|
14
|
+
from .model_store import (
|
|
15
|
+
resolve_bundle_dir,
|
|
16
|
+
resolve_tokenizer_dict_prefix,
|
|
17
|
+
resolve_vision_model_dir,
|
|
18
|
+
resolve_xgb_model_dir,
|
|
19
|
+
validate_bundle_dir,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
# file_utils
|
|
24
|
+
"get_project_base_directory",
|
|
25
|
+
"traversal_files",
|
|
26
|
+
|
|
27
|
+
# token_utils
|
|
28
|
+
"num_tokens_from_string",
|
|
29
|
+
"total_token_count_from_response",
|
|
30
|
+
"truncate",
|
|
31
|
+
|
|
32
|
+
# misc_utils
|
|
33
|
+
"pip_install_torch",
|
|
34
|
+
|
|
35
|
+
# connection_utils
|
|
36
|
+
"timeout",
|
|
37
|
+
|
|
38
|
+
# config_utils
|
|
39
|
+
"get_base_config",
|
|
40
|
+
"get_config_value",
|
|
41
|
+
|
|
42
|
+
# settings
|
|
43
|
+
"PARALLEL_DEVICES",
|
|
44
|
+
"check_and_install_torch",
|
|
45
|
+
|
|
46
|
+
# model_store
|
|
47
|
+
"resolve_bundle_dir",
|
|
48
|
+
"resolve_tokenizer_dict_prefix",
|
|
49
|
+
"resolve_vision_model_dir",
|
|
50
|
+
"resolve_xgb_model_dir",
|
|
51
|
+
"validate_bundle_dir",
|
|
52
|
+
]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
Basic configuration utilities for DeepDoc independent library.
|
|
19
|
+
This is a simplified version compared to RAGFlow's complex configuration system.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
from typing import Any, Dict, Optional
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_base_config(service_name: str, default_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Return the base configuration for a service.

    Simplified, environment-variable based lookup. Only Tencent Cloud
    services get a dedicated configuration; every other service receives
    the caller-supplied default.

    Args:
        service_name: Name of the service. The exact name "tcadp", or any
            name containing "tencent" (case-insensitive), selects the
            Tencent Cloud configuration.
        default_config: Dictionary returned unchanged for any other service.
            When omitted, an empty dictionary is returned instead.

    Returns:
        For Tencent Cloud services, a dictionary with the keys "secret_id",
        "secret_key" and "region" read from TENCENT_CLOUD_SECRET_ID,
        TENCENT_CLOUD_SECRET_KEY and TENCENT_CLOUD_REGION (the first two fall
        back to "", the region to "ap-guangzhou"). Otherwise the default
        configuration.
    """
    wants_tencent = service_name == "tcadp" or "tencent" in service_name.lower()
    if not wants_tencent:
        return {} if default_config is None else default_config

    # tcadp_parser needs basic Tencent Cloud credentials from the environment.
    env = os.environ
    return {
        "secret_id": env.get("TENCENT_CLOUD_SECRET_ID", ""),
        "secret_key": env.get("TENCENT_CLOUD_SECRET_KEY", ""),
        "region": env.get("TENCENT_CLOUD_REGION", "ap-guangzhou"),
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_config_value(key: str, default: Any = None) -> Any:
    """
    Look up a configuration value in the process environment.

    Args:
        key: Name of the environment variable to read.
        default: Value returned when the variable is not set.

    Returns:
        The variable's string value when it is set, otherwise ``default``.
    """
    return os.environ.get(key, default)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import queue
|
|
3
|
+
import trio
|
|
4
|
+
import threading
|
|
5
|
+
from functools import wraps
|
|
6
|
+
from typing import Any, Callable, Coroutine, Optional, Type, Union
|
|
7
|
+
|
|
8
|
+
TimeoutException = Union[Type[BaseException], BaseException]
|
|
9
|
+
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
|
|
10
|
+
|
|
11
|
+
def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
|
|
12
|
+
def decorator(func):
|
|
13
|
+
@wraps(func)
|
|
14
|
+
def wrapper(*args, **kwargs):
|
|
15
|
+
result_queue = queue.Queue(maxsize=1)
|
|
16
|
+
|
|
17
|
+
def target():
|
|
18
|
+
try:
|
|
19
|
+
result = func(*args, **kwargs)
|
|
20
|
+
result_queue.put(result)
|
|
21
|
+
except Exception as e:
|
|
22
|
+
result_queue.put(e)
|
|
23
|
+
|
|
24
|
+
thread = threading.Thread(target=target)
|
|
25
|
+
thread.daemon = True
|
|
26
|
+
thread.start()
|
|
27
|
+
|
|
28
|
+
for a in range(attempts):
|
|
29
|
+
try:
|
|
30
|
+
result = result_queue.get(timeout=seconds)
|
|
31
|
+
if isinstance(result, Exception):
|
|
32
|
+
raise result
|
|
33
|
+
return result
|
|
34
|
+
except queue.Empty:
|
|
35
|
+
pass
|
|
36
|
+
raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")
|
|
37
|
+
|
|
38
|
+
@wraps(func)
|
|
39
|
+
async def async_wrapper(*args, **kwargs) -> Any:
|
|
40
|
+
if seconds is None:
|
|
41
|
+
return await func(*args, **kwargs)
|
|
42
|
+
|
|
43
|
+
for a in range(attempts):
|
|
44
|
+
try:
|
|
45
|
+
with trio.fail_after(seconds):
|
|
46
|
+
return await func(*args, **kwargs)
|
|
47
|
+
except trio.TooSlowError:
|
|
48
|
+
if a < attempts - 1:
|
|
49
|
+
continue
|
|
50
|
+
if on_timeout is not None:
|
|
51
|
+
if callable(on_timeout):
|
|
52
|
+
result = on_timeout()
|
|
53
|
+
if isinstance(result, Coroutine):
|
|
54
|
+
return await result
|
|
55
|
+
return result
|
|
56
|
+
return on_timeout
|
|
57
|
+
|
|
58
|
+
if exception is None:
|
|
59
|
+
raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
|
|
60
|
+
|
|
61
|
+
if isinstance(exception, BaseException):
|
|
62
|
+
raise exception
|
|
63
|
+
|
|
64
|
+
if isinstance(exception, type) and issubclass(exception, BaseException):
|
|
65
|
+
raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
|
|
66
|
+
|
|
67
|
+
raise RuntimeError("Invalid exception type provided")
|
|
68
|
+
|
|
69
|
+
if asyncio.iscoroutinefunction(func):
|
|
70
|
+
return async_wrapper
|
|
71
|
+
return wrapper
|
|
72
|
+
|
|
73
|
+
return decorator
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
# Paths are resolved relative to the deepdoc project root instead of relying on
# RAGFlow environment variables.
def get_project_base_directory(*args):
    """
    Return the deepdoc root directory, optionally joined with path parts.

    The root is computed as the parent of the directory that contains this
    module, so it assumes this module's folder sits directly under the
    deepdoc root.

    Args:
        *args: Optional path components appended to the root.

    Returns:
        The absolute root path, or the root joined with ``args`` when any
        are given.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    deepdoc_root = os.path.abspath(os.path.join(module_dir, os.pardir))
    return os.path.join(deepdoc_root, *args) if args else deepdoc_root
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def traversal_files(base):
    """
    Lazily yield the path of every file found beneath ``base``.

    Walks ``base`` with ``os.walk`` and yields each file name joined to the
    directory it was found in; directories themselves are not yielded.
    """
    for dirpath, _dirnames, filenames in os.walk(base):
        yield from (os.path.join(dirpath, filename) for filename in filenames)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def pip_install_torch():
    """
    Report whether PyTorch can be imported.

    Despite its name, this simplified version for the independent library
    does not install anything: it only attempts ``import torch``.

    Returns:
        True when the import succeeds (an info message is logged), False
        when it raises ImportError.
    """
    try:
        import torch  # noqa: F401 - imported only to probe availability
    except ImportError:
        return False
    else:
        logger.info("PyTorch is already installed")
        return True
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def parse_bool(value: str | None, default: bool = False) -> bool:
|
|
38
|
+
if value is None:
|
|
39
|
+
return default
|
|
40
|
+
return value.strip().lower() in {"1", "true", "yes", "on"}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def offline_mode_or_from_env(offline: bool | None = None) -> bool:
    """
    Resolve the offline flag, preferring an explicit argument.

    Args:
        offline: Explicit setting; returned unchanged when not None.

    Returns:
        ``offline`` when given, otherwise the DEEPDOC_OFFLINE environment
        variable interpreted by ``parse_bool`` (False when it is unset).
    """
    if offline is not None:
        return offline
    return parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)
|