deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
deepdoc/README.md ADDED
@@ -0,0 +1,122 @@
1
+ English | [简体中文](./README_zh.md)
2
+
3
+ # *Deep*Doc
4
+
5
+ - [1. Introduction](#1)
6
+ - [2. Vision](#2)
7
+ - [3. Parser](#3)
8
+
9
+ <a name="1"></a>
10
+ ## 1. Introduction
11
+
12
+ With a bunch of documents from various domains with various formats and along with diverse retrieval requirements,
13
+ an accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
14
+ There are 2 parts in *Deep*Doc so far: vision and parser.
15
+ You can run the following test programs if you're interested in our results of OCR, layout recognition and TSR.
16
+ ```bash
17
+ python deepdoc/vision/t_ocr.py -h
18
+ usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
19
+
20
+ options:
21
+ -h, --help show this help message and exit
22
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
23
+ --output_dir OUTPUT_DIR
24
+ Directory where to store the output images. Default: './ocr_outputs'
25
+ ```
26
+ ```bash
27
+ python deepdoc/vision/t_recognizer.py -h
28
+ usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
29
+
30
+ options:
31
+ -h, --help show this help message and exit
32
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
33
+ --output_dir OUTPUT_DIR
34
+ Directory where to store the output images. Default: './layouts_outputs'
35
+ --threshold THRESHOLD
36
+ A threshold to filter out detections. Default: 0.5
37
+ --mode {layout,tsr} Task mode: layout recognition or table structure recognition
38
+ ```
39
+
40
+ Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!!
41
+ ```bash
42
+ export HF_ENDPOINT=https://hf-mirror.com
43
+ ```
44
+
45
+ <a name="2"></a>
46
+ ## 2. Vision
47
+
48
+ We use visual information to solve problems, just as human beings do.
49
+ - OCR. Since a lot of documents are presented as images, or can at least be converted to images,
50
+ OCR is a very essential and fundamental or even universal solution for text extraction.
51
+ ```bash
52
+ python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
53
+ ```
54
+ The input can be a directory of images or PDFs, or a single image or PDF file.
55
+ You can look into the folder 'path_to_store_result', which contains images showing the positions of the results, and
56
+ txt files which contain the OCR text.
57
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
58
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
59
+ </div>
60
+
61
+ - Layout recognition. Documents from different domains may have various layouts,
62
+ like, newspaper, magazine, book and résumé are distinct in terms of layout.
63
+ Only with an accurate layout analysis can a machine decide whether these text parts are successive or not,
64
+ or this part needs Table Structure Recognition(TSR) to process, or this part is a figure and described with this caption.
65
+ We have 10 basic layout components which cover most cases:
66
+ - Text
67
+ - Title
68
+ - Figure
69
+ - Figure caption
70
+ - Table
71
+ - Table caption
72
+ - Header
73
+ - Footer
74
+ - Reference
75
+ - Equation
76
+
77
+ Have a try on the following command to see the layout detection results.
78
+ ```bash
79
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
80
+ ```
81
+ The input can be a directory of images or PDFs, or a single image or PDF file.
82
+ You can look into the folder 'path_to_store_result', which contains images showing the detection results, as follows:
83
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
84
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
85
+ </div>
86
+
87
+ - Table Structure Recognition(TSR). Data table is a frequently used structure to present data including numbers or text.
88
+ And the structure of a table might be very complex, like hierarchy headers, spanning cells and projected row headers.
89
+ Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM.
90
+ We have five labels for TSR task:
91
+ - Column
92
+ - Row
93
+ - Column header
94
+ - Projected row header
95
+ - Spanning cell
96
+
97
+ Try the following command to see the table structure recognition results.
98
+ ```bash
99
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
100
+ ```
101
+ The input can be a directory of images or PDFs, or a single image or PDF file.
102
+ You can look into the folder 'path_to_store_result', which contains both images and HTML pages showing the detection results, as follows:
103
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
104
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
105
+ </div>
106
+
107
+ <a name="3"></a>
108
+ ## 3. Parser
109
+
110
+ Four document formats (PDF, DOCX, EXCEL and PPT) have their corresponding parsers.
111
+ The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
112
+ - Text chunks with their own positions in PDF(page number and rectangular positions).
113
+ - Tables with cropped images from the PDF, and contents which have already been translated into natural language sentences.
114
+ - Figures with caption and text in the figures.
115
+
116
+ ### Résumé
117
+
118
+ The résumé is a very complicated kind of document. A résumé which is composed of unstructured text
119
+ with various layouts could be resolved into structured data composed of nearly a hundred fields.
120
+ We haven't open-sourced the parser yet; for now, only the processing method applied after the parsing procedure is open.
121
+
122
+
deepdoc/README_zh.md ADDED
@@ -0,0 +1,116 @@
1
+ [English](./README.md) | 简体中文
2
+
3
+ # *Deep*Doc
4
+
5
+ - [*Deep*Doc](#deepdoc)
6
+ - [1. 介绍](#1-介绍)
7
+ - [2. 视觉处理](#2-视觉处理)
8
+ - [3. 解析器](#3-解析器)
9
+ - [简历](#简历)
10
+
11
+ <a name="1"></a>
12
+ ## 1. 介绍
13
+
14
+ 对于来自不同领域、具有不同格式和不同检索要求的大量文档,准确的分析成为一项极具挑战性的任务。*Deep*Doc 就是为了这个目的而诞生的。到目前为止,*Deep*Doc 中有两个组成部分:视觉处理和解析器。如果您对我们的OCR、布局识别和TSR结果感兴趣,您可以运行下面的测试程序。
15
+
16
+ ```bash
17
+ python deepdoc/vision/t_ocr.py -h
18
+ usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
19
+
20
+ options:
21
+ -h, --help show this help message and exit
22
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
23
+ --output_dir OUTPUT_DIR
24
+ Directory where to store the output images. Default: './ocr_outputs'
25
+ ```
26
+
27
+ ```bash
28
+ python deepdoc/vision/t_recognizer.py -h
29
+ usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
30
+
31
+ options:
32
+ -h, --help show this help message and exit
33
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
34
+ --output_dir OUTPUT_DIR
35
+ Directory where to store the output images. Default: './layouts_outputs'
36
+ --threshold THRESHOLD
37
+ A threshold to filter out detections. Default: 0.5
38
+ --mode {layout,tsr} Task mode: layout recognition or table structure recognition
39
+ ```
40
+
41
+ HuggingFace为我们的模型提供服务。如果你在下载HuggingFace模型时遇到问题,这可能会有所帮助!!
42
+
43
+ ```bash
44
+ export HF_ENDPOINT=https://hf-mirror.com
45
+ ```
46
+
47
+ <a name="2"></a>
48
+ ## 2. 视觉处理
49
+
50
+ 作为人类,我们使用视觉信息来解决问题。
51
+
52
+ - **OCR(Optical Character Recognition,光学字符识别)**。由于许多文档都是以图像形式呈现的,或者至少能够转换为图像,因此OCR是文本提取的一个非常重要、基本,甚至通用的解决方案。
53
+
54
+ ```bash
55
+ python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
56
+ ```
57
+
58
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有演示结果位置的图像,以及包含OCR文本的txt文件。
59
+
60
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
61
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
62
+ </div>
63
+
64
+ - 布局识别(Layout recognition)。来自不同领域的文件可能有不同的布局,如报纸、杂志、书籍和简历在布局方面是不同的。只有当机器有准确的布局分析时,它才能决定这些文本部分是连续的还是不连续的,或者这个部分需要表结构识别(Table Structure Recognition,TSR)来处理,或者这个部件是一个图形并用这个标题来描述。我们有10个基本布局组件,涵盖了大多数情况:
65
+ - 文本
66
+ - 标题
67
+ - 配图
68
+ - 配图标题
69
+ - 表格
70
+ - 表格标题
71
+ - 页头
72
+ - 页尾
73
+ - 参考引用
74
+ - 公式
75
+
76
+ 请尝试以下命令以查看布局检测结果。
77
+
78
+ ```bash
79
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
80
+ ```
81
+
82
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有显示检测结果的图像,如下所示:
83
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
84
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
85
+ </div>
86
+
87
+ - **TSR(Table Structure Recognition,表结构识别)**。数据表是一种常用的结构,用于表示包括数字或文本在内的数据。表的结构可能非常复杂,比如层次结构标题、跨单元格和投影行标题。除了TSR,我们还将内容重新组合成LLM可以很好理解的句子。TSR任务有五个标签:
88
+ - 列
89
+ - 行
90
+ - 列标题
91
+ - 行标题
92
+ - 合并单元格
93
+
94
+ 请尝试以下命令以查看表结构识别结果。
95
+
96
+ ```bash
97
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
98
+ ```
99
+
100
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中包含图像和html页面,这些页面展示了以下检测结果:
101
+
102
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
103
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
104
+ </div>
105
+
106
+ <a name="3"></a>
107
+ ## 3. 解析器
108
+
109
+ PDF、DOCX、EXCEL和PPT四种文档格式都有相应的解析器。最复杂的是PDF解析器,因为PDF具有灵活性。PDF解析器的输出包括:
110
+ - 在PDF中有自己位置的文本块(页码和矩形位置)。
111
+ - 带有PDF裁剪图像的表格,以及已经翻译成自然语言句子的内容。
112
+ - 图中带标题和文字的图。
113
+
114
+ ### 简历
115
+
116
+ 简历是一种非常复杂的文档。由各种格式的非结构化文本构成的简历可以被解析为包含近百个字段的结构化数据。我们还没有启用解析器,因为在解析过程之后才会启动处理方法。
deepdoc/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from beartype.claw import beartype_this_package
18
+ beartype_this_package()
19
+
20
+ from .parser import *
21
+ from .depend.simple_cv_model import *
22
+ from .config import PdfModelConfig, TokenizerConfig, ParserRuntimeConfig
23
+ from .llm_adapter import LLMAdapter, LLMType, vision_llm_chunk
24
+
25
+ __all__ = [
26
+ "PdfParser",
27
+ "PlainParser",
28
+ "DocxParser",
29
+ "DoclingParser",
30
+ "ExcelParser",
31
+ "PptParser",
32
+ "HtmlParser",
33
+ "JsonParser",
34
+ "MarkdownParser",
35
+ "TxtParser",
36
+ "TokenizerConfig",
37
+ "PdfModelConfig",
38
+ "ParserRuntimeConfig",
39
+ # LLM Adapter exports
40
+ "LLMAdapter",
41
+ "LLMType",
42
+ "vision_llm_chunk",
43
+ ]
deepdoc/_version.py ADDED
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.2.0'
32
+ __version_tuple__ = version_tuple = (0, 2, 0)
33
+
34
+ __commit_id__ = commit_id = None
@@ -0,0 +1,52 @@
1
+ """
2
+ Common utilities for DeepDoc independent library.
3
+
4
+ This module provides common utilities that were previously imported from RAGFlow.
5
+ These are simplified versions suitable for an independent library.
6
+ """
7
+
8
+ from .file_utils import get_project_base_directory, traversal_files
9
+ from .token_utils import num_tokens_from_string, total_token_count_from_response, truncate
10
+ from .misc_utils import pip_install_torch
11
+ from .connection_utils import timeout
12
+ from .config_utils import get_base_config, get_config_value
13
+ from .settings import PARALLEL_DEVICES, check_and_install_torch
14
+ from .model_store import (
15
+ resolve_bundle_dir,
16
+ resolve_tokenizer_dict_prefix,
17
+ resolve_vision_model_dir,
18
+ resolve_xgb_model_dir,
19
+ validate_bundle_dir,
20
+ )
21
+
22
+ __all__ = [
23
+ # file_utils
24
+ "get_project_base_directory",
25
+ "traversal_files",
26
+
27
+ # token_utils
28
+ "num_tokens_from_string",
29
+ "total_token_count_from_response",
30
+ "truncate",
31
+
32
+ # misc_utils
33
+ "pip_install_torch",
34
+
35
+ # connection_utils
36
+ "timeout",
37
+
38
+ # config_utils
39
+ "get_base_config",
40
+ "get_config_value",
41
+
42
+ # settings
43
+ "PARALLEL_DEVICES",
44
+ "check_and_install_torch",
45
+
46
+ # model_store
47
+ "resolve_bundle_dir",
48
+ "resolve_tokenizer_dict_prefix",
49
+ "resolve_vision_model_dir",
50
+ "resolve_xgb_model_dir",
51
+ "validate_bundle_dir",
52
+ ]
@@ -0,0 +1,63 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """
18
+ Basic configuration utilities for DeepDoc independent library.
19
+ This is a simplified version compared to RAGFlow's complex configuration system.
20
+ """
21
+
22
+ import os
23
+ from typing import Any, Dict, Optional
24
+
25
+
26
def get_base_config(service_name: str, default_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Look up the base configuration for a named service.

    Only Tencent Cloud services are resolved from environment variables;
    every other service simply gets ``default_config`` back.

    Args:
        service_name: Name of the service. ``"tcadp"``, or any name that
            contains ``"tencent"`` (case-insensitive), selects the Tencent
            Cloud settings.
        default_config: Mapping returned as-is for all other services.
            ``None`` is treated as an empty dict.

    Returns:
        For Tencent Cloud services, a new dict with the keys ``secret_id``,
        ``secret_key`` and ``region`` read from ``TENCENT_CLOUD_SECRET_ID``,
        ``TENCENT_CLOUD_SECRET_KEY`` and ``TENCENT_CLOUD_REGION`` (falling
        back to ``""``, ``""`` and ``"ap-guangzhou"``). ``default_config`` is
        ignored in that case. Otherwise ``default_config`` itself.
    """
    wants_tencent = service_name == "tcadp" or "tencent" in service_name.lower()
    if not wants_tencent:
        return {} if default_config is None else default_config

    # For tcadp_parser, provide basic Tencent Cloud configuration
    env = os.environ
    return {
        "secret_id": env.get("TENCENT_CLOUD_SECRET_ID", ""),
        "secret_key": env.get("TENCENT_CLOUD_SECRET_KEY", ""),
        "region": env.get("TENCENT_CLOUD_REGION", "ap-guangzhou"),
    }
50
+
51
+
52
def get_config_value(key: str, default: Any = None) -> Any:
    """
    Read a single configuration value from the process environment.

    Args:
        key: Name of the environment variable to look up.
        default: Value handed back when the variable is not set.

    Returns:
        The variable's string value, or ``default`` if it is absent.
    """
    environment = os.environ
    return environment.get(key, default)
@@ -0,0 +1,73 @@
1
+ import asyncio
2
+ import queue
3
+ import trio
4
+ import threading
5
+ from functools import wraps
6
+ from typing import Any, Callable, Coroutine, Optional, Type, Union
7
+
8
+ TimeoutException = Union[Type[BaseException], BaseException]
9
+ OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
10
+
11
+ def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
12
+ def decorator(func):
13
+ @wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ result_queue = queue.Queue(maxsize=1)
16
+
17
+ def target():
18
+ try:
19
+ result = func(*args, **kwargs)
20
+ result_queue.put(result)
21
+ except Exception as e:
22
+ result_queue.put(e)
23
+
24
+ thread = threading.Thread(target=target)
25
+ thread.daemon = True
26
+ thread.start()
27
+
28
+ for a in range(attempts):
29
+ try:
30
+ result = result_queue.get(timeout=seconds)
31
+ if isinstance(result, Exception):
32
+ raise result
33
+ return result
34
+ except queue.Empty:
35
+ pass
36
+ raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")
37
+
38
+ @wraps(func)
39
+ async def async_wrapper(*args, **kwargs) -> Any:
40
+ if seconds is None:
41
+ return await func(*args, **kwargs)
42
+
43
+ for a in range(attempts):
44
+ try:
45
+ with trio.fail_after(seconds):
46
+ return await func(*args, **kwargs)
47
+ except trio.TooSlowError:
48
+ if a < attempts - 1:
49
+ continue
50
+ if on_timeout is not None:
51
+ if callable(on_timeout):
52
+ result = on_timeout()
53
+ if isinstance(result, Coroutine):
54
+ return await result
55
+ return result
56
+ return on_timeout
57
+
58
+ if exception is None:
59
+ raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
60
+
61
+ if isinstance(exception, BaseException):
62
+ raise exception
63
+
64
+ if isinstance(exception, type) and issubclass(exception, BaseException):
65
+ raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
66
+
67
+ raise RuntimeError("Invalid exception type provided")
68
+
69
+ if asyncio.iscoroutinefunction(func):
70
+ return async_wrapper
71
+ return wrapper
72
+
73
+ return decorator
@@ -0,0 +1,19 @@
1
+ import os
2
+
3
# No dependency on ragflow environment variables: the base directory is
# derived from the location of this file instead.
def get_project_base_directory(*args):
    """
    Return the deepdoc root directory, optionally joined with sub-path parts.

    The root is the parent of the directory that contains this module, with
    symlinks in this module's path resolved first.

    Args:
        *args: Path components appended to the root with ``os.path.join``.

    Returns:
        The absolute root path when called without arguments, otherwise the
        root joined with ``args``.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    package_root = os.path.abspath(os.path.join(module_dir, os.pardir))
    return os.path.join(package_root, *args) if args else package_root
10
+
11
+
12
def traversal_files(base):
    """
    Lazily yield the path of every file found under ``base``, recursively.

    Paths are built by joining each directory reported by ``os.walk`` with
    the file names it contains, so they are relative or absolute exactly as
    ``base`` is. Directories themselves are not yielded.
    """
    for dirpath, _subdirs, filenames in os.walk(base):
        yield from (os.path.join(dirpath, name) for name in filenames)
17
+
18
+
19
+
@@ -0,0 +1,44 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import os
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def pip_install_torch():
    """
    Report whether PyTorch can be imported.

    Despite its name, this simplified version for the independent library
    does not install anything; it only attempts ``import torch``.

    Returns:
        True if the import succeeds (an info message is logged), False if
        it raises ImportError.
    """
    try:
        import torch  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        logger.info("PyTorch is already installed")
        return True
35
+
36
+
37
+ def parse_bool(value: str | None, default: bool = False) -> bool:
38
+ if value is None:
39
+ return default
40
+ return value.strip().lower() in {"1", "true", "yes", "on"}
41
+
42
+
43
+ def offline_mode_or_from_env(offline: bool | None = None) -> bool:
44
+ return offline if offline is not None else parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)